This patch series is a revision of http://lkml.org/lkml/2010/6/25/11 .
This patch series implements a write function for the 'cgroup.procs'
per-cgroup file, which enables atomic movement of multithreaded
applications between cgroups. Writing the thread-ID of any thread in a
threadgroup to a cgroup's procs file causes all threads in the group to
be moved to that cgroup safely with respect to threads forking/exiting.
(Possible usage scenario: If running a multithreaded build system that
sucks up system resources, this lets you restrict it all at once into a
new cgroup to keep it under control.)
Example: Suppose pid 31337 clones new threads 31338 and 31339.
# cat /dev/cgroup/tasks
...
31337
31338
31339
# mkdir /dev/cgroup/foo
# echo 31337 > /dev/cgroup/foo/cgroup.procs
# cat /dev/cgroup/foo/tasks
31337
31338
31339
A new lock, called threadgroup_fork_lock and living in signal_struct, is
introduced to ensure atomicity when moving threads between cgroups. It's
taken for writing during the operation, and taken for reading in fork()
around the calls to cgroup_fork() and cgroup_post_fork(). I put calls to
down_read/up_read directly in copy_process(), since new inline functions
seemed like overkill.
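A rough sketch of how the two sides of the rwsem pair up (a simplification
of the hunks below, not the literal patch; error paths and the
CONFIG_CGROUPS ifdefs are omitted):

    /* fork path, in copy_process(), CLONE_THREAD forks only: */
    if (clone_flags & CLONE_THREAD)
            down_read(&current->signal->threadgroup_fork_lock);
    cgroup_fork(p);
    /* ... the rest of copy_process() ... */
    cgroup_post_fork(p);
    if (clone_flags & CLONE_THREAD)
            up_read(&current->signal->threadgroup_fork_lock);

    /* cgroup.procs writer, in cgroup_attach_proc(), moving a whole group: */
    down_write(&leader->signal->threadgroup_fork_lock);
    /* ... migrate the leader and every thread on leader->thread_group ... */
    up_write(&leader->signal->threadgroup_fork_lock);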
-- Ben
---
Documentation/cgroups/cgroups.txt | 13 -
include/linux/init_task.h | 9
include/linux/sched.h | 10
kernel/cgroup.c | 426 +++++++++++++++++++++++++++++++++-----
kernel/cgroup_freezer.c | 4
kernel/cpuset.c | 4
kernel/fork.c | 16 +
kernel/ns_cgroup.c | 4
kernel/sched.c | 4
9 files changed, 440 insertions(+), 50 deletions(-)
Adds functionality to read/write lock CLONE_THREAD fork()ing per-threadgroup
From: Ben Blum <[email protected]>
This patch adds an rwsem that lives in a threadgroup's signal_struct that's
taken for reading in the fork path, under CONFIG_CGROUPS. If another part of
the kernel later wants to use such a locking mechanism, the CONFIG_CGROUPS
ifdefs should be changed to a higher-up flag that CGROUPS and the other system
would both depend on.
This is a pre-patch for cgroups-procs-write.patch.
Signed-off-by: Ben Blum <[email protected]>
---
include/linux/init_task.h | 9 +++++++++
include/linux/sched.h | 10 ++++++++++
kernel/fork.c | 16 ++++++++++++++++
3 files changed, 35 insertions(+), 0 deletions(-)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1f43fa5..ca46711 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -15,6 +15,14 @@
extern struct files_struct init_files;
extern struct fs_struct init_fs;
+#ifdef CONFIG_CGROUPS
+#define INIT_THREADGROUP_FORK_LOCK(sig) \
+ .threadgroup_fork_lock = \
+ __RWSEM_INITIALIZER(sig.threadgroup_fork_lock),
+#else
+#define INIT_THREADGROUP_FORK_LOCK(sig)
+#endif
+
#define INIT_SIGNALS(sig) { \
.nr_threads = 1, \
.wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\
@@ -29,6 +37,7 @@ extern struct fs_struct init_fs;
.running = 0, \
.lock = __SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \
}, \
+ INIT_THREADGROUP_FORK_LOCK(sig) \
}
extern struct nsproxy init_nsproxy;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ae69716..82b0bcf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -619,6 +619,16 @@ struct signal_struct {
unsigned audit_tty;
struct tty_audit_buf *tty_audit_buf;
#endif
+#ifdef CONFIG_CGROUPS
+ /*
+ * The threadgroup_fork_lock prevents threads from forking with
+ * CLONE_THREAD while held for writing. Use this for fork-sensitive
+ * threadgroup-wide operations. It's taken for reading in fork.c in
+ * copy_process().
+ * Currently only needed write-side by cgroups.
+ */
+ struct rw_semaphore threadgroup_fork_lock;
+#endif
int oom_adj; /* OOM kill score adjustment (bit shift) */
};
diff --git a/kernel/fork.c b/kernel/fork.c
index a82a65c..a9bce89 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -898,6 +898,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
tty_audit_fork(sig);
+#ifdef CONFIG_CGROUPS
+ init_rwsem(&sig->threadgroup_fork_lock);
+#endif
+
sig->oom_adj = current->signal->oom_adj;
return 0;
@@ -1076,6 +1080,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
monotonic_to_bootbased(&p->real_start_time);
p->io_context = NULL;
p->audit_context = NULL;
+#ifdef CONFIG_CGROUPS
+ if (clone_flags & CLONE_THREAD)
+ down_read(&current->signal->threadgroup_fork_lock);
+#endif
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
@@ -1283,6 +1291,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
cgroup_post_fork(p);
+#ifdef CONFIG_CGROUPS
+ if (clone_flags & CLONE_THREAD)
+ up_read(&current->signal->threadgroup_fork_lock);
+#endif
perf_event_fork(p);
return p;
@@ -1316,6 +1328,10 @@ bad_fork_cleanup_policy:
mpol_put(p->mempolicy);
bad_fork_cleanup_cgroup:
#endif
+#ifdef CONFIG_CGROUPS
+ if (clone_flags & CLONE_THREAD)
+ up_read(&current->signal->threadgroup_fork_lock);
+#endif
cgroup_exit(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
module_put(task_thread_info(p)->exec_domain->module);
Makes procs file writable to move all threads by tgid at once
From: Ben Blum <[email protected]>
This patch adds functionality that enables users to move all threads in a
threadgroup at once to a cgroup by writing the tgid to the 'cgroup.procs'
file. This current implementation makes use of a per-threadgroup rwsem that's
taken for reading in the fork() path to prevent newly forking threads within
the threadgroup from "escaping" while the move is in progress.
Signed-off-by: Ben Blum <[email protected]>
---
Documentation/cgroups/cgroups.txt | 13 +
kernel/cgroup.c | 426 +++++++++++++++++++++++++++++++++----
kernel/cgroup_freezer.c | 4
kernel/cpuset.c | 4
kernel/ns_cgroup.c | 4
kernel/sched.c | 4
6 files changed, 405 insertions(+), 50 deletions(-)
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index b34823f..5f3c707 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -235,7 +235,8 @@ containing the following files describing that cgroup:
- cgroup.procs: list of tgids in the cgroup. This list is not
guaranteed to be sorted or free of duplicate tgids, and userspace
should sort/uniquify the list if this property is required.
- This is a read-only file, for now.
+ Writing a thread group id into this file moves all threads in that
+ group into this cgroup.
- notify_on_release flag: run the release agent on exit?
- release_agent: the path to use for release notifications (this file
exists in the top cgroup only)
@@ -416,6 +417,12 @@ You can attach the current shell task by echoing 0:
# echo 0 > tasks
+You can use the cgroup.procs file instead of the tasks file to move all
+threads in a threadgroup at once. Echoing the pid of any task in a
+threadgroup to cgroup.procs causes all tasks in that threadgroup to be
+attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
+in the writing task's threadgroup.
+
2.3 Mounting hierarchies by name
--------------------------------
@@ -564,7 +571,9 @@ called on a fork. If this method returns 0 (success) then this should
remain valid while the caller holds cgroup_mutex and it is ensured that either
attach() or cancel_attach() will be called in future. If threadgroup is
true, then a successful result indicates that all threads in the given
-thread's threadgroup can be moved together.
+thread's threadgroup can be moved together. If the subsystem wants to
+iterate over task->thread_group, it must take rcu_read_lock then check
+if thread_group_leader(task), returning -EAGAIN if that fails.
void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct task_struct *task, bool threadgroup)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f91d7dd..fab8c87 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1688,6 +1688,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
}
EXPORT_SYMBOL_GPL(cgroup_path);
+/*
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ *
+ * 'guarantee' is set if the caller promises that a new css_set for the task
+ * will already exit. If not set, this function might sleep, and can fail with
+ * -ENOMEM. Otherwise, it can only fail with -ESRCH.
+ */
+static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+ struct task_struct *tsk, bool guarantee)
+{
+ struct css_set *oldcg;
+ struct css_set *newcg;
+
+ /*
+ * get old css_set. we need to take task_lock and refcount it, because
+ * an exiting task can change its css_set to init_css_set and drop its
+ * old one without taking cgroup_mutex.
+ */
+ task_lock(tsk);
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
+ task_unlock(tsk);
+
+ /* locate or allocate a new css_set for this task. */
+ if (guarantee) {
+ /* we know the css_set we want already exists. */
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(oldcg, cgrp, template);
+ BUG_ON(!newcg);
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+ } else {
+ might_sleep();
+ /* find_css_set will give us newcg already referenced. */
+ newcg = find_css_set(oldcg, cgrp);
+ if (!newcg) {
+ put_css_set(oldcg);
+ return -ENOMEM;
+ }
+ }
+ put_css_set(oldcg);
+
+ /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ task_unlock(tsk);
+ put_css_set(newcg);
+ return -ESRCH;
+ }
+ rcu_assign_pointer(tsk->cgroups, newcg);
+ task_unlock(tsk);
+
+ /* Update the css_set linked lists if we're using them */
+ write_lock(&css_set_lock);
+ if (!list_empty(&tsk->cg_list))
+ list_move(&tsk->cg_list, &newcg->tasks);
+ write_unlock(&css_set_lock);
+
+ /*
+ * We just gained a reference on oldcg by taking it from the task. As
+ * trading it for newcg is protected by cgroup_mutex, we're safe to drop
+ * it here; it will be freed under RCU.
+ */
+ put_css_set(oldcg);
+
+ set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+ return 0;
+}
+
/**
* cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
* @cgrp: the cgroup the task is attaching to
@@ -1698,11 +1768,9 @@ EXPORT_SYMBOL_GPL(cgroup_path);
*/
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
- int retval = 0;
+ int retval;
struct cgroup_subsys *ss, *failed_ss = NULL;
struct cgroup *oldcgrp;
- struct css_set *cg;
- struct css_set *newcg;
struct cgroupfs_root *root = cgrp->root;
/* Nothing to do if the task is already in that cgroup */
@@ -1726,46 +1794,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
}
}
- task_lock(tsk);
- cg = tsk->cgroups;
- get_css_set(cg);
- task_unlock(tsk);
- /*
- * Locate or allocate a new css_set for this task,
- * based on its final set of cgroups
- */
- newcg = find_css_set(cg, cgrp);
- put_css_set(cg);
- if (!newcg) {
- retval = -ENOMEM;
- goto out;
- }
-
- task_lock(tsk);
- if (tsk->flags & PF_EXITING) {
- task_unlock(tsk);
- put_css_set(newcg);
- retval = -ESRCH;
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
+ if (retval)
goto out;
- }
- rcu_assign_pointer(tsk->cgroups, newcg);
- task_unlock(tsk);
-
- /* Update the css_set linked lists if we're using them */
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list)) {
- list_del(&tsk->cg_list);
- list_add(&tsk->cg_list, &newcg->tasks);
- }
- write_unlock(&css_set_lock);
for_each_subsys(root, ss) {
if (ss->attach)
ss->attach(ss, cgrp, oldcgrp, tsk, false);
}
- set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+
synchronize_rcu();
- put_css_set(cg);
/*
* wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -1791,49 +1829,341 @@ out:
}
/*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * held. May take task_lock of task
+ * cgroup_attach_proc works in two stages, the first of which prefetches all
+ * new css_sets needed (to make sure we have enough memory before committing
+ * to the move) and stores them in a list of entries of the following type.
+ * TODO: possible optimization: use css_set->rcu_head for chaining instead
+ */
+struct cg_list_entry {
+ struct css_set *cg;
+ struct list_head links;
+};
+
+static bool css_set_check_fetched(struct cgroup *cgrp,
+ struct task_struct *tsk, struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(cg, cgrp, template);
+ if (newcg)
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+
+ /* doesn't exist at all? */
+ if (!newcg)
+ return false;
+ /* see if it's already in the list */
+ list_for_each_entry(cg_entry, newcg_list, links) {
+ if (cg_entry->cg == newcg) {
+ put_css_set(newcg);
+ return true;
+ }
+ }
+
+ /* not found */
+ put_css_set(newcg);
+ return false;
+}
+
+/*
+ * Find the new css_set and store it in the list in preparation for moving the
+ * given task to the given cgroup. Returns 0 or -ENOMEM.
*/
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+
+ /* ensure a new css_set will exist for this thread */
+ newcg = find_css_set(cg, cgrp);
+ if (!newcg)
+ return -ENOMEM;
+ /* add it to the list */
+ cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
+ if (!cg_entry) {
+ put_css_set(newcg);
+ return -ENOMEM;
+ }
+ cg_entry->cg = newcg;
+ list_add(&cg_entry->links, newcg_list);
+ return 0;
+}
+
+/**
+ * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * @cgrp: the cgroup to attach to
+ * @leader: the threadgroup leader task_struct of the group to be attached
+ *
+ * Call holding cgroup_mutex. Will take task_lock of each thread in leader's
+ * threadgroup individually in turn.
+ */
+int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+{
+ int retval;
+ struct cgroup_subsys *ss, *failed_ss = NULL;
+ struct cgroup *oldcgrp;
+ struct css_set *oldcg;
+ struct cgroupfs_root *root = cgrp->root;
+ /* threadgroup list cursor */
+ struct task_struct *tsk;
+ /*
+ * we need to make sure we have css_sets for all the tasks we're
+ * going to move -before- we actually start moving them, so that in
+ * case we get an ENOMEM we can bail out before making any changes.
+ */
+ struct list_head newcg_list;
+ struct cg_list_entry *cg_entry, *temp_nobe;
+
+ /* check that we can legitimately attach to the cgroup. */
+ for_each_subsys(root, ss) {
+ if (ss->can_attach) {
+ retval = ss->can_attach(ss, cgrp, leader, true);
+ if (retval) {
+ failed_ss = ss;
+ goto out;
+ }
+ }
+ }
+
+ /*
+ * step 1: make sure css_sets exist for all threads to be migrated.
+ * we use find_css_set, which allocates a new one if necessary.
+ */
+ INIT_LIST_HEAD(&newcg_list);
+ oldcgrp = task_cgroup_from_root(leader, root);
+ if (cgrp != oldcgrp) {
+ /* get old css_set */
+ task_lock(leader);
+ if (leader->flags & PF_EXITING) {
+ task_unlock(leader);
+ goto prefetch_loop;
+ }
+ oldcg = leader->cgroups;
+ get_css_set(oldcg);
+ task_unlock(leader);
+ /* acquire new one */
+ retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+ put_css_set(oldcg);
+ if (retval)
+ goto list_teardown;
+ }
+prefetch_loop:
+ rcu_read_lock();
+ /* sanity check - if we raced with de_thread, we must abort */
+ if (!thread_group_leader(leader)) {
+ retval = -EAGAIN;
+ goto list_teardown;
+ }
+ /*
+ * if we need to fetch a new css_set for this task, we must exit the
+ * rcu_read section because allocating it can sleep. afterwards, we'll
+ * need to restart iteration on the threadgroup list - the whole thing
+ * will be O(nm) in the number of threads and css_sets; as the typical
+ * case has only one css_set for all of them, usually O(n). which ones
+ * we need allocated won't change as long as we hold cgroup_mutex.
+ */
+ list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
+ /* nothing to do if this task is already in the cgroup */
+ oldcgrp = task_cgroup_from_root(tsk, root);
+ if (cgrp == oldcgrp)
+ continue;
+ /* get old css_set pointer */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ /* ignore this task if it's going away */
+ task_unlock(tsk);
+ continue;
+ }
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
+ task_unlock(tsk);
+ /* see if the new one for us is already in the list? */
+ if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
+ /* was already there, nothing to do. */
+ put_css_set(oldcg);
+ } else {
+ /* we don't already have it. get new one. */
+ rcu_read_unlock();
+ retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+ put_css_set(oldcg);
+ if (retval)
+ goto list_teardown;
+ /* begin iteration again. */
+ goto prefetch_loop;
+ }
+ }
+ rcu_read_unlock();
+
+ /*
+ * step 2: now that we're guaranteed success wrt the css_sets, proceed
+ * to move all tasks to the new cgroup. we need to lock against possible
+ * races with fork(). note: we can safely access leader->signal because
+ * attach_task_by_pid takes a reference on leader, which guarantees that
+ * the signal_struct will stick around. threadgroup_fork_lock must be
+ * taken outside of tasklist_lock to match the order in the fork path.
+ */
+ BUG_ON(!leader->signal);
+ down_write(&leader->signal->threadgroup_fork_lock);
+ read_lock(&tasklist_lock);
+ /* sanity check - if we raced with de_thread, we must abort */
+ if (!thread_group_leader(leader)) {
+ retval = -EAGAIN;
+ read_unlock(&tasklist_lock);
+ up_write(&leader->signal->threadgroup_fork_lock);
+ goto list_teardown;
+ }
+ /*
+ * No failure cases left, so this is the commit point.
+ *
+ * If the leader is already there, skip moving him. Note: even if the
+ * leader is PF_EXITING, we still move all other threads; if everybody
+ * is PF_EXITING, we end up doing nothing, which is ok.
+ */
+ oldcgrp = task_cgroup_from_root(leader, root);
+ if (cgrp != oldcgrp) {
+ retval = cgroup_task_migrate(cgrp, oldcgrp, leader, true);
+ BUG_ON(retval != 0 && retval != -ESRCH);
+ }
+ /* Now iterate over each thread in the group. */
+ list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
+ BUG_ON(tsk->signal != leader->signal);
+ /* leave current thread as it is if it's already there */
+ oldcgrp = task_cgroup_from_root(tsk, root);
+ if (cgrp == oldcgrp)
+ continue;
+ /* we don't care whether these threads are exiting */
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
+ BUG_ON(retval != 0 && retval != -ESRCH);
+ }
+
+ /*
+ * step 3: attach whole threadgroup to each subsystem
+ * TODO: if ever a subsystem needs to know the oldcgrp for each task
+ * being moved, this call will need to be reworked to communicate that.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->attach)
+ ss->attach(ss, cgrp, oldcgrp, leader, true);
+ }
+ /* holding these until here keeps us safe from exec() and fork(). */
+ read_unlock(&tasklist_lock);
+ up_write(&leader->signal->threadgroup_fork_lock);
+
+ /*
+ * step 4: success! and cleanup
+ */
+ synchronize_rcu();
+ cgroup_wakeup_rmdir_waiter(cgrp);
+ retval = 0;
+list_teardown:
+ /* clean up the list of prefetched css_sets. */
+ list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
+ list_del(&cg_entry->links);
+ put_css_set(cg_entry->cg);
+ kfree(cg_entry);
+ }
+out:
+ if (retval) {
+ /* same deal as in cgroup_attach_task, with threadgroup=true */
+ for_each_subsys(root, ss) {
+ if (ss == failed_ss)
+ break;
+ if (ss->cancel_attach)
+ ss->cancel_attach(ss, cgrp, leader, true);
+ }
+ }
+ return retval;
+}
+
+/*
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will take
+ * cgroup_mutex; may take task_lock of task.
+ */
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
{
struct task_struct *tsk;
const struct cred *cred = current_cred(), *tcred;
int ret;
+ if (!cgroup_lock_live_group(cgrp))
+ return -ENODEV;
+
if (pid) {
rcu_read_lock();
tsk = find_task_by_vpid(pid);
- if (!tsk || tsk->flags & PF_EXITING) {
+ if (!tsk) {
+ rcu_read_unlock();
+ cgroup_unlock();
+ return -ESRCH;
+ }
+ if (threadgroup) {
+ /*
+ * it is safe to find group_leader because tsk was found
+ * in the tid map, meaning it can't have been unhashed
+ * by someone in de_thread changing the leadership.
+ */
+ tsk = tsk->group_leader;
+ BUG_ON(!thread_group_leader(tsk));
+ } else if (tsk->flags & PF_EXITING) {
+ /* optimization for the single-task-only case */
rcu_read_unlock();
+ cgroup_unlock();
return -ESRCH;
}
+ /*
+ * even if we're attaching all tasks in the thread group, we
+ * only need to check permissions on one of them.
+ */
tcred = __task_cred(tsk);
if (cred->euid &&
cred->euid != tcred->uid &&
cred->euid != tcred->suid) {
rcu_read_unlock();
+ cgroup_unlock();
return -EACCES;
}
get_task_struct(tsk);
rcu_read_unlock();
} else {
- tsk = current;
+ if (threadgroup)
+ tsk = current->group_leader;
+ else
+ tsk = current;
get_task_struct(tsk);
}
- ret = cgroup_attach_task(cgrp, tsk);
+ if (threadgroup)
+ ret = cgroup_attach_proc(cgrp, tsk);
+ else
+ ret = cgroup_attach_task(cgrp, tsk);
put_task_struct(tsk);
+ cgroup_unlock();
return ret;
}
static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
{
+ return attach_task_by_pid(cgrp, pid, false);
+}
+
+static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+{
int ret;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
- ret = attach_task_by_pid(cgrp, pid);
- cgroup_unlock();
+ do {
+ /*
+ * attach_proc fails with -EAGAIN if threadgroup leadership
+ * changes in the middle of the operation, in which case we need
+ * to find the task_struct for the new leader and start over.
+ */
+ ret = attach_task_by_pid(cgrp, tgid, true);
+ } while (ret == -EAGAIN);
return ret;
}
@@ -3168,9 +3498,9 @@ static struct cftype files[] = {
{
.name = CGROUP_FILE_GENERIC_PREFIX "procs",
.open = cgroup_procs_open,
- /* .write_u64 = cgroup_procs_write, TODO */
+ .write_u64 = cgroup_procs_write,
.release = cgroup_pidlist_release,
- .mode = S_IRUGO,
+ .mode = S_IRUGO | S_IWUSR,
},
{
.name = "notify_on_release",
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index ce71ed5..daf0249 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -190,6 +190,10 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
struct task_struct *c;
rcu_read_lock();
+ if (!thread_group_leader(task)) {
+ rcu_read_unlock();
+ return -EAGAIN;
+ }
list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
if (is_task_frozen_enough(c)) {
rcu_read_unlock();
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b23c097..3d7c978 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1404,6 +1404,10 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
struct task_struct *c;
rcu_read_lock();
+ if (!thread_group_leader(tsk)) {
+ rcu_read_unlock();
+ return -EAGAIN;
+ }
list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
ret = security_task_setscheduler(c, 0, NULL);
if (ret) {
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 2a5dfec..ecd15d2 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -59,6 +59,10 @@ static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
if (threadgroup) {
struct task_struct *c;
rcu_read_lock();
+ if (!thread_group_leader(task)) {
+ rcu_read_unlock();
+ return -EAGAIN;
+ }
list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
if (!cgroup_is_descendant(new_cgroup, c)) {
rcu_read_unlock();
diff --git a/kernel/sched.c b/kernel/sched.c
index 70fa78d..df53f53 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8721,6 +8721,10 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
if (threadgroup) {
struct task_struct *c;
rcu_read_lock();
+ if (!thread_group_leader(tsk)) {
+ rcu_read_unlock();
+ return -EAGAIN;
+ }
list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
retval = cpu_cgroup_can_attach_task(cgrp, c);
if (retval) {
On Fri, 30 Jul 2010 19:56:49 -0400
Ben Blum <[email protected]> wrote:
> This patch series implements a write function for the 'cgroup.procs'
> per-cgroup file, which enables atomic movement of multithreaded
> applications between cgroups. Writing the thread-ID of any thread in a
> threadgroup to a cgroup's procs file causes all threads in the group to
> be moved to that cgroup safely with respect to threads forking/exiting.
> (Possible usage scenario: If running a multithreaded build system that
> sucks up system resources, this lets you restrict it all at once into a
> new cgroup to keep it under control.)
I can see how that would be useful. No comments from anyone else?
patch 1/2 makes me cry with all those ifdefs. Maybe helper functions
would help, but not a lot.
patch 2/2 looks very complicated.
On Tue, 3 Aug 2010 12:58:27 -0700
Andrew Morton <[email protected]> wrote:
> On Fri, 30 Jul 2010 19:56:49 -0400
> Ben Blum <[email protected]> wrote:
>
> > This patch series implements a write function for the 'cgroup.procs'
> > per-cgroup file, which enables atomic movement of multithreaded
> > applications between cgroups. Writing the thread-ID of any thread in a
> > threadgroup to a cgroup's procs file causes all threads in the group to
> > be moved to that cgroup safely with respect to threads forking/exiting.
> > (Possible usage scenario: If running a multithreaded build system that
> > sucks up system resources, this lets you restrict it all at once into a
> > new cgroup to keep it under control.)
>
> I can see how that would be useful. No comments from anyone else?
>
I think the feature itself is good and useful. I welcome this.
> patch 1/2 makes me cry with all those ifdefs. Maybe helper functions
> would help, but not a lot.
>
Add static inline functions ?
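Something like the following, for example (helper names here are just a
sketch, not part of the patch), would keep the ifdefs in one place and let
copy_process() call the helpers for CLONE_THREAD forks without any #ifdef
at the call site:

    #ifdef CONFIG_CGROUPS
    static inline void threadgroup_fork_read_lock(struct task_struct *tsk)
    {
            down_read(&tsk->signal->threadgroup_fork_lock);
    }
    static inline void threadgroup_fork_read_unlock(struct task_struct *tsk)
    {
            up_read(&tsk->signal->threadgroup_fork_lock);
    }
    #else
    static inline void threadgroup_fork_read_lock(struct task_struct *tsk) {}
    static inline void threadgroup_fork_read_unlock(struct task_struct *tsk) {}
    #endif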
> patch 2/2 looks very complicated.
yes. that's a concern.
I'd like to look deeper, today.
Thanks,
-Kame
On Fri, 30 Jul 2010 19:59:02 -0400
Ben Blum <[email protected]> wrote:
> Makes procs file writable to move all threads by tgid at once
>
> From: Ben Blum <[email protected]>
>
> This patch adds functionality that enables users to move all threads in a
> threadgroup at once to a cgroup by writing the tgid to the 'cgroup.procs'
> file. This current implementation makes use of a per-threadgroup rwsem that's
> taken for reading in the fork() path to prevent newly forking threads within
> the threadgroup from "escaping" while the move is in progress.
>
> Signed-off-by: Ben Blum <[email protected]>
> ---
> Documentation/cgroups/cgroups.txt | 13 +
> kernel/cgroup.c | 426 +++++++++++++++++++++++++++++++++----
> kernel/cgroup_freezer.c | 4
> kernel/cpuset.c | 4
> kernel/ns_cgroup.c | 4
> kernel/sched.c | 4
> 6 files changed, 405 insertions(+), 50 deletions(-)
>
> diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
> index b34823f..5f3c707 100644
> --- a/Documentation/cgroups/cgroups.txt
> +++ b/Documentation/cgroups/cgroups.txt
> @@ -235,7 +235,8 @@ containing the following files describing that cgroup:
> - cgroup.procs: list of tgids in the cgroup. This list is not
> guaranteed to be sorted or free of duplicate tgids, and userspace
> should sort/uniquify the list if this property is required.
> - This is a read-only file, for now.
> + Writing a thread group id into this file moves all threads in that
> + group into this cgroup.
> - notify_on_release flag: run the release agent on exit?
> - release_agent: the path to use for release notifications (this file
> exists in the top cgroup only)
> @@ -416,6 +417,12 @@ You can attach the current shell task by echoing 0:
>
> # echo 0 > tasks
>
> +You can use the cgroup.procs file instead of the tasks file to move all
> +threads in a threadgroup at once. Echoing the pid of any task in a
> +threadgroup to cgroup.procs causes all tasks in that threadgroup to be
> +attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
> +in the writing task's threadgroup.
> +
> 2.3 Mounting hierarchies by name
> --------------------------------
>
> @@ -564,7 +571,9 @@ called on a fork. If this method returns 0 (success) then this should
> remain valid while the caller holds cgroup_mutex and it is ensured that either
> attach() or cancel_attach() will be called in future. If threadgroup is
> true, then a successful result indicates that all threads in the given
> -thread's threadgroup can be moved together.
> +thread's threadgroup can be moved together. If the subsystem wants to
> +iterate over task->thread_group, it must take rcu_read_lock then check
> +if thread_group_leader(task), returning -EAGAIN if that fails.
>
> void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
> struct task_struct *task, bool threadgroup)
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index f91d7dd..fab8c87 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -1688,6 +1688,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
> }
> EXPORT_SYMBOL_GPL(cgroup_path);
>
> +/*
> + * cgroup_task_migrate - move a task from one cgroup to another.
> + *
> + * 'guarantee' is set if the caller promises that a new css_set for the task
> + * will already exit. If not set, this function might sleep, and can fail with
already exist ?
> + * -ENOMEM. Otherwise, it can only fail with -ESRCH.
> + */
> +static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
> + struct task_struct *tsk, bool guarantee)
> +{
> + struct css_set *oldcg;
> + struct css_set *newcg;
> +
> + /*
> + * get old css_set. we need to take task_lock and refcount it, because
> + * an exiting task can change its css_set to init_css_set and drop its
> + * old one without taking cgroup_mutex.
> + */
> + task_lock(tsk);
> + oldcg = tsk->cgroups;
> + get_css_set(oldcg);
> + task_unlock(tsk);
> +
> + /* locate or allocate a new css_set for this task. */
> + if (guarantee) {
> + /* we know the css_set we want already exists. */
> + struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
> + read_lock(&css_set_lock);
> + newcg = find_existing_css_set(oldcg, cgrp, template);
> + BUG_ON(!newcg);
> + get_css_set(newcg);
> + read_unlock(&css_set_lock);
> + } else {
> + might_sleep();
> + /* find_css_set will give us newcg already referenced. */
> + newcg = find_css_set(oldcg, cgrp);
> + if (!newcg) {
> + put_css_set(oldcg);
> + return -ENOMEM;
> + }
> + }
> + put_css_set(oldcg);
> +
> + /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
> + task_lock(tsk);
> + if (tsk->flags & PF_EXITING) {
> + task_unlock(tsk);
> + put_css_set(newcg);
> + return -ESRCH;
> + }
> + rcu_assign_pointer(tsk->cgroups, newcg);
> + task_unlock(tsk);
> +
> + /* Update the css_set linked lists if we're using them */
> + write_lock(&css_set_lock);
> + if (!list_empty(&tsk->cg_list))
> + list_move(&tsk->cg_list, &newcg->tasks);
> + write_unlock(&css_set_lock);
> +
> + /*
> + * We just gained a reference on oldcg by taking it from the task. As
> + * trading it for newcg is protected by cgroup_mutex, we're safe to drop
> + * it here; it will be freed under RCU.
> + */
> + put_css_set(oldcg);
> +
> + set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
> + return 0;
> +}
> +
> /**
> * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
> * @cgrp: the cgroup the task is attaching to
> @@ -1698,11 +1768,9 @@ EXPORT_SYMBOL_GPL(cgroup_path);
> */
> int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
> {
> - int retval = 0;
> + int retval;
> struct cgroup_subsys *ss, *failed_ss = NULL;
> struct cgroup *oldcgrp;
> - struct css_set *cg;
> - struct css_set *newcg;
> struct cgroupfs_root *root = cgrp->root;
>
> /* Nothing to do if the task is already in that cgroup */
> @@ -1726,46 +1794,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
> }
> }
>
> - task_lock(tsk);
> - cg = tsk->cgroups;
> - get_css_set(cg);
> - task_unlock(tsk);
> - /*
> - * Locate or allocate a new css_set for this task,
> - * based on its final set of cgroups
> - */
> - newcg = find_css_set(cg, cgrp);
> - put_css_set(cg);
> - if (!newcg) {
> - retval = -ENOMEM;
> - goto out;
> - }
> -
> - task_lock(tsk);
> - if (tsk->flags & PF_EXITING) {
> - task_unlock(tsk);
> - put_css_set(newcg);
> - retval = -ESRCH;
> + retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
> + if (retval)
> goto out;
> - }
> - rcu_assign_pointer(tsk->cgroups, newcg);
> - task_unlock(tsk);
> -
> - /* Update the css_set linked lists if we're using them */
> - write_lock(&css_set_lock);
> - if (!list_empty(&tsk->cg_list)) {
> - list_del(&tsk->cg_list);
> - list_add(&tsk->cg_list, &newcg->tasks);
> - }
> - write_unlock(&css_set_lock);
>
> for_each_subsys(root, ss) {
> if (ss->attach)
> ss->attach(ss, cgrp, oldcgrp, tsk, false);
> }
> - set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
> +
Hmm. By this, we call ss->attach(ss, cgrp, oldcgrp, tsk, false) after
marking CGRP_RELEASABLE+synchronize_rcu() to oldcgroup...is it safe ?
And why move it before attach() ?
> synchronize_rcu();
> - put_css_set(cg);
>
> /*
> * wake up rmdir() waiter. the rmdir should fail since the cgroup
> @@ -1791,49 +1829,341 @@ out:
> }
>
> /*
> - * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
> - * held. May take task_lock of task
> + * cgroup_attach_proc works in two stages, the first of which prefetches all
> + * new css_sets needed (to make sure we have enough memory before committing
> + * to the move) and stores them in a list of entries of the following type.
> + * TODO: possible optimization: use css_set->rcu_head for chaining instead
> + */
> +struct cg_list_entry {
> + struct css_set *cg;
> + struct list_head links;
> +};
> +
> +static bool css_set_check_fetched(struct cgroup *cgrp,
> + struct task_struct *tsk, struct css_set *cg,
> + struct list_head *newcg_list)
> +{
> + struct css_set *newcg;
> + struct cg_list_entry *cg_entry;
> + struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
> +
> + read_lock(&css_set_lock);
> + newcg = find_existing_css_set(cg, cgrp, template);
> + if (newcg)
> + get_css_set(newcg);
> + read_unlock(&css_set_lock);
> +
> + /* doesn't exist at all? */
> + if (!newcg)
> + return false;
> + /* see if it's already in the list */
> + list_for_each_entry(cg_entry, newcg_list, links) {
> + if (cg_entry->cg == newcg) {
> + put_css_set(newcg);
> + return true;
> + }
> + }
> +
> + /* not found */
> + put_css_set(newcg);
> + return false;
> +}
> +
> +/*
> + * Find the new css_set and store it in the list in preparation for moving the
> + * given task to the given cgroup. Returns 0 or -ENOMEM.
> */
> -static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
> +static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
> + struct list_head *newcg_list)
> +{
> + struct css_set *newcg;
> + struct cg_list_entry *cg_entry;
> +
> + /* ensure a new css_set will exist for this thread */
> + newcg = find_css_set(cg, cgrp);
> + if (!newcg)
> + return -ENOMEM;
> + /* add it to the list */
> + cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
> + if (!cg_entry) {
> + put_css_set(newcg);
> + return -ENOMEM;
> + }
> + cg_entry->cg = newcg;
> + list_add(&cg_entry->links, newcg_list);
> + return 0;
> +}
> +
> +/**
> + * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
> + * @cgrp: the cgroup to attach to
> + * @leader: the threadgroup leader task_struct of the group to be attached
> + *
> + * Call holding cgroup_mutex. Will take task_lock of each thread in leader's
> + * threadgroup individually in turn.
> + */
> +int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
> +{
> + int retval;
> + struct cgroup_subsys *ss, *failed_ss = NULL;
> + struct cgroup *oldcgrp;
> + struct css_set *oldcg;
> + struct cgroupfs_root *root = cgrp->root;
> + /* threadgroup list cursor */
> + struct task_struct *tsk;
> + /*
> + * we need to make sure we have css_sets for all the tasks we're
> + * going to move -before- we actually start moving them, so that in
> + * case we get an ENOMEM we can bail out before making any changes.
> + */
> + struct list_head newcg_list;
> + struct cg_list_entry *cg_entry, *temp_nobe;
> +
> + /* check that we can legitimately attach to the cgroup. */
> + for_each_subsys(root, ss) {
> + if (ss->can_attach) {
> + retval = ss->can_attach(ss, cgrp, leader, true);
> + if (retval) {
> + failed_ss = ss;
> + goto out;
> + }
> + }
> + }
Then, we cannot do attach limitation control per thread ? (This just checks the leader.)
Is it ok for all subsys ?
> +
> + /*
> + * step 1: make sure css_sets exist for all threads to be migrated.
> + * we use find_css_set, which allocates a new one if necessary.
> + */
> + INIT_LIST_HEAD(&newcg_list);
> + oldcgrp = task_cgroup_from_root(leader, root);
> + if (cgrp != oldcgrp) {
> + /* get old css_set */
> + task_lock(leader);
> + if (leader->flags & PF_EXITING) {
> + task_unlock(leader);
> + goto prefetch_loop;
> + }
Why do we continue here ? not -ESRCH ?
> + oldcg = leader->cgroups;
> + get_css_set(oldcg);
> + task_unlock(leader);
> + /* acquire new one */
> + retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
> + put_css_set(oldcg);
> + if (retval)
> + goto list_teardown;
> + }
> +prefetch_loop:
> + rcu_read_lock();
> + /* sanity check - if we raced with de_thread, we must abort */
> + if (!thread_group_leader(leader)) {
> + retval = -EAGAIN;
> + goto list_teardown;
> + }
EAGAIN ? ESRCH ? or EBUSY ?
> + /*
> + * if we need to fetch a new css_set for this task, we must exit the
> + * rcu_read section because allocating it can sleep. afterwards, we'll
> + * need to restart iteration on the threadgroup list - the whole thing
> + * will be O(nm) in the number of threads and css_sets; as the typical
> + * case has only one css_set for all of them, usually O(n). which ones
> + * we need allocated won't change as long as we hold cgroup_mutex.
> + */
> + list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
> + /* nothing to do if this task is already in the cgroup */
> + oldcgrp = task_cgroup_from_root(tsk, root);
> + if (cgrp == oldcgrp)
> + continue;
> + /* get old css_set pointer */
> + task_lock(tsk);
> + if (tsk->flags & PF_EXITING) {
> + /* ignore this task if it's going away */
> + task_unlock(tsk);
It's going away but seems to exist for a while....then, "continue" is safe
for keeping consistency ?
> + continue;
> + }
> + oldcg = tsk->cgroups;
> + get_css_set(oldcg);
> + task_unlock(tsk);
> + /* see if the new one for us is already in the list? */
> + if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
> + /* was already there, nothing to do. */
> + put_css_set(oldcg);
> + } else {
> + /* we don't already have it. get new one. */
> + rcu_read_unlock();
> + retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
> + put_css_set(oldcg);
> + if (retval)
> + goto list_teardown;
> + /* begin iteration again. */
> + goto prefetch_loop;
Hmm ? Why do we need to restart from the 1st entry ?
(maybe because of rcu_read_unlock() ?)
Does this function work well if the process has 10000+ threads ?
How about this logic ?
==
/* At first, find out necessary things */
rcu_read_lock();
list_for_each_entry_rcu() {
oldcgrp = task_cgroup_from_root(tsk, root);
if (oldcgrp == cgrp)
continue;
task_lock(task);
if (task->flags & PF_EXITING) {
task_unlock(task);
continue;
}
oldcg = tsk->cgroups;
get_css_set(oldcg);
task_unlock(task);
read_lock(&css_set_lock);
newcg = find_existing_css_set(oldcgrp cgrp, template);
if (newcg)
remember_this_newcg(newcg, &found_cg_array); {
put_css_set(oldcg);
} else
remember_need_to_allocate(oldcg, &need_to_allocate_array);
}
rcu_read_unlock();
/* Sort all cg_list found and drop doubly counted ones, drop refcnt if necessary */
sort_and_unique(found_cg_array);
/* Sort all cg_list not found and drop doubly counted ones, drop refcnt if necessary */
sort_and_unique(need_to_allocate_array);
/* Allocate new ones */
newly_allocated_array = allocate_new_cg_lists(need_to_allocate_array);
drop_refcnt_of_old_cgs(need_to_allocate_array);
/* Now we have all necessary cg_list */
> + }
> + }
> + rcu_read_unlock();
> +
> + /*
> + * step 2: now that we're guaranteed success wrt the css_sets, proceed
> + * to move all tasks to the new cgroup. we need to lock against possible
> + * races with fork(). note: we can safely access leader->signal because
> + * attach_task_by_pid takes a reference on leader, which guarantees that
> + * the signal_struct will stick around. threadgroup_fork_lock must be
> + * taken outside of tasklist_lock to match the order in the fork path.
> + */
> + BUG_ON(!leader->signal);
> + down_write(&leader->signal->threadgroup_fork_lock);
> + read_lock(&tasklist_lock);
> + /* sanity check - if we raced with de_thread, we must abort */
> + if (!thread_group_leader(leader)) {
> + retval = -EAGAIN;
> + read_unlock(&tasklist_lock);
> + up_write(&leader->signal->threadgroup_fork_lock);
> + goto list_teardown;
> + }
> + /*
> + * No failure cases left, so this is the commit point.
> + *
> + * If the leader is already there, skip moving him. Note: even if the
> + * leader is PF_EXITING, we still move all other threads; if everybody
> + * is PF_EXITING, we end up doing nothing, which is ok.
> + */
> + oldcgrp = task_cgroup_from_root(leader, root);
> + if (cgrp != oldcgrp) {
> + retval = cgroup_task_migrate(cgrp, oldcgrp, leader, true);
> + BUG_ON(retval != 0 && retval != -ESRCH);
> + }
> + /* Now iterate over each thread in the group. */
> + list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
> + BUG_ON(tsk->signal != leader->signal);
> + /* leave current thread as it is if it's already there */
> + oldcgrp = task_cgroup_from_root(tsk, root);
> + if (cgrp == oldcgrp)
> + continue;
> + /* we don't care whether these threads are exiting */
> + retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
> + BUG_ON(retval != 0 && retval != -ESRCH);
> + }
> +
> + /*
> + * step 3: attach whole threadgroup to each subsystem
> + * TODO: if ever a subsystem needs to know the oldcgrp for each task
> + * being moved, this call will need to be reworked to communicate that.
> + */
> + for_each_subsys(root, ss) {
> + if (ss->attach)
> + ss->attach(ss, cgrp, oldcgrp, leader, true);
> + }
> + /* holding these until here keeps us safe from exec() and fork(). */
> + read_unlock(&tasklist_lock);
> + up_write(&leader->signal->threadgroup_fork_lock);
> +
> + /*
> + * step 4: success! and cleanup
> + */
> + synchronize_rcu();
> + cgroup_wakeup_rmdir_waiter(cgrp);
> + retval = 0;
> +list_teardown:
> + /* clean up the list of prefetched css_sets. */
> + list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
> + list_del(&cg_entry->links);
> + put_css_set(cg_entry->cg);
> + kfree(cg_entry);
> + }
> +out:
> + if (retval) {
> + /* same deal as in cgroup_attach_task, with threadgroup=true */
> + for_each_subsys(root, ss) {
> + if (ss == failed_ss)
> + break;
> + if (ss->cancel_attach)
> + ss->cancel_attach(ss, cgrp, leader, true);
> + }
> + }
> + return retval;
> +}
> +
> +/*
> + * Find the task_struct of the task to attach by vpid and pass it along to the
> + * function to attach either it or all tasks in its threadgroup. Will take
> + * cgroup_mutex; may take task_lock of task.
> + */
> +static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
> {
> struct task_struct *tsk;
> const struct cred *cred = current_cred(), *tcred;
> int ret;
>
> + if (!cgroup_lock_live_group(cgrp))
> + return -ENODEV;
> +
> if (pid) {
> rcu_read_lock();
> tsk = find_task_by_vpid(pid);
> - if (!tsk || tsk->flags & PF_EXITING) {
> + if (!tsk) {
> + rcu_read_unlock();
> + cgroup_unlock();
> + return -ESRCH;
> + }
> + if (threadgroup) {
> + /*
> + * it is safe to find group_leader because tsk was found
> + * in the tid map, meaning it can't have been unhashed
> + * by someone in de_thread changing the leadership.
> + */
> + tsk = tsk->group_leader;
> + BUG_ON(!thread_group_leader(tsk));
> + } else if (tsk->flags & PF_EXITING) {
> + /* optimization for the single-task-only case */
> rcu_read_unlock();
> + cgroup_unlock();
> return -ESRCH;
> }
>
> + /*
> + * even if we're attaching all tasks in the thread group, we
> + * only need to check permissions on one of them.
> + */
> tcred = __task_cred(tsk);
> if (cred->euid &&
> cred->euid != tcred->uid &&
> cred->euid != tcred->suid) {
> rcu_read_unlock();
> + cgroup_unlock();
> return -EACCES;
> }
> get_task_struct(tsk);
> rcu_read_unlock();
> } else {
> - tsk = current;
> + if (threadgroup)
> + tsk = current->group_leader;
> + else
I'm not sure but "group_leader" is safe to access here ?
> + tsk = current;
> get_task_struct(tsk);
> }
>
> - ret = cgroup_attach_task(cgrp, tsk);
> + if (threadgroup)
> + ret = cgroup_attach_proc(cgrp, tsk);
> + else
> + ret = cgroup_attach_task(cgrp, tsk);
> put_task_struct(tsk);
> + cgroup_unlock();
> return ret;
> }
>
> static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
> {
> + return attach_task_by_pid(cgrp, pid, false);
> +}
> +
> +static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
> +{
> int ret;
> - if (!cgroup_lock_live_group(cgrp))
> - return -ENODEV;
> - ret = attach_task_by_pid(cgrp, pid);
> - cgroup_unlock();
> + do {
> + /*
> + * attach_proc fails with -EAGAIN if threadgroup leadership
> + * changes in the middle of the operation, in which case we need
> + * to find the task_struct for the new leader and start over.
> + */
> + ret = attach_task_by_pid(cgrp, tgid, true);
> + } while (ret == -EAGAIN);
> return ret;
> }
>
> @@ -3168,9 +3498,9 @@ static struct cftype files[] = {
> {
> .name = CGROUP_FILE_GENERIC_PREFIX "procs",
> .open = cgroup_procs_open,
> - /* .write_u64 = cgroup_procs_write, TODO */
> + .write_u64 = cgroup_procs_write,
> .release = cgroup_pidlist_release,
> - .mode = S_IRUGO,
> + .mode = S_IRUGO | S_IWUSR,
> },
> {
> .name = "notify_on_release",
> diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
> index ce71ed5..daf0249 100644
> --- a/kernel/cgroup_freezer.c
> +++ b/kernel/cgroup_freezer.c
> @@ -190,6 +190,10 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
> struct task_struct *c;
>
> rcu_read_lock();
> + if (!thread_group_leader(task)) {
> + rcu_read_unlock();
> + return -EAGAIN;
> + }
> list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
> if (is_task_frozen_enough(c)) {
> rcu_read_unlock();
> diff --git a/kernel/cpuset.c b/kernel/cpuset.c
> index b23c097..3d7c978 100644
> --- a/kernel/cpuset.c
> +++ b/kernel/cpuset.c
> @@ -1404,6 +1404,10 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
> struct task_struct *c;
>
> rcu_read_lock();
> + if (!thread_group_leader(tsk)) {
> + rcu_read_unlock();
> + return -EAGAIN;
> + }
> list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
> ret = security_task_setscheduler(c, 0, NULL);
> if (ret) {
> diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
> index 2a5dfec..ecd15d2 100644
> --- a/kernel/ns_cgroup.c
> +++ b/kernel/ns_cgroup.c
> @@ -59,6 +59,10 @@ static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
> if (threadgroup) {
> struct task_struct *c;
> rcu_read_lock();
> + if (!thread_group_leader(task)) {
> + rcu_read_unlock();
> + return -EAGAIN;
> + }
> list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
> if (!cgroup_is_descendant(new_cgroup, c)) {
> rcu_read_unlock();
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 70fa78d..df53f53 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -8721,6 +8721,10 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
> if (threadgroup) {
> struct task_struct *c;
> rcu_read_lock();
> + if (!thread_group_leader(tsk)) {
> + rcu_read_unlock();
> + return -EAGAIN;
> + }
> list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
> retval = cpu_cgroup_can_attach_task(cgrp, c);
> if (retval) {
Thanks,
-Kame
Andrew Morton wrote:
> On Fri, 30 Jul 2010 19:56:49 -0400
> Ben Blum <[email protected]> wrote:
>
>> This patch series implements a write function for the 'cgroup.procs'
>> per-cgroup file, which enables atomic movement of multithreaded
>> applications between cgroups. Writing the thread-ID of any thread in a
>> threadgroup to a cgroup's procs file causes all threads in the group to
>> be moved to that cgroup safely with respect to threads forking/exiting.
>> (Possible usage scenario: If running a multithreaded build system that
>> sucks up system resources, this lets you restrict it all at once into a
>> new cgroup to keep it under control.)
>
> I can see how that would be useful. No comments from anyone else?
>
Oleg had been commenting on this patchset, so it would be nice to know
if he's comfortable with the changes in this version.
On Fri, Jul 30, 2010 at 4:57 PM, Ben Blum <[email protected]> wrote:
> + * The threadgroup_fork_lock prevents threads from forking with
> + * CLONE_THREAD while held for writing. Use this for fork-sensitive
> + * threadgroup-wide operations. It's taken for reading in fork.c in
> + * copy_process().
> + * Currently only needed write-side by cgroups.
> + */
> + struct rw_semaphore threadgroup_fork_lock;
> +#endif
I'm not sure how best to word this comment, but I'd prefer something like:
"The threadgroup_fork_lock is taken in read mode during a CLONE_THREAD
fork operation; taking it in write mode prevents the owning
threadgroup from adding any new threads and thus allows you to
synchronize against the addition of unseen threads when performing
threadgroup-wide operations. New-process forks (without CLONE_THREAD)
are not affected."
As far as the #ifdef mess goes, it's true that some people don't have
CONFIG_CGROUPS defined. I'd imagine that these are likely to be
embedded systems with a fairly small number of processes and threads
per process. Are there really any such platforms where the cost of a
single extra rwsem per process is going to make a difference either in
terms of memory or lock contention? I think you should consider making
these additions unconditional.
Paul
Hi Ben, Kame,
Sorry for the delay in getting to look at this,
On Tue, Aug 3, 2010 at 6:08 PM, KAMEZAWA Hiroyuki
<[email protected]> wrote:
>>
>> for_each_subsys(root, ss) {
>> if (ss->attach)
>> ss->attach(ss, cgrp, oldcgrp, tsk, false);
>> }
>> - set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
>> +
>
> Hmm. By this, we call ss->attach(ss, cgrp, oldcgrp, tsk, false) after
> marking CGRP_RELEASABLE+synchronize_rcu() to oldcgroup...is it safe ?
> And why move it before attach() ?
>
Marking as releasable should be fine - the only time this is cleared
is when you write to notify_on_release.
I think that the put_css_set(oldcg) and synchronize_rcu() is safe, by
the following logic:
- we took cgroup_lock in cgroup_tasks_write()
- after this point, oldcgrp was initialized from the task's cgroup;
therefore oldcgrp still existed at this point
- even if all the threads (including the one being operated on) exit
(and hence leave oldcgrp) while we're doing the attach, we're holding
cgroup_lock so no-one else can delete oldcgrp
So it's certainly possible that the task in question has exited by the
time we call the subsys attach methods, but oldcgrp should still be
alive.
Whether we need an additional synchronize_rcu() after the attach()
calls is harder to determine - I guess it's better to be safe than
sorry, unless people are seeing specific performance issues with this.
I think the css_set_check_fetched() function needs more comments
explaining its behaviour and what its return value indicates.
>> + /*
>> + * we need to make sure we have css_sets for all the tasks we're
>> + * going to move -before- we actually start moving them, so that in
>> + * case we get an ENOMEM we can bail out before making any changes.
More than that - even if we don't get an ENOMEM, we can't safely sleep
in the RCU section, so we'd either have to do all memory allocations
atomically (which would be bad and unreliable) or else we avoid the
need to allocate in the RCU section (which is the choice taken here).
>> + */
>> + struct list_head newcg_list;
>> + struct cg_list_entry *cg_entry, *temp_nobe;
>> +
>> + /* check that we can legitimately attach to the cgroup. */
>> + for_each_subsys(root, ss) {
>> + if (ss->can_attach) {
>> + retval = ss->can_attach(ss, cgrp, leader, true);
>> + if (retval) {
>> + failed_ss = ss;
>> + goto out;
>> + }
>> + }
>> + }
>
> Then, we cannot do attach limitation control per thread ? (This just checks the leader.)
> Is it ok for all subsys ?
By passing "true" as the "threadgroup" parameter to can_attach(),
we're letting the subsystem decide if it needs to do per-thread
checks. For most subsystems, calling them once for each thread would
be unnecessary.
>
>
>> +
>> + /*
>> + * step 1: make sure css_sets exist for all threads to be migrated.
>> + * we use find_css_set, which allocates a new one if necessary.
>> + */
>> + INIT_LIST_HEAD(&newcg_list);
>> + oldcgrp = task_cgroup_from_root(leader, root);
>> + if (cgrp != oldcgrp) {
>> + /* get old css_set */
>> + task_lock(leader);
>> + if (leader->flags & PF_EXITING) {
>> + task_unlock(leader);
>> + goto prefetch_loop;
>> + }
> Why do we continue here ? not -ESRCH ?
>
It's possible that some threads from the process are exiting while
we're trying to move the entire process. As long as we move at least
one thread, we shouldn't care if some of its threads are exiting.
Which means that after we've done the prefetch loop, we should
probably check that the newcg_list isn't empty, and return -ESRCH in
that case.
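Roughly (a sketch only, using the names already in this patch):

    /* after the prefetch loop */
    if (list_empty(&newcg_list)) {
            retval = -ESRCH;
            goto list_teardown;
    }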
>
>> + oldcg = leader->cgroups;
>> + get_css_set(oldcg);
>> + task_unlock(leader);
>> + /* acquire new one */
/* acquire a new css_set for the leader */
>> + * if we need to fetch a new css_set for this task, we must exit the
>> + * rcu_read section because allocating it can sleep. afterwards, we'll
>> + * need to restart iteration on the threadgroup list - the whole thing
>> + * will be O(nm) in the number of threads and css_sets; as the typical
>> + * case has only one css_set for all of them, usually O(n). which ones
Maybe better to say "in the worst case this is O(n^2) on the number of
threads; however, in the vast majority of cases all the threads will
be in the same cgroups as the leader and we'll make a just single pass
through the list with no additional allocations needed".
>
> It's going away but seems to exist for a while....then, "continue" is safe
> for keeping consistency ?
Yes, because we don't sleep so the RCU list is still valid.
>> + /* see if the new one for us is already in the list? */
/* See if we already have an appropriate css_set for this thread */
>> + /* begin iteration again. */
/* Since we may have slept in css_set_prefetch(), the RCU list is no
longer valid, so we must begin the iteration again; Any threads that
we've previously processed will pass the css_set_check_fetched() test
on subsequent iterations since we hold cgroup_lock, so we're
guaranteed to make progress. */
> Does this function work well if the process has 10000+ threads ?
In general there'll only be one cgroup so it'll be a single pass
through the list.
>
> How about this logic ?
> ==
>
> /* At first, find out necessary things */
> rcu_read_lock();
> list_for_each_entry_rcu() {
> oldcgrp = task_cgroup_from_root(tsk, root);
> if (oldcgrp == cgrp)
> continue;
> task_lock(task);
> if (task->flags & PF_EXITING) {
> task_unlock(task);
> continue;
> }
> oldcg = tsk->cgroups;
> get_css_set(oldcg);
> task_unlock(task);
> read_lock(&css_set_lock);
> newcg = find_existing_css_set(oldcgrp cgrp, template);
> if (newcg)
> remember_this_newcg(newcg, &found_cg_array); {
> put_css_set(oldcg);
> } else
> remember_need_to_allocate(oldcg, &need_to_allocate_array);
The problem with this is that remember_need_to_allocate() will itself
need to allocate memory in order to allow need_to_allocate_array to
expand arbitrarily. Which can't be done without GFP_ATOMIC or else
sleeping in the RCU section, neither of which are good.
>> +list_teardown:
>> + /* clean up the list of prefetched css_sets. */
>> + list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
>> + list_del(&cg_entry->links);
>> + put_css_set(cg_entry->cg);
>> + kfree(cg_entry);
>> + }
I wonder if we might need a synchronize_rcu() here?
>> --- a/kernel/cpuset.c
>> +++ b/kernel/cpuset.c
>> @@ -1404,6 +1404,10 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
>> 		struct task_struct *c;
>>
>> 		rcu_read_lock();
>> +		if (!thread_group_leader(tsk)) {
>> +			rcu_read_unlock();
>> +			return -EAGAIN;
>> +		}
Why are you adding this requirement, here and in sched.c? (ns_cgroup.c
doesn't matter since it's being deleted).
Paul
On Wed, Aug 04, 2010 at 10:08:11AM +0900, KAMEZAWA Hiroyuki wrote:
> On Fri, 30 Jul 2010 19:59:02 -0400
> Ben Blum <[email protected]> wrote:
>
> > Makes procs file writable to move all threads by tgid at once
> >
> > From: Ben Blum <[email protected]>
> >
> > This patch adds functionality that enables users to move all threads in a
> > threadgroup at once to a cgroup by writing the tgid to the 'cgroup.procs'
> > file. This current implementation makes use of a per-threadgroup rwsem that's
> > taken for reading in the fork() path to prevent newly forking threads within
> > the threadgroup from "escaping" while the move is in progress.
> >
> > Signed-off-by: Ben Blum <[email protected]>
> > ---
> > Documentation/cgroups/cgroups.txt | 13 +
> > kernel/cgroup.c | 426 +++++++++++++++++++++++++++++++++----
> > kernel/cgroup_freezer.c | 4
> > kernel/cpuset.c | 4
> > kernel/ns_cgroup.c | 4
> > kernel/sched.c | 4
> > 6 files changed, 405 insertions(+), 50 deletions(-)
> >
> > diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
> > index b34823f..5f3c707 100644
> > --- a/Documentation/cgroups/cgroups.txt
> > +++ b/Documentation/cgroups/cgroups.txt
> > @@ -235,7 +235,8 @@ containing the following files describing that cgroup:
> > - cgroup.procs: list of tgids in the cgroup. This list is not
> > guaranteed to be sorted or free of duplicate tgids, and userspace
> > should sort/uniquify the list if this property is required.
> > - This is a read-only file, for now.
> > + Writing a thread group id into this file moves all threads in that
> > + group into this cgroup.
> > - notify_on_release flag: run the release agent on exit?
> > - release_agent: the path to use for release notifications (this file
> > exists in the top cgroup only)
> > @@ -416,6 +417,12 @@ You can attach the current shell task by echoing 0:
> >
> > # echo 0 > tasks
> >
> > +You can use the cgroup.procs file instead of the tasks file to move all
> > +threads in a threadgroup at once. Echoing the pid of any task in a
> > +threadgroup to cgroup.procs causes all tasks in that threadgroup to be
> > +attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
> > +in the writing task's threadgroup.
> > +
> > 2.3 Mounting hierarchies by name
> > --------------------------------
> >
> > @@ -564,7 +571,9 @@ called on a fork. If this method returns 0 (success) then this should
> > remain valid while the caller holds cgroup_mutex and it is ensured that either
> > attach() or cancel_attach() will be called in future. If threadgroup is
> > true, then a successful result indicates that all threads in the given
> > -thread's threadgroup can be moved together.
> > +thread's threadgroup can be moved together. If the subsystem wants to
> > +iterate over task->thread_group, it must take rcu_read_lock then check
> > +if thread_group_leader(task), returning -EAGAIN if that fails.
> >
> > void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
> > struct task_struct *task, bool threadgroup)
> > diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> > index f91d7dd..fab8c87 100644
> > --- a/kernel/cgroup.c
> > +++ b/kernel/cgroup.c
> > @@ -1688,6 +1688,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
> > }
> > EXPORT_SYMBOL_GPL(cgroup_path);
> >
> > +/*
> > + * cgroup_task_migrate - move a task from one cgroup to another.
> > + *
> > + * 'guarantee' is set if the caller promises that a new css_set for the task
> > + * will already exit. If not set, this function might sleep, and can fail with
>
> already exist ?
oops, yes. good catch.
> > + * -ENOMEM. Otherwise, it can only fail with -ESRCH.
> > + */
> > +static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
> > + struct task_struct *tsk, bool guarantee)
> > +{
> > + struct css_set *oldcg;
> > + struct css_set *newcg;
> > +
> > + /*
> > + * get old css_set. we need to take task_lock and refcount it, because
> > + * an exiting task can change its css_set to init_css_set and drop its
> > + * old one without taking cgroup_mutex.
> > + */
> > + task_lock(tsk);
> > + oldcg = tsk->cgroups;
> > + get_css_set(oldcg);
> > + task_unlock(tsk);
> > +
> > + /* locate or allocate a new css_set for this task. */
> > + if (guarantee) {
> > + /* we know the css_set we want already exists. */
> > + struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
> > + read_lock(&css_set_lock);
> > + newcg = find_existing_css_set(oldcg, cgrp, template);
> > + BUG_ON(!newcg);
> > + get_css_set(newcg);
> > + read_unlock(&css_set_lock);
> > + } else {
> > + might_sleep();
> > + /* find_css_set will give us newcg already referenced. */
> > + newcg = find_css_set(oldcg, cgrp);
> > + if (!newcg) {
> > + put_css_set(oldcg);
> > + return -ENOMEM;
> > + }
> > + }
> > + put_css_set(oldcg);
> > +
> > + /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
> > + task_lock(tsk);
> > + if (tsk->flags & PF_EXITING) {
> > + task_unlock(tsk);
> > + put_css_set(newcg);
> > + return -ESRCH;
> > + }
> > + rcu_assign_pointer(tsk->cgroups, newcg);
> > + task_unlock(tsk);
> > +
> > + /* Update the css_set linked lists if we're using them */
> > + write_lock(&css_set_lock);
> > + if (!list_empty(&tsk->cg_list))
> > + list_move(&tsk->cg_list, &newcg->tasks);
> > + write_unlock(&css_set_lock);
> > +
> > + /*
> > + * We just gained a reference on oldcg by taking it from the task. As
> > + * trading it for newcg is protected by cgroup_mutex, we're safe to drop
> > + * it here; it will be freed under RCU.
> > + */
> > + put_css_set(oldcg);
> > +
> > + set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
> > + return 0;
> > +}
> > +
> > /**
> > * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
> > * @cgrp: the cgroup the task is attaching to
> > @@ -1698,11 +1768,9 @@ EXPORT_SYMBOL_GPL(cgroup_path);
> > */
> > int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
> > {
> > - int retval = 0;
> > + int retval;
> > struct cgroup_subsys *ss, *failed_ss = NULL;
> > struct cgroup *oldcgrp;
> > - struct css_set *cg;
> > - struct css_set *newcg;
> > struct cgroupfs_root *root = cgrp->root;
> >
> > /* Nothing to do if the task is already in that cgroup */
> > @@ -1726,46 +1794,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
> > }
> > }
> >
> > - task_lock(tsk);
> > - cg = tsk->cgroups;
> > - get_css_set(cg);
> > - task_unlock(tsk);
> > - /*
> > - * Locate or allocate a new css_set for this task,
> > - * based on its final set of cgroups
> > - */
> > - newcg = find_css_set(cg, cgrp);
> > - put_css_set(cg);
> > - if (!newcg) {
> > - retval = -ENOMEM;
> > - goto out;
> > - }
> > -
> > - task_lock(tsk);
> > - if (tsk->flags & PF_EXITING) {
> > - task_unlock(tsk);
> > - put_css_set(newcg);
> > - retval = -ESRCH;
> > + retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
> > + if (retval)
> > goto out;
> > - }
> > - rcu_assign_pointer(tsk->cgroups, newcg);
> > - task_unlock(tsk);
> > -
> > - /* Update the css_set linked lists if we're using them */
> > - write_lock(&css_set_lock);
> > - if (!list_empty(&tsk->cg_list)) {
> > - list_del(&tsk->cg_list);
> > - list_add(&tsk->cg_list, &newcg->tasks);
> > - }
> > - write_unlock(&css_set_lock);
> >
> > for_each_subsys(root, ss) {
> > if (ss->attach)
> > ss->attach(ss, cgrp, oldcgrp, tsk, false);
> > }
> > - set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
> > +
>
> Hmm. By this, we call ss->attach(ss, cgrp, oldcgrp, tsk, false) after
> marking CGRP_RELEASABLE+synchronize_rcu() to oldcgroup...is it safe ?
I honestly don't remember (that logic was written like a year ago), but
I remember Paul confirming that it was ok. But things may have changed
around - I don't recall any "cgroup_release_and_wakeup_rmdir" semantics.
> And why move it before attach() ?
Makes it easier when there are arbitrarily many "oldcgrp"s - once you
migrate each task, you won't have its old cgroup to set the bit on by
the time you call attach().
> > synchronize_rcu();
> > - put_css_set(cg);
> >
> > /*
> > * wake up rmdir() waiter. the rmdir should fail since the cgroup
> > @@ -1791,49 +1829,341 @@ out:
> > }
> >
> > /*
> > - * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
> > - * held. May take task_lock of task
> > + * cgroup_attach_proc works in two stages, the first of which prefetches all
> > + * new css_sets needed (to make sure we have enough memory before committing
> > + * to the move) and stores them in a list of entries of the following type.
> > + * TODO: possible optimization: use css_set->rcu_head for chaining instead
> > + */
> > +struct cg_list_entry {
> > + struct css_set *cg;
> > + struct list_head links;
> > +};
> > +
> > +static bool css_set_check_fetched(struct cgroup *cgrp,
> > + struct task_struct *tsk, struct css_set *cg,
> > + struct list_head *newcg_list)
> > +{
> > + struct css_set *newcg;
> > + struct cg_list_entry *cg_entry;
> > + struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
> > +
> > + read_lock(&css_set_lock);
> > + newcg = find_existing_css_set(cg, cgrp, template);
> > + if (newcg)
> > + get_css_set(newcg);
> > + read_unlock(&css_set_lock);
> > +
> > + /* doesn't exist at all? */
> > + if (!newcg)
> > + return false;
> > + /* see if it's already in the list */
> > + list_for_each_entry(cg_entry, newcg_list, links) {
> > + if (cg_entry->cg == newcg) {
> > + put_css_set(newcg);
> > + return true;
> > + }
> > + }
> > +
> > + /* not found */
> > + put_css_set(newcg);
> > + return false;
> > +}
> > +
> > +/*
> > + * Find the new css_set and store it in the list in preparation for moving the
> > + * given task to the given cgroup. Returns 0 or -ENOMEM.
> > */
> > -static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
> > +static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
> > + struct list_head *newcg_list)
> > +{
> > + struct css_set *newcg;
> > + struct cg_list_entry *cg_entry;
> > +
> > + /* ensure a new css_set will exist for this thread */
> > + newcg = find_css_set(cg, cgrp);
> > + if (!newcg)
> > + return -ENOMEM;
> > + /* add it to the list */
> > + cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
> > + if (!cg_entry) {
> > + put_css_set(newcg);
> > + return -ENOMEM;
> > + }
> > + cg_entry->cg = newcg;
> > + list_add(&cg_entry->links, newcg_list);
> > + return 0;
> > +}
> > +
> > +/**
> > + * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
> > + * @cgrp: the cgroup to attach to
> > + * @leader: the threadgroup leader task_struct of the group to be attached
> > + *
> > + * Call holding cgroup_mutex. Will take task_lock of each thread in leader's
> > + * threadgroup individually in turn.
> > + */
> > +int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
> > +{
> > + int retval;
> > + struct cgroup_subsys *ss, *failed_ss = NULL;
> > + struct cgroup *oldcgrp;
> > + struct css_set *oldcg;
> > + struct cgroupfs_root *root = cgrp->root;
> > + /* threadgroup list cursor */
> > + struct task_struct *tsk;
> > + /*
> > + * we need to make sure we have css_sets for all the tasks we're
> > + * going to move -before- we actually start moving them, so that in
> > + * case we get an ENOMEM we can bail out before making any changes.
> > + */
> > + struct list_head newcg_list;
> > + struct cg_list_entry *cg_entry, *temp_nobe;
> > +
> > + /* check that we can legitimately attach to the cgroup. */
> > + for_each_subsys(root, ss) {
> > + if (ss->can_attach) {
> > + retval = ss->can_attach(ss, cgrp, leader, true);
> > + if (retval) {
> > + failed_ss = ss;
> > + goto out;
> > + }
> > + }
> > + }
>
> Then, we cannot do attach limitation control per thread ? (This just checks the leader.)
> Is it ok for all subsys ?
I believe it should be. At least for memory, there's no point to check
multiple threads that all share the same VM. :)
> > +
> > + /*
> > + * step 1: make sure css_sets exist for all threads to be migrated.
> > + * we use find_css_set, which allocates a new one if necessary.
> > + */
> > + INIT_LIST_HEAD(&newcg_list);
> > + oldcgrp = task_cgroup_from_root(leader, root);
> > + if (cgrp != oldcgrp) {
> > + /* get old css_set */
> > + task_lock(leader);
> > + if (leader->flags & PF_EXITING) {
> > + task_unlock(leader);
> > + goto prefetch_loop;
> > + }
> Why do we continue here ? not -ESRCH ?
The leader can exit and still have other threads going in its
threadgroup; in this case, we still want to move the rest of the
threads.
> > + oldcg = leader->cgroups;
> > + get_css_set(oldcg);
> > + task_unlock(leader);
> > + /* acquire new one */
> > + retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
> > + put_css_set(oldcg);
> > + if (retval)
> > + goto list_teardown;
> > + }
> > +prefetch_loop:
> > + rcu_read_lock();
> > + /* sanity check - if we raced with de_thread, we must abort */
> > + if (!thread_group_leader(leader)) {
> > + retval = -EAGAIN;
> > + goto list_teardown;
> > + }
>
> EAGAIN ? ESRCH ? or EBUSY ?
This happens in the following case: we have a pointer to A the leader, A
forks B, B exec, B becomes new leader. (It's dangerous if, after that,
this happens: B forks C, B exits. Now A->leader and A->thread_group.next
both point to nowhere. Thanks Oleg :) )
EBUSY might also be ok; I picked EAGAIN because it fits meaning-wise -
it's handled higher-up in the VFS write handler, so userspace doesn't
see it.
> > + /*
> > + * if we need to fetch a new css_set for this task, we must exit the
> > + * rcu_read section because allocating it can sleep. afterwards, we'll
> > + * need to restart iteration on the threadgroup list - the whole thing
> > + * will be O(nm) in the number of threads and css_sets; as the typical
> > + * case has only one css_set for all of them, usually O(n). which ones
> > + * we need allocated won't change as long as we hold cgroup_mutex.
> > + */
> > + list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
> > + /* nothing to do if this task is already in the cgroup */
> > + oldcgrp = task_cgroup_from_root(tsk, root);
> > + if (cgrp == oldcgrp)
> > + continue;
> > + /* get old css_set pointer */
> > + task_lock(tsk);
> > + if (tsk->flags & PF_EXITING) {
> > + /* ignore this task if it's going away */
> > + task_unlock(tsk);
>
> It's going away but seems to exist for a while....then, "continue" is safe
> for keeping consistency ?
Yes, it's going away but hasn't been unhashed yet. Since it's on the
thread_group list (and we have rcu_read), of course its next pointer is
sane.
> > + continue;
> > + }
> > + oldcg = tsk->cgroups;
> > + get_css_set(oldcg);
> > + task_unlock(tsk);
> > + /* see if the new one for us is already in the list? */
> > + if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
> > + /* was already there, nothing to do. */
> > + put_css_set(oldcg);
> > + } else {
> > + /* we don't already have it. get new one. */
> > + rcu_read_unlock();
> > + retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
> > + put_css_set(oldcg);
> > + if (retval)
> > + goto list_teardown;
> > + /* begin iteration again. */
> > + goto prefetch_loop;
>
> Hmm ? Why do we need to restart from the 1st entry ?
> (maybe because of rcu_read_unlock() ?)
Need to allocate (prefetch), can't do it while rcu_read is held.
> Does this function work well if the process has 10000+ threads ?
Depends on the css_sets - it is pretty unlikely that the threads will be
diversified enough that runtime will actually approach quadratic, and in
that case I'd rather have bad runtime (this is already an expensive
operation) than more complicated logic (which you proposed). Lord knows
it's already complicated enough :X
>
> How about this logic ?
> ==
>
> /* At first, find out necessary things */
> rcu_read_lock();
> list_for_each_entry_rcu() {
> oldcgrp = task_cgroup_from_root(tsk, root);
> if (oldcgrp == cgrp)
> continue;
> task_lock(task);
> if (task->flags & PF_EXITING) {
> task_unlock(task);
> continue;
> }
> oldcg = tsk->cgroups;
> get_css_set(oldcg);
> task_unlock(task);
> read_lock(&css_set_lock);
> newcg = find_existing_css_set(oldcg, cgrp, template);
> if (newcg) {
> remember_this_newcg(newcg, &found_cg_array);
> put_css_set(oldcg);
> } else
> remember_need_to_allocate(oldcg, &need_to_allocate_array);
> }
> rcu_read_unlock();
> /* Sort all cg_list found and drop doubly counted ones, drop refcnt if necessary */
> sort_and_unique(found_cg_array);
> /* Sort all cg_list not found and drop doubly counted ones, drop refcnt if necessary */
> sort_and_unique(need_to_allocate_array);
> /* Allocate new ones */
> newly_allocated_array = allocate_new_cg_lists(need_to_allocate_array);
> drop_refcnt_of_old_cgs(need_to_allocate_array);
>
> /* Now we have all necessary cg_list */
>
> > + }
> > + }
> > + rcu_read_unlock();
> > +
> > + /*
> > + * step 2: now that we're guaranteed success wrt the css_sets, proceed
> > + * to move all tasks to the new cgroup. we need to lock against possible
> > + * races with fork(). note: we can safely access leader->signal because
> > + * attach_task_by_pid takes a reference on leader, which guarantees that
> > + * the signal_struct will stick around. threadgroup_fork_lock must be
> > + * taken outside of tasklist_lock to match the order in the fork path.
> > + */
> > + BUG_ON(!leader->signal);
> > + down_write(&leader->signal->threadgroup_fork_lock);
> > + read_lock(&tasklist_lock);
> > + /* sanity check - if we raced with de_thread, we must abort */
> > + if (!thread_group_leader(leader)) {
> > + retval = -EAGAIN;
> > + read_unlock(&tasklist_lock);
> > + up_write(&leader->signal->threadgroup_fork_lock);
> > + goto list_teardown;
> > + }
> > + /*
> > + * No failure cases left, so this is the commit point.
> > + *
> > + * If the leader is already there, skip moving him. Note: even if the
> > + * leader is PF_EXITING, we still move all other threads; if everybody
> > + * is PF_EXITING, we end up doing nothing, which is ok.
> > + */
> > + oldcgrp = task_cgroup_from_root(leader, root);
> > + if (cgrp != oldcgrp) {
> > + retval = cgroup_task_migrate(cgrp, oldcgrp, leader, true);
> > + BUG_ON(retval != 0 && retval != -ESRCH);
> > + }
> > + /* Now iterate over each thread in the group. */
> > + list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
> > + BUG_ON(tsk->signal != leader->signal);
> > + /* leave current thread as it is if it's already there */
> > + oldcgrp = task_cgroup_from_root(tsk, root);
> > + if (cgrp == oldcgrp)
> > + continue;
> > + /* we don't care whether these threads are exiting */
> > + retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
> > + BUG_ON(retval != 0 && retval != -ESRCH);
> > + }
> > +
> > + /*
> > + * step 3: attach whole threadgroup to each subsystem
> > + * TODO: if ever a subsystem needs to know the oldcgrp for each task
> > + * being moved, this call will need to be reworked to communicate that.
> > + */
> > + for_each_subsys(root, ss) {
> > + if (ss->attach)
> > + ss->attach(ss, cgrp, oldcgrp, leader, true);
> > + }
> > + /* holding these until here keeps us safe from exec() and fork(). */
> > + read_unlock(&tasklist_lock);
> > + up_write(&leader->signal->threadgroup_fork_lock);
> > +
> > + /*
> > + * step 4: success! and cleanup
> > + */
> > + synchronize_rcu();
> > + cgroup_wakeup_rmdir_waiter(cgrp);
> > + retval = 0;
> > +list_teardown:
> > + /* clean up the list of prefetched css_sets. */
> > + list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
> > + list_del(&cg_entry->links);
> > + put_css_set(cg_entry->cg);
> > + kfree(cg_entry);
> > + }
> > +out:
> > + if (retval) {
> > + /* same deal as in cgroup_attach_task, with threadgroup=true */
> > + for_each_subsys(root, ss) {
> > + if (ss == failed_ss)
> > + break;
> > + if (ss->cancel_attach)
> > + ss->cancel_attach(ss, cgrp, leader, true);
> > + }
> > + }
> > + return retval;
> > +}
> > +
> > +/*
> > + * Find the task_struct of the task to attach by vpid and pass it along to the
> > + * function to attach either it or all tasks in its threadgroup. Will take
> > + * cgroup_mutex; may take task_lock of task.
> > + */
> > +static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
> > {
> > struct task_struct *tsk;
> > const struct cred *cred = current_cred(), *tcred;
> > int ret;
> >
> > + if (!cgroup_lock_live_group(cgrp))
> > + return -ENODEV;
> > +
> > if (pid) {
> > rcu_read_lock();
> > tsk = find_task_by_vpid(pid);
> > - if (!tsk || tsk->flags & PF_EXITING) {
> > + if (!tsk) {
> > + rcu_read_unlock();
> > + cgroup_unlock();
> > + return -ESRCH;
> > + }
> > + if (threadgroup) {
> > + /*
> > + * it is safe to find group_leader because tsk was found
> > + * in the tid map, meaning it can't have been unhashed
> > + * by someone in de_thread changing the leadership.
> > + */
> > + tsk = tsk->group_leader;
> > + BUG_ON(!thread_group_leader(tsk));
> > + } else if (tsk->flags & PF_EXITING) {
> > + /* optimization for the single-task-only case */
> > rcu_read_unlock();
> > + cgroup_unlock();
> > return -ESRCH;
> > }
> >
> > + /*
> > + * even if we're attaching all tasks in the thread group, we
> > + * only need to check permissions on one of them.
> > + */
> > tcred = __task_cred(tsk);
> > if (cred->euid &&
> > cred->euid != tcred->uid &&
> > cred->euid != tcred->suid) {
> > rcu_read_unlock();
> > + cgroup_unlock();
> > return -EACCES;
> > }
> > get_task_struct(tsk);
> > rcu_read_unlock();
> > } else {
> > - tsk = current;
> > + if (threadgroup)
> > + tsk = current->group_leader;
> > + else
>
> I'm not sure, but is "group_leader" safe to access here ?
current->group_leader is always safe, since current should have a
refcount on its leader.
>
> > + tsk = current;
> > get_task_struct(tsk);
> > }
> >
> > - ret = cgroup_attach_task(cgrp, tsk);
> > + if (threadgroup)
> > + ret = cgroup_attach_proc(cgrp, tsk);
> > + else
> > + ret = cgroup_attach_task(cgrp, tsk);
> > put_task_struct(tsk);
> > + cgroup_unlock();
> > return ret;
> > }
> >
> > static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
> > {
> > + return attach_task_by_pid(cgrp, pid, false);
> > +}
> > +
> > +static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
> > +{
> > int ret;
> > - if (!cgroup_lock_live_group(cgrp))
> > - return -ENODEV;
> > - ret = attach_task_by_pid(cgrp, pid);
> > - cgroup_unlock();
> > + do {
> > + /*
> > + * attach_proc fails with -EAGAIN if threadgroup leadership
> > + * changes in the middle of the operation, in which case we need
> > + * to find the task_struct for the new leader and start over.
> > + */
> > + ret = attach_task_by_pid(cgrp, tgid, true);
> > + } while (ret == -EAGAIN);
> > return ret;
> > }
> >
> > @@ -3168,9 +3498,9 @@ static struct cftype files[] = {
> > {
> > .name = CGROUP_FILE_GENERIC_PREFIX "procs",
> > .open = cgroup_procs_open,
> > - /* .write_u64 = cgroup_procs_write, TODO */
> > + .write_u64 = cgroup_procs_write,
> > .release = cgroup_pidlist_release,
> > - .mode = S_IRUGO,
> > + .mode = S_IRUGO | S_IWUSR,
> > },
> > {
> > .name = "notify_on_release",
> > diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
> > index ce71ed5..daf0249 100644
> > --- a/kernel/cgroup_freezer.c
> > +++ b/kernel/cgroup_freezer.c
> > @@ -190,6 +190,10 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
> > struct task_struct *c;
> >
> > rcu_read_lock();
> > + if (!thread_group_leader(task)) {
> > + rcu_read_unlock();
> > + return -EAGAIN;
> > + }
> > list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
> > if (is_task_frozen_enough(c)) {
> > rcu_read_unlock();
> > diff --git a/kernel/cpuset.c b/kernel/cpuset.c
> > index b23c097..3d7c978 100644
> > --- a/kernel/cpuset.c
> > +++ b/kernel/cpuset.c
> > @@ -1404,6 +1404,10 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
> > struct task_struct *c;
> >
> > rcu_read_lock();
> > + if (!thread_group_leader(tsk)) {
> > + rcu_read_unlock();
> > + return -EAGAIN;
> > + }
> > list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
> > ret = security_task_setscheduler(c, 0, NULL);
> > if (ret) {
> > diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
> > index 2a5dfec..ecd15d2 100644
> > --- a/kernel/ns_cgroup.c
> > +++ b/kernel/ns_cgroup.c
> > @@ -59,6 +59,10 @@ static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
> > if (threadgroup) {
> > struct task_struct *c;
> > rcu_read_lock();
> > + if (!thread_group_leader(task)) {
> > + rcu_read_unlock();
> > + return -EAGAIN;
> > + }
> > list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
> > if (!cgroup_is_descendant(new_cgroup, c)) {
> > rcu_read_unlock();
> > diff --git a/kernel/sched.c b/kernel/sched.c
> > index 70fa78d..df53f53 100644
> > --- a/kernel/sched.c
> > +++ b/kernel/sched.c
> > @@ -8721,6 +8721,10 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
> > if (threadgroup) {
> > struct task_struct *c;
> > rcu_read_lock();
> > + if (!thread_group_leader(tsk)) {
> > + rcu_read_unlock();
> > + return -EAGAIN;
> > + }
> > list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
> > retval = cpu_cgroup_can_attach_task(cgrp, c);
> > if (retval) {
>
>
> Thanks,
> -Kame
>
>
Thanks for having a full look over!
-- Ben
On Tue, Aug 03, 2010 at 08:44:01PM -0700, Paul Menage wrote:
> On Fri, Jul 30, 2010 at 4:57 PM, Ben Blum <[email protected]> wrote:
> > + * The threadgroup_fork_lock prevents threads from forking with
> > + * CLONE_THREAD while held for writing. Use this for fork-sensitive
> > + * threadgroup-wide operations. It's taken for reading in fork.c in
> > + * copy_process().
> > + * Currently only needed write-side by cgroups.
> > + */
> > + struct rw_semaphore threadgroup_fork_lock;
> > +#endif
>
> I'm not sure how best to word this comment, but I'd prefer something like:
>
> "The threadgroup_fork_lock is taken in read mode during a CLONE_THREAD
> fork operation; taking it in write mode prevents the owning
> threadgroup from adding any new threads and thus allows you to
> synchronize against the addition of unseen threads when performing
> threadgroup-wide operations. New-process forks (without CLONE_THREAD)
> are not affected."
That sounds good.
> As far as the #ifdef mess goes, it's true that some people don't have
> CONFIG_CGROUPS defined. I'd imagine that these are likely to be
> embedded systems with a fairly small number of processes and threads
> per process. Are there really any such platforms where the cost of a
> single extra rwsem per process is going to make a difference either in
> terms of memory or lock contention? I think you should consider making
> these additions unconditional.
That's certainly an option, but I think it would be clean enough to put
static inline functions just under the signal_struct definition.
Thoughts?
>
> Paul
>
-- Ben
On Tue, Aug 3, 2010 at 9:33 PM, Ben Blum <[email protected]> wrote:
>> As far as the #ifdef mess goes, it's true that some people don't have
>> CONFIG_CGROUPS defined. I'd imagine that these are likely to be
>> embedded systems with a fairly small number of processes and threads
>> per process. Are there really any such platforms where the cost of a
>> single extra rwsem per process is going to make a difference either in
>> terms of memory or lock contention? I think you should consider making
>> these additions unconditional.
>
> That's certainly an option, but I think it would be clean enough to put
> static inline functions just under the signal_struct definition.
Either sounds fine to me. I suspect others have a stronger opinion.
Paul
On Tue, Aug 03, 2010 at 09:30:00PM -0700, Paul Menage wrote:
> >> --- a/kernel/cpuset.c
> >> +++ b/kernel/cpuset.c
> >> @@ -1404,6 +1404,10 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
> >> 		struct task_struct *c;
> >>
> >> 		rcu_read_lock();
> >> +		if (!thread_group_leader(tsk)) {
> >> +			rcu_read_unlock();
> >> +			return -EAGAIN;
> >> +		}
>
> Why are you adding this requirement, here and in sched.c? (ns_cgroup.c
> doesn't matter since it's being deleted).
>
> Paul
It was either this or:
rcu_read_lock();
for_each_subsys(...) {
	can_attach(...);
}
rcu_read_unlock();
Which forces all can_attaches to not sleep. So by dropping
rcu_read_lock(), we allow the possibility of the exec race I described
in my last email, and therefore we have to check each time we re-acquire
rcu_read to iterate thread_group.
Yeah, it is not pretty. I call it "double-double-toil-and-trouble-check
locking". But it is safe.
-- Ben
On Tue, Aug 3, 2010 at 9:38 PM, Ben Blum <[email protected]> wrote:
>
> rcu_read_lock();
> for_each_subsys(...) {
> 	can_attach(...);
> }
> rcu_read_unlock();
Sorry, I was misreading this, and didn't notice that it was already
inside an "if (threadgroup) {}" test.
>
> Which forces all can_attaches to not sleep. So by dropping
> rcu_read_lock(), we allow the possibility of the exec race I described
> in my last email, and therefore we have to check each time we re-acquire
> rcu_read to iterate thread_group.
Agreed.
>
> Yeah, it is not pretty. I call it "double-double-toil-and-trouble-check
> locking". But it is safe.
As a cleanup, I'd be inclined to have a wrapper in cgroup.c, something like
cgroup_can_attach_threadgroup(struct cgroup_subsys *ss, struct cgroup
*cg, struct task_struct *leader, int (*cb)(struct task_struct *t,
struct cgroup *cg))
which handles the RCU section, checking thread_group_leader(), and
looping through each thread. Then the subsystem just has to define a
callback which will be called for each thread.
Paul
> As far as the #ifdef mess goes, it's true that some people don't have
> CONFIG_CGROUPS defined. I'd imagine that these are likely to be
> embedded systems with a fairly small number of processes and threads
> per process. Are there really any such platforms where the cost of a
> single extra rwsem per process is going to make a difference either in
> terms of memory or lock contention? I think you should consider making
> these additions unconditional.
openSUSE's default kernel* doesn't have CONFIG_CGROUPS
Personally I think it's a silly mistake too: the argument was
performance, yet ubuntu's desktop kernel has it and actually outperforms
openSUSE's, and the feature is perfectly likely to be needed by
"desktop" users. But I wasn't and still am not consulted on this, so
it's a fact that must be lived with at least for a while. ;) (at least
until the majority of 11.2 and 11.3 installations are replaced due to
age, and that's if they reverted the decision today, which so far they
haven't)
*("kernel-desktop" the one installed by default, not "kernel-default",
which exists but is not installed by default since openSUSE 11.2)
--
bkw
On Tue, Aug 03, 2010 at 09:34:22PM -0700, Paul Menage wrote:
> On Tue, Aug 3, 2010 at 9:33 PM, Ben Blum <[email protected]> wrote:
> >> As far as the #ifdef mess goes, it's true that some people don't have
> >> CONFIG_CGROUPS defined. I'd imagine that these are likely to be
> >> embedded systems with a fairly small number of processes and threads
> >> per process. Are there really any such platforms where the cost of a
> >> single extra rwsem per process is going to make a difference either in
> >> terms of memory or lock contention? I think you should consider making
> >> these additions unconditional.
> >
> > That's certainly an option, but I think it would be clean enough to put
> > static inline functions just under the signal_struct definition.
>
> Either sounds fine to me. I suspect others have a stronger opinion.
>
> Paul
>
Any other votes? One set of static inline functions (I'd call them
threadgroup_fork_{read,write}_{un,}lock) or just remove the ifdefs
entirely? I'm inclined to go with the former.
-- Ben
On Fri, 6 Aug 2010 02:02:24 -0400
Ben Blum <[email protected]> wrote:
> On Tue, Aug 03, 2010 at 09:34:22PM -0700, Paul Menage wrote:
> > On Tue, Aug 3, 2010 at 9:33 PM, Ben Blum <[email protected]> wrote:
> > >> As far as the #ifdef mess goes, it's true that some people don't have
> > >> CONFIG_CGROUPS defined. I'd imagine that these are likely to be
> > >> embedded systems with a fairly small number of processes and threads
> > >> per process. Are there really any such platforms where the cost of a
> > >> single extra rwsem per process is going to make a difference either in
> > >> terms of memory or lock contention? I think you should consider making
> > >> these additions unconditional.
> > >
> > > That's certainly an option, but I think it would be clean enough to put
> > > static inline functions just under the signal_struct definition.
> >
> > Either sounds fine to me. I suspect others have a stronger opinion.
> >
> > Paul
> >
>
> Any other votes? One set of static inline functions (I'd call them
> threadgroup_fork_{read,write}_{un,}lock) or just remove the ifdefs
> entirely? I'm inclined to go with the former.
>
I vote for the former. The #ifdef can easily be removed if someone finds it
useful for another purpose... and static inline functions are the usual way.
Thanks,
-Kame
On Fri, Jul 30, 2010 at 07:56:49PM -0400, Ben Blum wrote:
> This patch series is a revision of http://lkml.org/lkml/2010/6/25/11 .
>
> This patch series implements a write function for the 'cgroup.procs'
> per-cgroup file, which enables atomic movement of multithreaded
> applications between cgroups. Writing the thread-ID of any thread in a
> threadgroup to a cgroup's procs file causes all threads in the group to
> be moved to that cgroup safely with respect to threads forking/exiting.
> (Possible usage scenario: If running a multithreaded build system that
> sucks up system resources, this lets you restrict it all at once into a
> new cgroup to keep it under control.)
>
> Example: Suppose pid 31337 clones new threads 31338 and 31339.
>
> # cat /dev/cgroup/tasks
> ...
> 31337
> 31338
> 31339
> # mkdir /dev/cgroup/foo
> # echo 31337 > /dev/cgroup/foo/cgroup.procs
> # cat /dev/cgroup/foo/tasks
> 31337
> 31338
> 31339
>
> A new lock, called threadgroup_fork_lock and living in signal_struct, is
> introduced to ensure atomicity when moving threads between cgroups. It's
> taken for writing during the operation, and taking for reading in fork()
> around the calls to cgroup_fork() and cgroup_post_fork(). I put calls to
> down_read/up_read directly in copy_process(), since new inline functions
> seemed like overkill.
>
> -- Ben
>
> ---
> Documentation/cgroups/cgroups.txt | 13 -
> include/linux/init_task.h | 9
> include/linux/sched.h | 10
> kernel/cgroup.c | 426 +++++++++++++++++++++++++++++++++-----
> kernel/cgroup_freezer.c | 4
> kernel/cpuset.c | 4
> kernel/fork.c | 16 +
> kernel/ns_cgroup.c | 4
> kernel/sched.c | 4
> 9 files changed, 440 insertions(+), 50 deletions(-)
Here's an updated patchset. I've added an extra patch to implement the
callback scheme Paul suggested (note how there are twice as many deleted
lines of code as before :) ), and also moved the up_read/down_read calls
to static inline functions in sched.h near the other threadgroup-related
calls.
---
Documentation/cgroups/cgroups.txt | 13 -
include/linux/cgroup.h | 12
include/linux/init_task.h | 9
include/linux/sched.h | 35 ++
kernel/cgroup.c | 459 ++++++++++++++++++++++++++++++++++----
kernel/cgroup_freezer.c | 27 --
kernel/cpuset.c | 20 -
kernel/fork.c | 10
kernel/ns_cgroup.c | 27 +-
kernel/sched.c | 21 -
10 files changed, 526 insertions(+), 107 deletions(-)
Adds functionality to read/write lock CLONE_THREAD fork()ing per-threadgroup
From: Ben Blum <[email protected]>
This patch adds an rwsem that lives in a threadgroup's signal_struct that's
taken for reading in the fork path, under CONFIG_CGROUPS. If another part of
the kernel later wants to use such a locking mechanism, the CONFIG_CGROUPS
ifdefs should be changed to a higher-up flag that CGROUPS and the other system
would both depend on.
This is a pre-patch for cgroup-procs-write.patch.
Signed-off-by: Ben Blum <[email protected]>
---
include/linux/init_task.h | 9 +++++++++
include/linux/sched.h | 35 +++++++++++++++++++++++++++++++++++
kernel/fork.c | 10 ++++++++++
3 files changed, 54 insertions(+), 0 deletions(-)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1f43fa5..ca46711 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -15,6 +15,14 @@
extern struct files_struct init_files;
extern struct fs_struct init_fs;
+#ifdef CONFIG_CGROUPS
+#define INIT_THREADGROUP_FORK_LOCK(sig) \
+ .threadgroup_fork_lock = \
+ __RWSEM_INITIALIZER(sig.threadgroup_fork_lock),
+#else
+#define INIT_THREADGROUP_FORK_LOCK(sig)
+#endif
+
#define INIT_SIGNALS(sig) { \
.nr_threads = 1, \
.wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\
@@ -29,6 +37,7 @@ extern struct fs_struct init_fs;
.running = 0, \
.lock = __SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \
}, \
+ INIT_THREADGROUP_FORK_LOCK(sig) \
}
extern struct nsproxy init_nsproxy;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ae69716..ebd4af2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -619,6 +619,16 @@ struct signal_struct {
unsigned audit_tty;
struct tty_audit_buf *tty_audit_buf;
#endif
+#ifdef CONFIG_CGROUPS
+ /*
+ * The threadgroup_fork_lock prevents threads from forking with
+ * CLONE_THREAD while held for writing. Use this for fork-sensitive
+ * threadgroup-wide operations. It's taken for reading in fork.c in
+ * copy_process().
+ * Currently only needed write-side by cgroups.
+ */
+ struct rw_semaphore threadgroup_fork_lock;
+#endif
int oom_adj; /* OOM kill score adjustment (bit shift) */
};
@@ -2216,6 +2226,31 @@ static inline void unlock_task_sighand(struct task_struct *tsk,
spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
}
+/* See the declaration of threadgroup_fork_lock in signal_struct. */
+#ifdef CONFIG_CGROUPS
+static inline void threadgroup_fork_read_lock(struct task_struct *tsk)
+{
+ down_read(&tsk->signal->threadgroup_fork_lock);
+}
+static inline void threadgroup_fork_read_unlock(struct task_struct *tsk)
+{
+ up_read(&tsk->signal->threadgroup_fork_lock);
+}
+static inline void threadgroup_fork_write_lock(struct task_struct *tsk)
+{
+ down_write(&tsk->signal->threadgroup_fork_lock);
+}
+static inline void threadgroup_fork_write_unlock(struct task_struct *tsk)
+{
+ up_write(&tsk->signal->threadgroup_fork_lock);
+}
+#else
+static inline void threadgroup_fork_read_lock(struct task_struct *tsk) {}
+static inline void threadgroup_fork_read_unlock(struct task_struct *tsk) {}
+static inline void threadgroup_fork_write_lock(struct task_struct *tsk) {}
+static inline void threadgroup_fork_write_unlock(struct task_struct *tsk) {}
+#endif
+
#ifndef __HAVE_THREAD_FUNCTIONS
#define task_thread_info(task) ((struct thread_info *)(task)->stack)
diff --git a/kernel/fork.c b/kernel/fork.c
index a82a65c..41df253 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -898,6 +898,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
tty_audit_fork(sig);
+#ifdef CONFIG_CGROUPS
+ init_rwsem(&sig->threadgroup_fork_lock);
+#endif
+
sig->oom_adj = current->signal->oom_adj;
return 0;
@@ -1076,6 +1080,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
monotonic_to_bootbased(&p->real_start_time);
p->io_context = NULL;
p->audit_context = NULL;
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_fork_read_lock(current);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
@@ -1283,6 +1289,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
cgroup_post_fork(p);
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_fork_read_unlock(current);
perf_event_fork(p);
return p;
@@ -1316,6 +1324,8 @@ bad_fork_cleanup_policy:
mpol_put(p->mempolicy);
bad_fork_cleanup_cgroup:
#endif
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_fork_read_unlock(current);
cgroup_exit(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
module_put(task_thread_info(p)->exec_domain->module);
Add cgroup wrapper for safely calling can_attach on all threads in a threadgroup
From: Ben Blum <[email protected]>
This patch adds a function cgroup_can_attach_per_thread which handles iterating
over each thread in a threadgroup safely with respect to the invariants that
will be used in cgroup_attach_proc. Also, subsystems whose can_attach calls
require per-thread validation are modified to use the per_thread wrapper to
avoid duplicating cgroup-internal code.
This is a pre-patch for cgroup-procs-writable.patch.
Signed-off-by: Ben Blum <[email protected]>
---
include/linux/cgroup.h | 12 ++++++++++++
kernel/cgroup.c | 35 +++++++++++++++++++++++++++++++++++
kernel/cgroup_freezer.c | 27 ++++++++++++---------------
kernel/cpuset.c | 20 +++++++-------------
kernel/ns_cgroup.c | 27 +++++++++++++--------------
kernel/sched.c | 21 ++++++---------------
6 files changed, 85 insertions(+), 57 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e3d00fd..f040d66 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -580,6 +580,18 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan);
int cgroup_attach_task(struct cgroup *, struct task_struct *);
/*
+ * For use in subsystems whose can_attach functions need to run an operation
+ * on every task in the threadgroup. Calls the given callback once if the
+ * 'threadgroup' flag is false, or once per thread in the group if true.
+ * The callback should return 0/-ERR; this will return 0/-ERR.
+ * The callback will run within an rcu_read section, so must not sleep.
+ */
+int cgroup_can_attach_per_thread(struct cgroup *cgrp, struct task_struct *task,
+ int (*cb)(struct cgroup *cgrp,
+ struct task_struct *task),
+ bool threadgroup);
+
+/*
* CSS ID is ID for cgroup_subsys_state structs under subsys. This only works
* if cgroup_subsys.use_id == true. It can be used for looking up and scanning.
* CSS ID is assigned at cgroup allocation (create) automatically
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f91d7dd..e8b8f71 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1688,6 +1688,41 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
}
EXPORT_SYMBOL_GPL(cgroup_path);
+int cgroup_can_attach_per_thread(struct cgroup *cgrp, struct task_struct *task,
+ int (*cb)(struct cgroup *cgrp,
+ struct task_struct *task),
+ bool threadgroup)
+{
+ /* Start by running on the leader, in all cases. */
+ int ret = cb(cgrp, task);
+ if (ret < 0)
+ return ret;
+
+ if (threadgroup) {
+ /* Run on each task in the threadgroup. */
+ struct task_struct *c;
+ rcu_read_lock();
+ /*
+ * It is necessary for the given task to still be the leader
+ * to safely traverse thread_group. See cgroup_attach_proc.
+ */
+ if (!thread_group_leader(task)) {
+ rcu_read_unlock();
+ return -EAGAIN;
+ }
+ list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
+ ret = cb(cgrp, c);
+ if (ret < 0) {
+ rcu_read_unlock();
+ return ret;
+ }
+ }
+ rcu_read_unlock();
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(cgroup_can_attach_per_thread);
+
/**
* cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
* @cgrp: the cgroup the task is attaching to
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index ce71ed5..677b24e 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -161,6 +161,13 @@ static bool is_task_frozen_enough(struct task_struct *task)
(task_is_stopped_or_traced(task) && freezing(task));
}
+static int freezer_can_attach_cb(struct cgroup *cgrp, struct task_struct *task)
+{
+ if (is_task_frozen_enough(task))
+ return -EBUSY;
+ return 0;
+}
+
/*
* The call to cgroup_lock() in the freezer.state write method prevents
* a write to that file racing against an attach, and hence the
@@ -171,6 +178,7 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
struct task_struct *task, bool threadgroup)
{
struct freezer *freezer;
+ int ret;
/*
* Anything frozen can't move or be moved to/from.
@@ -179,26 +187,15 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
* frozen, so it's sufficient to check the latter condition.
*/
- if (is_task_frozen_enough(task))
- return -EBUSY;
+ ret = cgroup_can_attach_per_thread(new_cgroup, task,
+ freezer_can_attach_cb, threadgroup);
+ if (ret < 0)
+ return ret;
freezer = cgroup_freezer(new_cgroup);
if (freezer->state == CGROUP_FROZEN)
return -EBUSY;
- if (threadgroup) {
- struct task_struct *c;
-
- rcu_read_lock();
- list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
- if (is_task_frozen_enough(c)) {
- rcu_read_unlock();
- return -EBUSY;
- }
- }
- rcu_read_unlock();
- }
-
return 0;
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b23c097..cc4b1f7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1376,6 +1376,11 @@ static int fmeter_getrate(struct fmeter *fmp)
/* Protected by cgroup_lock */
static cpumask_var_t cpus_attach;
+static int cpuset_can_attach_cb(struct cgroup *cgrp, struct task_struct *task)
+{
+ return security_task_setscheduler(task, 0, NULL);
+}
+
/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
struct task_struct *tsk, bool threadgroup)
@@ -1397,22 +1402,11 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
if (tsk->flags & PF_THREAD_BOUND)
return -EINVAL;
- ret = security_task_setscheduler(tsk, 0, NULL);
+ ret = cgroup_can_attach_per_thread(cont, tsk, cpuset_can_attach_cb,
+ threadgroup);
if (ret)
return ret;
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- ret = security_task_setscheduler(c, 0, NULL);
- if (ret) {
- rcu_read_unlock();
- return ret;
- }
- }
- rcu_read_unlock();
- }
return 0;
}
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 2a5dfec..af0accf 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -42,9 +42,18 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
* (hence either you are in the same cgroup as task, or in an
* ancestor cgroup thereof)
*/
+static int ns_can_attach_cb(struct cgroup *new_cgroup, struct task_struct *task)
+{
+ if (!cgroup_is_descendant(new_cgroup, task))
+ return -EPERM;
+ return 0;
+}
+
static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
struct task_struct *task, bool threadgroup)
{
+ int ret;
+
if (current != task) {
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -53,20 +62,10 @@ static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
return -EPERM;
}
- if (!cgroup_is_descendant(new_cgroup, task))
- return -EPERM;
-
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
- if (!cgroup_is_descendant(new_cgroup, c)) {
- rcu_read_unlock();
- return -EPERM;
- }
- }
- rcu_read_unlock();
- }
+ ret = cgroup_can_attach_per_thread(new_cgroup, task, ns_can_attach_cb,
+ threadgroup);
+ if (ret < 0)
+ return ret;
return 0;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index 70fa78d..8330e6f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8715,21 +8715,12 @@ static int
cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct task_struct *tsk, bool threadgroup)
{
- int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
- if (retval)
- return retval;
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- retval = cpu_cgroup_can_attach_task(cgrp, c);
- if (retval) {
- rcu_read_unlock();
- return retval;
- }
- }
- rcu_read_unlock();
- }
+ int ret = cgroup_can_attach_per_thread(cgrp, tsk,
+ cpu_cgroup_can_attach_task,
+ threadgroup);
+ if (ret)
+ return ret;
+
return 0;
}
Makes procs file writable to move all threads by tgid at once
From: Ben Blum <[email protected]>
This patch adds functionality that enables users to move all threads in a
threadgroup at once to a cgroup by writing the tgid to the 'cgroup.procs'
file. This current implementation makes use of a per-threadgroup rwsem that's
taken for reading in the fork() path to prevent newly forking threads within
the threadgroup from "escaping" while the move is in progress.
Signed-off-by: Ben Blum <[email protected]>
---
Documentation/cgroups/cgroups.txt | 13 +
kernel/cgroup.c | 424 +++++++++++++++++++++++++++++++++----
2 files changed, 387 insertions(+), 50 deletions(-)
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index b34823f..5f3c707 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -235,7 +235,8 @@ containing the following files describing that cgroup:
- cgroup.procs: list of tgids in the cgroup. This list is not
guaranteed to be sorted or free of duplicate tgids, and userspace
should sort/uniquify the list if this property is required.
- This is a read-only file, for now.
+ Writing a thread group id into this file moves all threads in that
+ group into this cgroup.
- notify_on_release flag: run the release agent on exit?
- release_agent: the path to use for release notifications (this file
exists in the top cgroup only)
@@ -416,6 +417,12 @@ You can attach the current shell task by echoing 0:
# echo 0 > tasks
+You can use the cgroup.procs file instead of the tasks file to move all
+threads in a threadgroup at once. Echoing the pid of any task in a
+threadgroup to cgroup.procs causes all tasks in that threadgroup to be
+attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
+in the writing task's threadgroup.
+
2.3 Mounting hierarchies by name
--------------------------------
@@ -564,7 +571,9 @@ called on a fork. If this method returns 0 (success) then this should
remain valid while the caller holds cgroup_mutex and it is ensured that either
attach() or cancel_attach() will be called in future. If threadgroup is
true, then a successful result indicates that all threads in the given
-thread's threadgroup can be moved together.
+thread's threadgroup can be moved together. If the subsystem wants to
+iterate over task->thread_group, it must take rcu_read_lock then check
+if thread_group_leader(task), returning -EAGAIN if that fails.
void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct task_struct *task, bool threadgroup)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e8b8f71..586dbb7 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1723,6 +1723,76 @@ int cgroup_can_attach_per_thread(struct cgroup *cgrp, struct task_struct *task,
}
EXPORT_SYMBOL_GPL(cgroup_can_attach_per_thread);
+/*
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ *
+ * 'guarantee' is set if the caller promises that a new css_set for the task
+ * will already exist. If not set, this function might sleep, and can fail with
+ * -ENOMEM. Otherwise, it can only fail with -ESRCH.
+ */
+static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+ struct task_struct *tsk, bool guarantee)
+{
+ struct css_set *oldcg;
+ struct css_set *newcg;
+
+ /*
+ * get old css_set. we need to take task_lock and refcount it, because
+ * an exiting task can change its css_set to init_css_set and drop its
+ * old one without taking cgroup_mutex.
+ */
+ task_lock(tsk);
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
+ task_unlock(tsk);
+
+ /* locate or allocate a new css_set for this task. */
+ if (guarantee) {
+ /* we know the css_set we want already exists. */
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(oldcg, cgrp, template);
+ BUG_ON(!newcg);
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+ } else {
+ might_sleep();
+ /* find_css_set will give us newcg already referenced. */
+ newcg = find_css_set(oldcg, cgrp);
+ if (!newcg) {
+ put_css_set(oldcg);
+ return -ENOMEM;
+ }
+ }
+ put_css_set(oldcg);
+
+ /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ task_unlock(tsk);
+ put_css_set(newcg);
+ return -ESRCH;
+ }
+ rcu_assign_pointer(tsk->cgroups, newcg);
+ task_unlock(tsk);
+
+ /* Update the css_set linked lists if we're using them */
+ write_lock(&css_set_lock);
+ if (!list_empty(&tsk->cg_list))
+ list_move(&tsk->cg_list, &newcg->tasks);
+ write_unlock(&css_set_lock);
+
+ /*
+ * We just gained a reference on oldcg by taking it from the task. As
+ * trading it for newcg is protected by cgroup_mutex, we're safe to drop
+ * it here; it will be freed under RCU.
+ */
+ put_css_set(oldcg);
+
+ set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+ return 0;
+}
+
/**
* cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
* @cgrp: the cgroup the task is attaching to
@@ -1733,11 +1803,9 @@ EXPORT_SYMBOL_GPL(cgroup_can_attach_per_thread);
*/
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
- int retval = 0;
+ int retval;
struct cgroup_subsys *ss, *failed_ss = NULL;
struct cgroup *oldcgrp;
- struct css_set *cg;
- struct css_set *newcg;
struct cgroupfs_root *root = cgrp->root;
/* Nothing to do if the task is already in that cgroup */
@@ -1761,46 +1829,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
}
}
- task_lock(tsk);
- cg = tsk->cgroups;
- get_css_set(cg);
- task_unlock(tsk);
- /*
- * Locate or allocate a new css_set for this task,
- * based on its final set of cgroups
- */
- newcg = find_css_set(cg, cgrp);
- put_css_set(cg);
- if (!newcg) {
- retval = -ENOMEM;
- goto out;
- }
-
- task_lock(tsk);
- if (tsk->flags & PF_EXITING) {
- task_unlock(tsk);
- put_css_set(newcg);
- retval = -ESRCH;
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
+ if (retval)
goto out;
- }
- rcu_assign_pointer(tsk->cgroups, newcg);
- task_unlock(tsk);
-
- /* Update the css_set linked lists if we're using them */
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list)) {
- list_del(&tsk->cg_list);
- list_add(&tsk->cg_list, &newcg->tasks);
- }
- write_unlock(&css_set_lock);
for_each_subsys(root, ss) {
if (ss->attach)
ss->attach(ss, cgrp, oldcgrp, tsk, false);
}
- set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+
synchronize_rcu();
- put_css_set(cg);
/*
* wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -1826,49 +1864,339 @@ out:
}
/*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * held. May take task_lock of task
+ * cgroup_attach_proc works in two stages, the first of which prefetches all
+ * new css_sets needed (to make sure we have enough memory before committing
+ * to the move) and stores them in a list of entries of the following type.
+ * TODO: possible optimization: use css_set->rcu_head for chaining instead
+ */
+struct cg_list_entry {
+ struct css_set *cg;
+ struct list_head links;
+};
+
+static bool css_set_check_fetched(struct cgroup *cgrp,
+ struct task_struct *tsk, struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(cg, cgrp, template);
+ if (newcg)
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+
+ /* doesn't exist at all? */
+ if (!newcg)
+ return false;
+ /* see if it's already in the list */
+ list_for_each_entry(cg_entry, newcg_list, links) {
+ if (cg_entry->cg == newcg) {
+ put_css_set(newcg);
+ return true;
+ }
+ }
+
+ /* not found */
+ put_css_set(newcg);
+ return false;
+}
+
+/*
+ * Find the new css_set and store it in the list in preparation for moving the
+ * given task to the given cgroup. Returns 0 or -ENOMEM.
*/
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+
+ /* ensure a new css_set will exist for this thread */
+ newcg = find_css_set(cg, cgrp);
+ if (!newcg)
+ return -ENOMEM;
+ /* add it to the list */
+ cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
+ if (!cg_entry) {
+ put_css_set(newcg);
+ return -ENOMEM;
+ }
+ cg_entry->cg = newcg;
+ list_add(&cg_entry->links, newcg_list);
+ return 0;
+}
+
+/**
+ * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * @cgrp: the cgroup to attach to
+ * @leader: the threadgroup leader task_struct of the group to be attached
+ *
+ * Call holding cgroup_mutex. Will take task_lock of each thread in leader's
+ * threadgroup individually in turn.
+ */
+int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+{
+ int retval;
+ struct cgroup_subsys *ss, *failed_ss = NULL;
+ struct cgroup *oldcgrp;
+ struct css_set *oldcg;
+ struct cgroupfs_root *root = cgrp->root;
+ /* threadgroup list cursor */
+ struct task_struct *tsk;
+ /*
+ * we need to make sure we have css_sets for all the tasks we're
+ * going to move -before- we actually start moving them, so that in
+ * case we get an ENOMEM we can bail out before making any changes.
+ */
+ struct list_head newcg_list;
+ struct cg_list_entry *cg_entry, *temp_nobe;
+
+ /* check that we can legitimately attach to the cgroup. */
+ for_each_subsys(root, ss) {
+ if (ss->can_attach) {
+ retval = ss->can_attach(ss, cgrp, leader, true);
+ if (retval) {
+ failed_ss = ss;
+ goto out;
+ }
+ }
+ }
+
+ /*
+ * step 1: make sure css_sets exist for all threads to be migrated.
+ * we use find_css_set, which allocates a new one if necessary.
+ */
+ INIT_LIST_HEAD(&newcg_list);
+ oldcgrp = task_cgroup_from_root(leader, root);
+ if (cgrp != oldcgrp) {
+ /* get old css_set */
+ task_lock(leader);
+ if (leader->flags & PF_EXITING) {
+ task_unlock(leader);
+ goto prefetch_loop;
+ }
+ oldcg = leader->cgroups;
+ get_css_set(oldcg);
+ task_unlock(leader);
+ /* acquire new one */
+ retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+ put_css_set(oldcg);
+ if (retval)
+ goto list_teardown;
+ }
+prefetch_loop:
+ rcu_read_lock();
+ /* sanity check - if we raced with de_thread, we must abort */
+ if (!thread_group_leader(leader)) {
+ retval = -EAGAIN;
+ goto list_teardown;
+ }
+ /*
+ * if we need to fetch a new css_set for this task, we must exit the
+ * rcu_read section because allocating it can sleep. afterwards, we'll
+ * need to restart iteration on the threadgroup list - the whole thing
+ * will be O(nm) in the number of threads and css_sets; as the typical
+ * case has only one css_set for all of them, usually O(n). which ones
+ * we need allocated won't change as long as we hold cgroup_mutex.
+ */
+ list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
+ /* nothing to do if this task is already in the cgroup */
+ oldcgrp = task_cgroup_from_root(tsk, root);
+ if (cgrp == oldcgrp)
+ continue;
+ /* get old css_set pointer */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ /* ignore this task if it's going away */
+ task_unlock(tsk);
+ continue;
+ }
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
+ task_unlock(tsk);
+ /* see if the new one for us is already in the list? */
+ if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
+ /* was already there, nothing to do. */
+ put_css_set(oldcg);
+ } else {
+ /* we don't already have it. get new one. */
+ rcu_read_unlock();
+ retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+ put_css_set(oldcg);
+ if (retval)
+ goto list_teardown;
+ /* begin iteration again. */
+ goto prefetch_loop;
+ }
+ }
+ rcu_read_unlock();
+
+ /*
+ * step 2: now that we're guaranteed success wrt the css_sets, proceed
+ * to move all tasks to the new cgroup. we need to lock against possible
+ * races with fork(). note: we can safely take the threadgroup_fork_lock
+ * of leader since attach_task_by_pid took a reference.
+ * threadgroup_fork_lock must be taken outside of tasklist_lock to match
+ * the order in the fork path.
+ */
+ threadgroup_fork_write_lock(leader);
+ read_lock(&tasklist_lock);
+ /* sanity check - if we raced with de_thread, we must abort */
+ if (!thread_group_leader(leader)) {
+ retval = -EAGAIN;
+ read_unlock(&tasklist_lock);
+ threadgroup_fork_write_unlock(leader);
+ goto list_teardown;
+ }
+ /*
+ * No failure cases left, so this is the commit point.
+ *
+ * If the leader is already there, skip moving him. Note: even if the
+ * leader is PF_EXITING, we still move all other threads; if everybody
+ * is PF_EXITING, we end up doing nothing, which is ok.
+ */
+ oldcgrp = task_cgroup_from_root(leader, root);
+ if (cgrp != oldcgrp) {
+ retval = cgroup_task_migrate(cgrp, oldcgrp, leader, true);
+ BUG_ON(retval != 0 && retval != -ESRCH);
+ }
+ /* Now iterate over each thread in the group. */
+ list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
+ /* leave current thread as it is if it's already there */
+ oldcgrp = task_cgroup_from_root(tsk, root);
+ if (cgrp == oldcgrp)
+ continue;
+ /* we don't care whether these threads are exiting */
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
+ BUG_ON(retval != 0 && retval != -ESRCH);
+ }
+
+ /*
+ * step 3: attach whole threadgroup to each subsystem
+ * TODO: if ever a subsystem needs to know the oldcgrp for each task
+ * being moved, this call will need to be reworked to communicate that.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->attach)
+ ss->attach(ss, cgrp, oldcgrp, leader, true);
+ }
+ /* holding these until here keeps us safe from exec() and fork(). */
+ read_unlock(&tasklist_lock);
+ threadgroup_fork_write_unlock(leader);
+
+ /*
+ * step 4: success! and cleanup
+ */
+ synchronize_rcu();
+ cgroup_wakeup_rmdir_waiter(cgrp);
+ retval = 0;
+list_teardown:
+ /* clean up the list of prefetched css_sets. */
+ list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
+ list_del(&cg_entry->links);
+ put_css_set(cg_entry->cg);
+ kfree(cg_entry);
+ }
+out:
+ if (retval) {
+ /* same deal as in cgroup_attach_task, with threadgroup=true */
+ for_each_subsys(root, ss) {
+ if (ss == failed_ss)
+ break;
+ if (ss->cancel_attach)
+ ss->cancel_attach(ss, cgrp, leader, true);
+ }
+ }
+ return retval;
+}
+
+/*
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will take
+ * cgroup_mutex; may take task_lock of task.
+ */
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
{
struct task_struct *tsk;
const struct cred *cred = current_cred(), *tcred;
int ret;
+ if (!cgroup_lock_live_group(cgrp))
+ return -ENODEV;
+
if (pid) {
rcu_read_lock();
tsk = find_task_by_vpid(pid);
- if (!tsk || tsk->flags & PF_EXITING) {
+ if (!tsk) {
+ rcu_read_unlock();
+ cgroup_unlock();
+ return -ESRCH;
+ }
+ if (threadgroup) {
+ /*
+ * it is safe to find group_leader because tsk was found
+ * in the tid map, meaning it can't have been unhashed
+ * by someone in de_thread changing the leadership.
+ */
+ tsk = tsk->group_leader;
+ BUG_ON(!thread_group_leader(tsk));
+ } else if (tsk->flags & PF_EXITING) {
+ /* optimization for the single-task-only case */
rcu_read_unlock();
+ cgroup_unlock();
return -ESRCH;
}
+ /*
+ * even if we're attaching all tasks in the thread group, we
+ * only need to check permissions on one of them.
+ */
tcred = __task_cred(tsk);
if (cred->euid &&
cred->euid != tcred->uid &&
cred->euid != tcred->suid) {
rcu_read_unlock();
+ cgroup_unlock();
return -EACCES;
}
get_task_struct(tsk);
rcu_read_unlock();
} else {
- tsk = current;
+ if (threadgroup)
+ tsk = current->group_leader;
+ else
+ tsk = current;
get_task_struct(tsk);
}
- ret = cgroup_attach_task(cgrp, tsk);
+ if (threadgroup)
+ ret = cgroup_attach_proc(cgrp, tsk);
+ else
+ ret = cgroup_attach_task(cgrp, tsk);
put_task_struct(tsk);
+ cgroup_unlock();
return ret;
}
static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
{
+ return attach_task_by_pid(cgrp, pid, false);
+}
+
+static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+{
int ret;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
- ret = attach_task_by_pid(cgrp, pid);
- cgroup_unlock();
+ do {
+ /*
+ * attach_proc fails with -EAGAIN if threadgroup leadership
+ * changes in the middle of the operation, in which case we need
+ * to find the task_struct for the new leader and start over.
+ */
+ ret = attach_task_by_pid(cgrp, tgid, true);
+ } while (ret == -EAGAIN);
return ret;
}
@@ -3203,9 +3531,9 @@ static struct cftype files[] = {
{
.name = CGROUP_FILE_GENERIC_PREFIX "procs",
.open = cgroup_procs_open,
- /* .write_u64 = cgroup_procs_write, TODO */
+ .write_u64 = cgroup_procs_write,
.release = cgroup_pidlist_release,
- .mode = S_IRUGO,
+ .mode = S_IRUGO | S_IWUSR,
},
{
.name = "notify_on_release",
On Tue, Aug 10, 2010 at 10:48 PM, Ben Blum <[email protected]> wrote:
>
> Add cgroup wrapper for safely calling can_attach on all threads in a threadgroup
>
> From: Ben Blum <[email protected]>
>
> This patch adds a function cgroup_can_attach_per_thread which handles iterating
> over each thread in a threadgroup safely with respect to the invariants that
> will be used in cgroup_attach_proc. Also, subsystems whose can_attach calls
> require per-thread validation are modified to use the per_thread wrapper to
> avoid duplicating cgroup-internal code.
>
> This is a pre-patch for cgroup-procs-writable.patch.
>
> Signed-off-by: Ben Blum <[email protected]>
Acked-by: Paul Menage <[email protected]>
Some of the can_attach() methods could be simplified slightly by
directly returning the result of cgroup_can_attach_per_thread().
Paul
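For illustration, the simplification Paul suggests amounts to returning the
wrapper's result directly instead of testing it and then returning 0. A minimal
sketch against the freezer case, assuming the form of the wrapper with the
need_rcu argument posted later in this thread (this is not part of the actual
patches):

static int freezer_can_attach(struct cgroup_subsys *ss,
			      struct cgroup *new_cgroup,
			      struct task_struct *task, bool threadgroup)
{
	struct freezer *freezer = cgroup_freezer(new_cgroup);

	/* Anything frozen can't move or be moved to/from. */
	if (freezer->state != CGROUP_THAWED)
		return -EBUSY;

	/*
	 * Return the per-thread check's result directly; need_rcu is true
	 * because the freezer callback reads RCU-protected state even in
	 * the single-task case.
	 */
	return cgroup_can_attach_per_thread(new_cgroup, task,
					    freezer_can_attach_cb,
					    threadgroup, true);
}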
On Tue, Aug 10, 2010 at 10:47 PM, Ben Blum <[email protected]> wrote:
>
>
> Adds functionality to read/write lock CLONE_THREAD fork()ing per-threadgroup
>
> From: Ben Blum <[email protected]>
>
> This patch adds an rwsem that lives in a threadgroup's signal_struct that's
> taken for reading in the fork path, under CONFIG_CGROUPS. If another part of
> the kernel later wants to use such a locking mechanism, the CONFIG_CGROUPS
> ifdefs should be changed to a higher-up flag that CGROUPS and the other system
> would both depend on.
>
> This is a pre-patch for cgroup-procs-write.patch.
>
> Signed-off-by: Ben Blum <[email protected]>
Acked-by: Paul Menage <[email protected]>
Paul
On Tue, Aug 10, 2010 at 10:48 PM, Ben Blum <[email protected]> wrote:
>
>
> Makes procs file writable to move all threads by tgid at once
>
> From: Ben Blum <[email protected]>
>
> This patch adds functionality that enables users to move all threads in a
> threadgroup at once to a cgroup by writing the tgid to the 'cgroup.procs'
> file. This current implementation makes use of a per-threadgroup rwsem that's
> taken for reading in the fork() path to prevent newly forking threads within
> the threadgroup from "escaping" while the move is in progress.
>
> Signed-off-by: Ben Blum <[email protected]>
Reviewed-by: Paul Menage <[email protected]>
On Wed, Aug 11, 2010 at 01:46:04AM -0400, Ben Blum wrote:
> On Fri, Jul 30, 2010 at 07:56:49PM -0400, Ben Blum wrote:
> > This patch series is a revision of http://lkml.org/lkml/2010/6/25/11 .
> >
> > This patch series implements a write function for the 'cgroup.procs'
> > per-cgroup file, which enables atomic movement of multithreaded
> > applications between cgroups. Writing the thread-ID of any thread in a
> > threadgroup to a cgroup's procs file causes all threads in the group to
> > be moved to that cgroup safely with respect to threads forking/exiting.
> > (Possible usage scenario: If running a multithreaded build system that
> > sucks up system resources, this lets you restrict it all at once into a
> > new cgroup to keep it under control.)
> >
> > Example: Suppose pid 31337 clones new threads 31338 and 31339.
> >
> > # cat /dev/cgroup/tasks
> > ...
> > 31337
> > 31338
> > 31339
> > # mkdir /dev/cgroup/foo
> > # echo 31337 > /dev/cgroup/foo/cgroup.procs
> > # cat /dev/cgroup/foo/tasks
> > 31337
> > 31338
> > 31339
> >
> > A new lock, called threadgroup_fork_lock and living in signal_struct, is
> > introduced to ensure atomicity when moving threads between cgroups. It's
> > taken for writing during the operation, and taking for reading in fork()
> > around the calls to cgroup_fork() and cgroup_post_fork(). I put calls to
> > down_read/up_read directly in copy_process(), since new inline functions
> > seemed like overkill.
> >
> > -- Ben
> >
> > ---
> > Documentation/cgroups/cgroups.txt | 13 -
> > include/linux/init_task.h | 9
> > include/linux/sched.h | 10
> > kernel/cgroup.c | 426 +++++++++++++++++++++++++++++++++-----
> > kernel/cgroup_freezer.c | 4
> > kernel/cpuset.c | 4
> > kernel/fork.c | 16 +
> > kernel/ns_cgroup.c | 4
> > kernel/sched.c | 4
> > 9 files changed, 440 insertions(+), 50 deletions(-)
>
> Here's an updated patchset. I've added an extra patch to implement the
> callback scheme Paul suggested (note how there are twice as many deleted
> lines of code as before :) ), and also moved the up_read/down_read calls
> to static inline functions in sched.h near the other threadgroup-related
> calls.
One more go at this. I've refreshed the patches to resolve some conflicts in
cgroup_freezer.c by adding an extra argument to the per_thread() call,
"need_rcu", which makes the function take rcu_read_lock even around the
single-task case (as the freezer now requires). No semantics have been
changed.
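As a quick usage sketch (mirroring the calls in the patches below; the only
difference at the call sites is the final flag):

	/*
	 * freezer: its per-thread check reads RCU-protected state even in
	 * the single-task case, so it asks for rcu_read_lock around it.
	 */
	ret = cgroup_can_attach_per_thread(new_cgroup, task,
					   freezer_can_attach_cb,
					   threadgroup, true /* need_rcu */);

	/* cpuset, ns, cpu: the single-task check needs no rcu_read_lock. */
	ret = cgroup_can_attach_per_thread(cont, tsk, cpuset_can_attach_cb,
					   threadgroup, false /* need_rcu */);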
I also poked around at some attach() calls which also iterate over the
threadgroup (blkiocg_attach, cpuset_attach, cpu_cgroup_attach). I was
borderline about making another function, cgroup_attach_per_thread(),
but decided against.
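Purely as a hypothetical sketch (not part of any posted patch), such a helper
might have mirrored the can_attach wrapper as below; note that the per-thread
attach helpers take different extra arguments per subsystem (for instance
cpuset_attach_task() also needs the nodemask and cpuset), so a single callback
signature fits less cleanly here than on the can_attach side:

/* Hypothetical only - never posted. Caller would hold tasklist_lock. */
static void cgroup_attach_per_thread(struct task_struct *task,
				     void (*cb)(struct task_struct *task),
				     bool threadgroup)
{
	cb(task);
	if (threadgroup) {
		struct task_struct *c;

		BUG_ON(!thread_group_leader(task));
		list_for_each_entry_rcu(c, &task->thread_group, thread_group)
			cb(c);
	}
}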
There is a big issue in cpuset_attach, as explained in this email:
http://www.spinics.net/lists/linux-containers/msg22223.html
but the actual code/diffs for this patchset are independent of that
getting fixed, so I'm putting this up for consideration now.
-- Ben
---
Documentation/cgroups/cgroups.txt | 13 -
block/blk-cgroup.c | 31 ++
include/linux/cgroup.h | 14 +
include/linux/init_task.h | 9
include/linux/sched.h | 35 ++
kernel/cgroup.c | 469 ++++++++++++++++++++++++++++++++++----
kernel/cgroup_freezer.c | 33 +-
kernel/cpuset.c | 30 --
kernel/fork.c | 10
kernel/ns_cgroup.c | 25 --
kernel/sched.c | 24 -
11 files changed, 565 insertions(+), 128 deletions(-)
Adds functionality to read/write lock CLONE_THREAD fork()ing per-threadgroup
From: Ben Blum <[email protected]>
This patch adds an rwsem that lives in a threadgroup's signal_struct that's
taken for reading in the fork path, under CONFIG_CGROUPS. If another part of
the kernel later wants to use such a locking mechanism, the CONFIG_CGROUPS
ifdefs should be changed to a higher-up flag that CGROUPS and the other system
would both depend on.
This is a pre-patch for cgroup-procs-write.patch.
Signed-off-by: Ben Blum <[email protected]>
---
include/linux/init_task.h | 9 +++++++++
include/linux/sched.h | 35 +++++++++++++++++++++++++++++++++++
kernel/fork.c | 10 ++++++++++
3 files changed, 54 insertions(+), 0 deletions(-)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 6b281fa..b560381 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -15,6 +15,14 @@
extern struct files_struct init_files;
extern struct fs_struct init_fs;
+#ifdef CONFIG_CGROUPS
+#define INIT_THREADGROUP_FORK_LOCK(sig) \
+ .threadgroup_fork_lock = \
+ __RWSEM_INITIALIZER(sig.threadgroup_fork_lock),
+#else
+#define INIT_THREADGROUP_FORK_LOCK(sig)
+#endif
+
#define INIT_SIGNALS(sig) { \
.nr_threads = 1, \
.wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\
@@ -31,6 +39,7 @@ extern struct fs_struct init_fs;
}, \
.cred_guard_mutex = \
__MUTEX_INITIALIZER(sig.cred_guard_mutex), \
+ INIT_THREADGROUP_FORK_LOCK(sig) \
}
extern struct nsproxy init_nsproxy;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8580dc6..213a0b9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -623,6 +623,16 @@ struct signal_struct {
unsigned audit_tty;
struct tty_audit_buf *tty_audit_buf;
#endif
+#ifdef CONFIG_CGROUPS
+ /*
+ * The threadgroup_fork_lock prevents threads from forking with
+ * CLONE_THREAD while held for writing. Use this for fork-sensitive
+ * threadgroup-wide operations. It's taken for reading in fork.c in
+ * copy_process().
+ * Currently only needed write-side by cgroups.
+ */
+ struct rw_semaphore threadgroup_fork_lock;
+#endif
int oom_adj; /* OOM kill score adjustment (bit shift) */
int oom_score_adj; /* OOM kill score adjustment */
@@ -2270,6 +2280,31 @@ static inline void unlock_task_sighand(struct task_struct *tsk,
spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
}
+/* See the declaration of threadgroup_fork_lock in signal_struct. */
+#ifdef CONFIG_CGROUPS
+static inline void threadgroup_fork_read_lock(struct task_struct *tsk)
+{
+ down_read(&tsk->signal->threadgroup_fork_lock);
+}
+static inline void threadgroup_fork_read_unlock(struct task_struct *tsk)
+{
+ up_read(&tsk->signal->threadgroup_fork_lock);
+}
+static inline void threadgroup_fork_write_lock(struct task_struct *tsk)
+{
+ down_write(&tsk->signal->threadgroup_fork_lock);
+}
+static inline void threadgroup_fork_write_unlock(struct task_struct *tsk)
+{
+ up_write(&tsk->signal->threadgroup_fork_lock);
+}
+#else
+static inline void threadgroup_fork_read_lock(struct task_struct *tsk) {}
+static inline void threadgroup_fork_read_unlock(struct task_struct *tsk) {}
+static inline void threadgroup_fork_write_lock(struct task_struct *tsk) {}
+static inline void threadgroup_fork_write_unlock(struct task_struct *tsk) {}
+#endif
+
#ifndef __HAVE_THREAD_FUNCTIONS
#define task_thread_info(task) ((struct thread_info *)(task)->stack)
diff --git a/kernel/fork.c b/kernel/fork.c
index 0979527..aefe61f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -905,6 +905,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
tty_audit_fork(sig);
+#ifdef CONFIG_CGROUPS
+ init_rwsem(&sig->threadgroup_fork_lock);
+#endif
+
sig->oom_adj = current->signal->oom_adj;
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -1087,6 +1091,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
monotonic_to_bootbased(&p->real_start_time);
p->io_context = NULL;
p->audit_context = NULL;
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_fork_read_lock(current);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
@@ -1294,6 +1300,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
cgroup_post_fork(p);
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_fork_read_unlock(current);
perf_event_fork(p);
return p;
@@ -1332,6 +1340,8 @@ bad_fork_cleanup_policy:
mpol_put(p->mempolicy);
bad_fork_cleanup_cgroup:
#endif
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_fork_read_unlock(current);
cgroup_exit(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
module_put(task_thread_info(p)->exec_domain->module);
Makes procs file writable to move all threads by tgid at once
From: Ben Blum <[email protected]>
This patch adds functionality that enables users to move all threads in a
threadgroup at once to a cgroup by writing the tgid to the 'cgroup.procs'
file. This current implementation makes use of a per-threadgroup rwsem that's
taken for reading in the fork() path to prevent newly forking threads within
the threadgroup from "escaping" while the move is in progress.
Signed-off-by: Ben Blum <[email protected]>
---
Documentation/cgroups/cgroups.txt | 13 +
kernel/cgroup.c | 424 +++++++++++++++++++++++++++++++++----
2 files changed, 387 insertions(+), 50 deletions(-)
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 190018b..07674e5 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -236,7 +236,8 @@ containing the following files describing that cgroup:
- cgroup.procs: list of tgids in the cgroup. This list is not
guaranteed to be sorted or free of duplicate tgids, and userspace
should sort/uniquify the list if this property is required.
- This is a read-only file, for now.
+ Writing a thread group id into this file moves all threads in that
+ group into this cgroup.
- notify_on_release flag: run the release agent on exit?
- release_agent: the path to use for release notifications (this file
exists in the top cgroup only)
@@ -426,6 +427,12 @@ You can attach the current shell task by echoing 0:
# echo 0 > tasks
+You can use the cgroup.procs file instead of the tasks file to move all
+threads in a threadgroup at once. Echoing the pid of any task in a
+threadgroup to cgroup.procs causes all tasks in that threadgroup to be
+attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
+in the writing task's threadgroup.
+
2.3 Mounting hierarchies by name
--------------------------------
@@ -574,7 +581,9 @@ called on a fork. If this method returns 0 (success) then this should
remain valid while the caller holds cgroup_mutex and it is ensured that either
attach() or cancel_attach() will be called in future. If threadgroup is
true, then a successful result indicates that all threads in the given
-thread's threadgroup can be moved together.
+thread's threadgroup can be moved together. If the subsystem wants to
+iterate over task->thread_group, it must take rcu_read_lock then check
+if thread_group_leader(task), returning -EAGAIN if that fails.
void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct task_struct *task, bool threadgroup)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f86dd9c..74be02c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1771,6 +1771,76 @@ int cgroup_can_attach_per_thread(struct cgroup *cgrp, struct task_struct *task,
}
EXPORT_SYMBOL_GPL(cgroup_can_attach_per_thread);
+/*
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ *
+ * 'guarantee' is set if the caller promises that a new css_set for the task
+ * will already exist. If not set, this function might sleep, and can fail with
+ * -ENOMEM. Otherwise, it can only fail with -ESRCH.
+ */
+static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+ struct task_struct *tsk, bool guarantee)
+{
+ struct css_set *oldcg;
+ struct css_set *newcg;
+
+ /*
+ * get old css_set. we need to take task_lock and refcount it, because
+ * an exiting task can change its css_set to init_css_set and drop its
+ * old one without taking cgroup_mutex.
+ */
+ task_lock(tsk);
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
+ task_unlock(tsk);
+
+ /* locate or allocate a new css_set for this task. */
+ if (guarantee) {
+ /* we know the css_set we want already exists. */
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(oldcg, cgrp, template);
+ BUG_ON(!newcg);
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+ } else {
+ might_sleep();
+ /* find_css_set will give us newcg already referenced. */
+ newcg = find_css_set(oldcg, cgrp);
+ if (!newcg) {
+ put_css_set(oldcg);
+ return -ENOMEM;
+ }
+ }
+ put_css_set(oldcg);
+
+ /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ task_unlock(tsk);
+ put_css_set(newcg);
+ return -ESRCH;
+ }
+ rcu_assign_pointer(tsk->cgroups, newcg);
+ task_unlock(tsk);
+
+ /* Update the css_set linked lists if we're using them */
+ write_lock(&css_set_lock);
+ if (!list_empty(&tsk->cg_list))
+ list_move(&tsk->cg_list, &newcg->tasks);
+ write_unlock(&css_set_lock);
+
+ /*
+ * We just gained a reference on oldcg by taking it from the task. As
+ * trading it for newcg is protected by cgroup_mutex, we're safe to drop
+ * it here; it will be freed under RCU.
+ */
+ put_css_set(oldcg);
+
+ set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+ return 0;
+}
+
/**
* cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
* @cgrp: the cgroup the task is attaching to
@@ -1781,11 +1851,9 @@ EXPORT_SYMBOL_GPL(cgroup_can_attach_per_thread);
*/
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
- int retval = 0;
+ int retval;
struct cgroup_subsys *ss, *failed_ss = NULL;
struct cgroup *oldcgrp;
- struct css_set *cg;
- struct css_set *newcg;
struct cgroupfs_root *root = cgrp->root;
/* Nothing to do if the task is already in that cgroup */
@@ -1809,46 +1877,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
}
}
- task_lock(tsk);
- cg = tsk->cgroups;
- get_css_set(cg);
- task_unlock(tsk);
- /*
- * Locate or allocate a new css_set for this task,
- * based on its final set of cgroups
- */
- newcg = find_css_set(cg, cgrp);
- put_css_set(cg);
- if (!newcg) {
- retval = -ENOMEM;
- goto out;
- }
-
- task_lock(tsk);
- if (tsk->flags & PF_EXITING) {
- task_unlock(tsk);
- put_css_set(newcg);
- retval = -ESRCH;
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
+ if (retval)
goto out;
- }
- rcu_assign_pointer(tsk->cgroups, newcg);
- task_unlock(tsk);
-
- /* Update the css_set linked lists if we're using them */
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list)) {
- list_del(&tsk->cg_list);
- list_add(&tsk->cg_list, &newcg->tasks);
- }
- write_unlock(&css_set_lock);
for_each_subsys(root, ss) {
if (ss->attach)
ss->attach(ss, cgrp, oldcgrp, tsk, false);
}
- set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+
synchronize_rcu();
- put_css_set(cg);
/*
* wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -1898,49 +1936,339 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
/*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * held. May take task_lock of task
+ * cgroup_attach_proc works in two stages, the first of which prefetches all
+ * new css_sets needed (to make sure we have enough memory before committing
+ * to the move) and stores them in a list of entries of the following type.
+ * TODO: possible optimization: use css_set->rcu_head for chaining instead
+ */
+struct cg_list_entry {
+ struct css_set *cg;
+ struct list_head links;
+};
+
+static bool css_set_check_fetched(struct cgroup *cgrp,
+ struct task_struct *tsk, struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(cg, cgrp, template);
+ if (newcg)
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+
+ /* doesn't exist at all? */
+ if (!newcg)
+ return false;
+ /* see if it's already in the list */
+ list_for_each_entry(cg_entry, newcg_list, links) {
+ if (cg_entry->cg == newcg) {
+ put_css_set(newcg);
+ return true;
+ }
+ }
+
+ /* not found */
+ put_css_set(newcg);
+ return false;
+}
+
+/*
+ * Find the new css_set and store it in the list in preparation for moving the
+ * given task to the given cgroup. Returns 0 or -ENOMEM.
*/
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+
+ /* ensure a new css_set will exist for this thread */
+ newcg = find_css_set(cg, cgrp);
+ if (!newcg)
+ return -ENOMEM;
+ /* add it to the list */
+ cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
+ if (!cg_entry) {
+ put_css_set(newcg);
+ return -ENOMEM;
+ }
+ cg_entry->cg = newcg;
+ list_add(&cg_entry->links, newcg_list);
+ return 0;
+}
+
+/**
+ * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * @cgrp: the cgroup to attach to
+ * @leader: the threadgroup leader task_struct of the group to be attached
+ *
+ * Call holding cgroup_mutex. Will take task_lock of each thread in leader's
+ * threadgroup individually in turn.
+ */
+int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+{
+ int retval;
+ struct cgroup_subsys *ss, *failed_ss = NULL;
+ struct cgroup *oldcgrp;
+ struct css_set *oldcg;
+ struct cgroupfs_root *root = cgrp->root;
+ /* threadgroup list cursor */
+ struct task_struct *tsk;
+ /*
+ * we need to make sure we have css_sets for all the tasks we're
+ * going to move -before- we actually start moving them, so that in
+ * case we get an ENOMEM we can bail out before making any changes.
+ */
+ struct list_head newcg_list;
+ struct cg_list_entry *cg_entry, *temp_nobe;
+
+ /* check that we can legitimately attach to the cgroup. */
+ for_each_subsys(root, ss) {
+ if (ss->can_attach) {
+ retval = ss->can_attach(ss, cgrp, leader, true);
+ if (retval) {
+ failed_ss = ss;
+ goto out;
+ }
+ }
+ }
+
+ /*
+ * step 1: make sure css_sets exist for all threads to be migrated.
+ * we use find_css_set, which allocates a new one if necessary.
+ */
+ INIT_LIST_HEAD(&newcg_list);
+ oldcgrp = task_cgroup_from_root(leader, root);
+ if (cgrp != oldcgrp) {
+ /* get old css_set */
+ task_lock(leader);
+ if (leader->flags & PF_EXITING) {
+ task_unlock(leader);
+ goto prefetch_loop;
+ }
+ oldcg = leader->cgroups;
+ get_css_set(oldcg);
+ task_unlock(leader);
+ /* acquire new one */
+ retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+ put_css_set(oldcg);
+ if (retval)
+ goto list_teardown;
+ }
+prefetch_loop:
+ rcu_read_lock();
+ /* sanity check - if we raced with de_thread, we must abort */
+ if (!thread_group_leader(leader)) {
+ retval = -EAGAIN;
+ goto list_teardown;
+ }
+ /*
+ * if we need to fetch a new css_set for this task, we must exit the
+ * rcu_read section because allocating it can sleep. afterwards, we'll
+ * need to restart iteration on the threadgroup list - the whole thing
+ * will be O(nm) in the number of threads and css_sets; as the typical
+ * case has only one css_set for all of them, usually O(n). which ones
+ * we need allocated won't change as long as we hold cgroup_mutex.
+ */
+ list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
+ /* nothing to do if this task is already in the cgroup */
+ oldcgrp = task_cgroup_from_root(tsk, root);
+ if (cgrp == oldcgrp)
+ continue;
+ /* get old css_set pointer */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ /* ignore this task if it's going away */
+ task_unlock(tsk);
+ continue;
+ }
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
+ task_unlock(tsk);
+ /* see if the new one for us is already in the list? */
+ if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
+ /* was already there, nothing to do. */
+ put_css_set(oldcg);
+ } else {
+ /* we don't already have it. get new one. */
+ rcu_read_unlock();
+ retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+ put_css_set(oldcg);
+ if (retval)
+ goto list_teardown;
+ /* begin iteration again. */
+ goto prefetch_loop;
+ }
+ }
+ rcu_read_unlock();
+
+ /*
+ * step 2: now that we're guaranteed success wrt the css_sets, proceed
+ * to move all tasks to the new cgroup. we need to lock against possible
+ * races with fork(). note: we can safely take the threadgroup_fork_lock
+ * of leader since attach_task_by_pid took a reference.
+ * threadgroup_fork_lock must be taken outside of tasklist_lock to match
+ * the order in the fork path.
+ */
+ threadgroup_fork_write_lock(leader);
+ read_lock(&tasklist_lock);
+ /* sanity check - if we raced with de_thread, we must abort */
+ if (!thread_group_leader(leader)) {
+ retval = -EAGAIN;
+ read_unlock(&tasklist_lock);
+ threadgroup_fork_write_unlock(leader);
+ goto list_teardown;
+ }
+ /*
+ * No failure cases left, so this is the commit point.
+ *
+ * If the leader is already there, skip moving him. Note: even if the
+ * leader is PF_EXITING, we still move all other threads; if everybody
+ * is PF_EXITING, we end up doing nothing, which is ok.
+ */
+ oldcgrp = task_cgroup_from_root(leader, root);
+ if (cgrp != oldcgrp) {
+ retval = cgroup_task_migrate(cgrp, oldcgrp, leader, true);
+ BUG_ON(retval != 0 && retval != -ESRCH);
+ }
+ /* Now iterate over each thread in the group. */
+ list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
+ /* leave current thread as it is if it's already there */
+ oldcgrp = task_cgroup_from_root(tsk, root);
+ if (cgrp == oldcgrp)
+ continue;
+ /* we don't care whether these threads are exiting */
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
+ BUG_ON(retval != 0 && retval != -ESRCH);
+ }
+
+ /*
+ * step 3: attach whole threadgroup to each subsystem
+ * TODO: if ever a subsystem needs to know the oldcgrp for each task
+ * being moved, this call will need to be reworked to communicate that.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->attach)
+ ss->attach(ss, cgrp, oldcgrp, leader, true);
+ }
+ /* holding these until here keeps us safe from exec() and fork(). */
+ read_unlock(&tasklist_lock);
+ threadgroup_fork_write_unlock(leader);
+
+ /*
+ * step 4: success! and cleanup
+ */
+ synchronize_rcu();
+ cgroup_wakeup_rmdir_waiter(cgrp);
+ retval = 0;
+list_teardown:
+ /* clean up the list of prefetched css_sets. */
+ list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
+ list_del(&cg_entry->links);
+ put_css_set(cg_entry->cg);
+ kfree(cg_entry);
+ }
+out:
+ if (retval) {
+ /* same deal as in cgroup_attach_task, with threadgroup=true */
+ for_each_subsys(root, ss) {
+ if (ss == failed_ss)
+ break;
+ if (ss->cancel_attach)
+ ss->cancel_attach(ss, cgrp, leader, true);
+ }
+ }
+ return retval;
+}
+
+/*
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will take
+ * cgroup_mutex; may take task_lock of task.
+ */
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
{
struct task_struct *tsk;
const struct cred *cred = current_cred(), *tcred;
int ret;
+ if (!cgroup_lock_live_group(cgrp))
+ return -ENODEV;
+
if (pid) {
rcu_read_lock();
tsk = find_task_by_vpid(pid);
- if (!tsk || tsk->flags & PF_EXITING) {
+ if (!tsk) {
+ rcu_read_unlock();
+ cgroup_unlock();
+ return -ESRCH;
+ }
+ if (threadgroup) {
+ /*
+ * it is safe to find group_leader because tsk was found
+ * in the tid map, meaning it can't have been unhashed
+ * by someone in de_thread changing the leadership.
+ */
+ tsk = tsk->group_leader;
+ BUG_ON(!thread_group_leader(tsk));
+ } else if (tsk->flags & PF_EXITING) {
+ /* optimization for the single-task-only case */
rcu_read_unlock();
+ cgroup_unlock();
return -ESRCH;
}
+ /*
+ * even if we're attaching all tasks in the thread group, we
+ * only need to check permissions on one of them.
+ */
tcred = __task_cred(tsk);
if (cred->euid &&
cred->euid != tcred->uid &&
cred->euid != tcred->suid) {
rcu_read_unlock();
+ cgroup_unlock();
return -EACCES;
}
get_task_struct(tsk);
rcu_read_unlock();
} else {
- tsk = current;
+ if (threadgroup)
+ tsk = current->group_leader;
+ else
+ tsk = current;
get_task_struct(tsk);
}
- ret = cgroup_attach_task(cgrp, tsk);
+ if (threadgroup)
+ ret = cgroup_attach_proc(cgrp, tsk);
+ else
+ ret = cgroup_attach_task(cgrp, tsk);
put_task_struct(tsk);
+ cgroup_unlock();
return ret;
}
static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
{
+ return attach_task_by_pid(cgrp, pid, false);
+}
+
+static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+{
int ret;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
- ret = attach_task_by_pid(cgrp, pid);
- cgroup_unlock();
+ do {
+ /*
+ * attach_proc fails with -EAGAIN if threadgroup leadership
+ * changes in the middle of the operation, in which case we need
+ * to find the task_struct for the new leader and start over.
+ */
+ ret = attach_task_by_pid(cgrp, tgid, true);
+ } while (ret == -EAGAIN);
return ret;
}
@@ -3294,9 +3622,9 @@ static struct cftype files[] = {
{
.name = CGROUP_FILE_GENERIC_PREFIX "procs",
.open = cgroup_procs_open,
- /* .write_u64 = cgroup_procs_write, TODO */
+ .write_u64 = cgroup_procs_write,
.release = cgroup_pidlist_release,
- .mode = S_IRUGO,
+ .mode = S_IRUGO | S_IWUSR,
},
{
.name = "notify_on_release",
Add cgroup wrapper for safely calling can_attach on all threads in a threadgroup
From: Ben Blum <[email protected]>
This patch adds a function cgroup_can_attach_per_thread which handles iterating
over each thread in a threadgroup safely with respect to the invariants that
will be used in cgroup_attach_proc. Also, subsystems whose can_attach calls
require per-thread validation are modified to use the per_thread wrapper to
avoid duplicating cgroup-internal code.
This is a pre-patch for cgroup-procs-writable.patch.
Signed-off-by: Ben Blum <[email protected]>
---
block/blk-cgroup.c | 31 ++++++++++++++++++++++++++-----
include/linux/cgroup.h | 14 ++++++++++++++
kernel/cgroup.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
kernel/cgroup_freezer.c | 33 ++++++++++++++-------------------
kernel/cpuset.c | 30 ++++++++++--------------------
kernel/ns_cgroup.c | 25 +++++++++----------------
kernel/sched.c | 24 ++++++------------------
7 files changed, 124 insertions(+), 78 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b1febd0..865e208 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1475,9 +1475,7 @@ done:
* of the main cic data structures. For now we allow a task to change
* its cgroup only if it's the only owner of its ioc.
*/
-static int blkiocg_can_attach(struct cgroup_subsys *subsys,
- struct cgroup *cgroup, struct task_struct *tsk,
- bool threadgroup)
+static int blkiocg_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
struct io_context *ioc;
int ret = 0;
@@ -1492,10 +1490,17 @@ static int blkiocg_can_attach(struct cgroup_subsys *subsys,
return ret;
}
-static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
- struct cgroup *prev, struct task_struct *tsk,
+static int blkiocg_can_attach(struct cgroup_subsys *subsys,
+ struct cgroup *cgroup, struct task_struct *tsk,
bool threadgroup)
{
+ return cgroup_can_attach_per_thread(cgroup, tsk,
+ blkiocg_can_attach_task,
+ threadgroup, false);
+}
+
+static void blkiocg_attach_task(struct task_struct *tsk)
+{
struct io_context *ioc;
task_lock(tsk);
@@ -1505,6 +1510,22 @@ static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
task_unlock(tsk);
}
+static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
+ struct cgroup *prev, struct task_struct *tsk,
+ bool threadgroup)
+{
+ blkiocg_attach_task(tsk);
+ if (threadgroup) {
+ struct task_struct *c;
+
+ /* tasklist_lock will be held. */
+ BUG_ON(!thread_group_leader(tsk));
+ list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
+ blkiocg_attach_task(c);
+ }
+ }
+}
+
void blkio_policy_register(struct blkio_policy_type *blkiop)
{
spin_lock(&blkio_list_lock);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ce104e3..96898e6 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -590,6 +590,20 @@ static inline int cgroup_attach_task_current_cg(struct task_struct *tsk)
}
/*
+ * For use in subsystems whose can_attach functions need to run an operation
+ * on every task in the threadgroup. Calls the given callback once if the
+ * 'threadgroup' flag is false, or once per thread in the group if true.
+ * The callback should return 0/-ERR; this will return 0/-ERR.
+ * The callback will run within an rcu_read section, so must not sleep.
+ * 'need_rcu' should specify whether the callback needs to run in an rcu_read
+ * section even in the single-task case.
+ */
+int cgroup_can_attach_per_thread(struct cgroup *cgrp, struct task_struct *task,
+ int (*cb)(struct cgroup *cgrp,
+ struct task_struct *task),
+ bool threadgroup, bool need_rcu);
+
+/*
* CSS ID is ID for cgroup_subsys_state structs under subsys. This only works
* if cgroup_subsys.use_id == true. It can be used for looking up and scanning.
* CSS ID is assigned at cgroup allocation (create) automatically
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 66a416b..f86dd9c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1726,6 +1726,51 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
}
EXPORT_SYMBOL_GPL(cgroup_path);
+int cgroup_can_attach_per_thread(struct cgroup *cgrp, struct task_struct *task,
+ int (*cb)(struct cgroup *cgrp,
+ struct task_struct *task),
+ bool threadgroup, bool need_rcu)
+{
+ int ret;
+
+ /* Run callback on the leader first, taking rcu_read_lock if needed. */
+ if (need_rcu)
+ rcu_read_lock();
+
+ ret = cb(cgrp, task);
+
+ if (need_rcu)
+ rcu_read_unlock();
+
+ if (ret < 0)
+ return ret;
+
+ /* Run on each task in the threadgroup. */
+ if (threadgroup) {
+ struct task_struct *c;
+
+ rcu_read_lock();
+ /*
+ * It is necessary for the given task to still be the leader
+ * to safely traverse thread_group. See cgroup_attach_proc.
+ */
+ if (!thread_group_leader(task)) {
+ rcu_read_unlock();
+ return -EAGAIN;
+ }
+ list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
+ ret = cb(cgrp, c);
+ if (ret < 0) {
+ rcu_read_unlock();
+ return ret;
+ }
+ }
+ rcu_read_unlock();
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(cgroup_can_attach_per_thread);
+
/**
* cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
* @cgrp: the cgroup the task is attaching to
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e7bebb7..1f5ac8f 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -153,6 +153,13 @@ static void freezer_destroy(struct cgroup_subsys *ss,
kfree(cgroup_freezer(cgroup));
}
+static int freezer_can_attach_cb(struct cgroup *cgrp, struct task_struct *task)
+{
+ if (__cgroup_freezing_or_frozen(task))
+ return -EBUSY;
+ return 0;
+}
+
/*
* The call to cgroup_lock() in the freezer.state write method prevents
* a write to that file racing against an attach, and hence the
@@ -163,6 +170,7 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
struct task_struct *task, bool threadgroup)
{
struct freezer *freezer;
+ int ret;
/*
* Anything frozen can't move or be moved to/from.
@@ -172,25 +180,12 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
if (freezer->state != CGROUP_THAWED)
return -EBUSY;
- rcu_read_lock();
- if (__cgroup_freezing_or_frozen(task)) {
- rcu_read_unlock();
- return -EBUSY;
- }
- rcu_read_unlock();
-
- if (threadgroup) {
- struct task_struct *c;
-
- rcu_read_lock();
- list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
- if (__cgroup_freezing_or_frozen(c)) {
- rcu_read_unlock();
- return -EBUSY;
- }
- }
- rcu_read_unlock();
- }
+ /* Need to take rcu_read_lock even around the call on the leader. */
+ ret = cgroup_can_attach_per_thread(new_cgroup, task,
+ freezer_can_attach_cb, threadgroup,
+ true);
+ if (ret < 0)
+ return ret;
return 0;
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4349935..8fbe1e3 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1375,11 +1375,15 @@ static int fmeter_getrate(struct fmeter *fmp)
/* Protected by cgroup_lock */
static cpumask_var_t cpus_attach;
+static int cpuset_can_attach_cb(struct cgroup *cgrp, struct task_struct *task)
+{
+ return security_task_setscheduler(task);
+}
+
/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
struct task_struct *tsk, bool threadgroup)
{
- int ret;
struct cpuset *cs = cgroup_cs(cont);
if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1396,23 +1400,8 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
if (tsk->flags & PF_THREAD_BOUND)
return -EINVAL;
- ret = security_task_setscheduler(tsk);
- if (ret)
- return ret;
- if (threadgroup) {
- struct task_struct *c;
-
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- ret = security_task_setscheduler(c);
- if (ret) {
- rcu_read_unlock();
- return ret;
- }
- }
- rcu_read_unlock();
- }
- return 0;
+ return cgroup_can_attach_per_thread(cont, tsk, cpuset_can_attach_cb,
+ threadgroup, false);
}
static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
@@ -1455,11 +1444,12 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
cpuset_attach_task(tsk, to, cs);
if (threadgroup) {
struct task_struct *c;
- rcu_read_lock();
+
+ /* tasklist_lock will be held. */
+ BUG_ON(!thread_group_leader(tsk));
list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
cpuset_attach_task(c, to, cs);
}
- rcu_read_unlock();
}
/* change mm; only needs to be done once even if threadgroup */
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 2c98ad9..66ba860 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -42,6 +42,13 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
* (hence either you are in the same cgroup as task, or in an
* ancestor cgroup thereof)
*/
+static int ns_can_attach_cb(struct cgroup *new_cgroup, struct task_struct *task)
+{
+ if (!cgroup_is_descendant(new_cgroup, task))
+ return -EPERM;
+ return 0;
+}
+
static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
struct task_struct *task, bool threadgroup)
{
@@ -53,22 +60,8 @@ static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
return -EPERM;
}
- if (!cgroup_is_descendant(new_cgroup, task))
- return -EPERM;
-
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
- if (!cgroup_is_descendant(new_cgroup, c)) {
- rcu_read_unlock();
- return -EPERM;
- }
- }
- rcu_read_unlock();
- }
-
- return 0;
+ return cgroup_can_attach_per_thread(new_cgroup, task, ns_can_attach_cb,
+ threadgroup, false);
}
/*
diff --git a/kernel/sched.c b/kernel/sched.c
index 218ef20..8e89bf9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8659,22 +8659,9 @@ static int
cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct task_struct *tsk, bool threadgroup)
{
- int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
- if (retval)
- return retval;
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- retval = cpu_cgroup_can_attach_task(cgrp, c);
- if (retval) {
- rcu_read_unlock();
- return retval;
- }
- }
- rcu_read_unlock();
- }
- return 0;
+ return cgroup_can_attach_per_thread(cgrp, tsk,
+ cpu_cgroup_can_attach_task,
+ threadgroup, false);
}
static void
@@ -8685,11 +8672,12 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
sched_move_task(tsk);
if (threadgroup) {
struct task_struct *c;
- rcu_read_lock();
+
+ /* tasklist_lock will be held. */
+ BUG_ON(!thread_group_leader(tsk));
list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
sched_move_task(c);
}
- rcu_read_unlock();
}
}
On Fri, Dec 24, 2010 at 03:22:26AM -0500, Ben Blum wrote:
> On Wed, Aug 11, 2010 at 01:46:04AM -0400, Ben Blum wrote:
> > On Fri, Jul 30, 2010 at 07:56:49PM -0400, Ben Blum wrote:
> > > This patch series is a revision of http://lkml.org/lkml/2010/6/25/11 .
> > >
> > > This patch series implements a write function for the 'cgroup.procs'
> > > per-cgroup file, which enables atomic movement of multithreaded
> > > applications between cgroups. Writing the thread-ID of any thread in a
> > > threadgroup to a cgroup's procs file causes all threads in the group to
> > > be moved to that cgroup safely with respect to threads forking/exiting.
> > > (Possible usage scenario: If running a multithreaded build system that
> > > sucks up system resources, this lets you restrict it all at once into a
> > > new cgroup to keep it under control.)
> > >
> > > Example: Suppose pid 31337 clones new threads 31338 and 31339.
> > >
> > > # cat /dev/cgroup/tasks
> > > ...
> > > 31337
> > > 31338
> > > 31339
> > > # mkdir /dev/cgroup/foo
> > > # echo 31337 > /dev/cgroup/foo/cgroup.procs
> > > # cat /dev/cgroup/foo/tasks
> > > 31337
> > > 31338
> > > 31339
> > >
> > > A new lock, called threadgroup_fork_lock and living in signal_struct, is
> > > introduced to ensure atomicity when moving threads between cgroups. It's
> > > taken for writing during the operation, and taking for reading in fork()
> > > around the calls to cgroup_fork() and cgroup_post_fork(). I put calls to
> > > down_read/up_read directly in copy_process(), since new inline functions
> > > seemed like overkill.
> > >
> > > -- Ben
> > >
> > > ---
> > > Documentation/cgroups/cgroups.txt | 13 -
> > > include/linux/init_task.h | 9
> > > include/linux/sched.h | 10
> > > kernel/cgroup.c | 426 +++++++++++++++++++++++++++++++++-----
> > > kernel/cgroup_freezer.c | 4
> > > kernel/cpuset.c | 4
> > > kernel/fork.c | 16 +
> > > kernel/ns_cgroup.c | 4
> > > kernel/sched.c | 4
> > > 9 files changed, 440 insertions(+), 50 deletions(-)
> >
> > Here's an updated patchset. I've added an extra patch to implement the
> > callback scheme Paul suggested (note how there are twice as many deleted
> > lines of code as before :) ), and also moved the up_read/down_read calls
> > to static inline functions in sched.h near the other threadgroup-related
> > calls.
>
> One more go at this. I've refreshed the patches for some conflicts in
> cgroup_freezer.c, by adding an extra argument to the per_thread() call,
> "need_rcu", which makes the function take rcu_read_lock even around the
> single-task case (like freezer now requires). So no semantics have been
> changed.
>
> I also poked around at some attach() calls which also iterate over the
> threadgroup (blkiocg_attach, cpuset_attach, cpu_cgroup_attach). I was
> borderline about making another function, cgroup_attach_per_thread(),
> but decided against.
>
> There is a big issue in cpuset_attach, as explained in this email:
> http://www.spinics.net/lists/linux-containers/msg22223.html
> but the actual code/diffs for this patchset are independent of that
> getting fixed, so I'm putting this up for consideration now.
>
> -- Ben
Well this time everything here is actually safe and correct, as far as
my best efforts and keen eyes can tell. I dropped the per_thread call
from the last series in favour of revising the subsystem callback
interface. It now looks like this:
ss->can_attach()
- Thread-independent, possibly expensive/sleeping.
ss->can_attach_task()
- Called per-thread, run with rcu_read so must not sleep.
ss->pre_attach()
- Thread independent, must be atomic, happens before attach_task.
ss->attach_task()
- Called per-thread, run with tasklist_lock so must not sleep.
ss->attach()
- Thread independent, possibly expensive/sleeping, called last.
I think this makes the most sense, since it keeps all of the threadgroup
logic confined to cgroup_attach_proc and, by splitting up the callbacks,
many subsystems end up with less code for cases they don't need to worry
about. It also decouples the issue mentioned here:
http://www.spinics.net/lists/linux-containers/msg22236.html
from this patchset (since the mmap_sem work happens in the thread-independent
callbacks), and fixes (this particular case of) the problem described here:
http://www.spinics.net/lists/linux-containers/msg22223.html
(by using global nodemasks for the three attach callbacks).
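To make the shape of the result concrete, a subsystem wired up to the split
interface registers something like the following (the foo_* names are
illustrative, not from the patches, and a subsystem only fills in the hooks
it actually needs):

struct cgroup_subsys foo_subsys = {
	.name		 = "foo",
	.create		 = foo_create,
	.destroy	 = foo_destroy,
	.can_attach	 = foo_can_attach,	/* whole operation, may sleep */
	.can_attach_task = foo_can_attach_task,	/* per thread, under rcu_read, no sleeping */
	.pre_attach	 = foo_pre_attach,	/* once, atomic, before attach_task */
	.attach_task	 = foo_attach_task,	/* per thread, under tasklist_lock, no sleeping */
	.attach		 = foo_attach,		/* whole operation, may sleep, runs last */
	.subsys_id	 = foo_subsys_id,
};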
One final bullet to dodge: cpuset_change_task_nodemask() is implemented
using a loop around yield() to synchronize the mems_allowed, so it can't
be used in the atomic attach_task(). (It looks like a total mess to me -
can anybody justify why it was done that way, instead of using a better
concurrency primitive?) Rather than dirty my hands by changing any of
it, I just moved it out of the per-thread function - explained more in
the second patch. If it gets rewritten to avoid yielding, it can be
moved back to attach_task (I left a TODO).
Other than that, a quick review of why everything here is safe:
- Iterating over the thread_group is done only under rcu_read_lock or
tasklist_lock, always checking first that thread_group_leader(task).
(And, a reference is held on that task the whole time.)
- All allocation is done outside of rcu_read/tasklist_lock.
- All subsystem callbacks for can_attach_task() and attach_task() never
call any function that can block or otherwise yield.
(It'd be really nice if the functions that might sleep and regions of
code that must not sleep could be checked for automatically at build.)
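Concretely, the check-then-iterate pattern the first rule describes (as it
appears in cgroup_attach_proc earlier in this thread) boils down to:

	rcu_read_lock();
	/* if we raced with de_thread, leadership changed; caller retries */
	if (!thread_group_leader(leader)) {
		rcu_read_unlock();
		return -EAGAIN;
	}
	list_for_each_entry_rcu(c, &leader->thread_group, thread_group) {
		/* per-thread work; must not sleep or allocate here */
	}
	rcu_read_unlock();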
-- Ben
---
Documentation/cgroups/cgroups.txt | 44 ++-
Documentation/cgroups/cpusets.txt | 9
block/blk-cgroup.c | 18 -
include/linux/cgroup.h | 10
include/linux/init_task.h | 9
include/linux/sched.h | 35 ++
kernel/cgroup.c | 489 ++++++++++++++++++++++++++++++++++----
kernel/cgroup_freezer.c | 27 --
kernel/cpuset.c | 116 ++++-----
kernel/fork.c | 10
kernel/ns_cgroup.c | 23 -
kernel/sched.c | 38 --
mm/memcontrol.c | 18 -
security/device_cgroup.c | 3
14 files changed, 635 insertions(+), 214 deletions(-)
Adds functionality to read/write lock CLONE_THREAD fork()ing per-threadgroup
From: Ben Blum <[email protected]>
This patch adds an rwsem that lives in a threadgroup's signal_struct that's
taken for reading in the fork path, under CONFIG_CGROUPS. If another part of
the kernel later wants to use such a locking mechanism, the CONFIG_CGROUPS
ifdefs should be changed to a higher-up flag that CGROUPS and the other system
would both depend on.
This is a pre-patch for cgroup-procs-write.patch.
Signed-off-by: Ben Blum <[email protected]>
---
include/linux/init_task.h | 9 +++++++++
include/linux/sched.h | 35 +++++++++++++++++++++++++++++++++++
kernel/fork.c | 10 ++++++++++
3 files changed, 54 insertions(+), 0 deletions(-)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 6b281fa..b560381 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -15,6 +15,14 @@
extern struct files_struct init_files;
extern struct fs_struct init_fs;
+#ifdef CONFIG_CGROUPS
+#define INIT_THREADGROUP_FORK_LOCK(sig) \
+ .threadgroup_fork_lock = \
+ __RWSEM_INITIALIZER(sig.threadgroup_fork_lock),
+#else
+#define INIT_THREADGROUP_FORK_LOCK(sig)
+#endif
+
#define INIT_SIGNALS(sig) { \
.nr_threads = 1, \
.wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\
@@ -31,6 +39,7 @@ extern struct fs_struct init_fs;
}, \
.cred_guard_mutex = \
__MUTEX_INITIALIZER(sig.cred_guard_mutex), \
+ INIT_THREADGROUP_FORK_LOCK(sig) \
}
extern struct nsproxy init_nsproxy;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8580dc6..213a0b9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -623,6 +623,16 @@ struct signal_struct {
unsigned audit_tty;
struct tty_audit_buf *tty_audit_buf;
#endif
+#ifdef CONFIG_CGROUPS
+ /*
+ * The threadgroup_fork_lock prevents threads from forking with
+ * CLONE_THREAD while held for writing. Use this for fork-sensitive
+ * threadgroup-wide operations. It's taken for reading in fork.c in
+ * copy_process().
+ * Currently only needed write-side by cgroups.
+ */
+ struct rw_semaphore threadgroup_fork_lock;
+#endif
int oom_adj; /* OOM kill score adjustment (bit shift) */
int oom_score_adj; /* OOM kill score adjustment */
@@ -2270,6 +2280,31 @@ static inline void unlock_task_sighand(struct task_struct *tsk,
spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
}
+/* See the declaration of threadgroup_fork_lock in signal_struct. */
+#ifdef CONFIG_CGROUPS
+static inline void threadgroup_fork_read_lock(struct task_struct *tsk)
+{
+ down_read(&tsk->signal->threadgroup_fork_lock);
+}
+static inline void threadgroup_fork_read_unlock(struct task_struct *tsk)
+{
+ up_read(&tsk->signal->threadgroup_fork_lock);
+}
+static inline void threadgroup_fork_write_lock(struct task_struct *tsk)
+{
+ down_write(&tsk->signal->threadgroup_fork_lock);
+}
+static inline void threadgroup_fork_write_unlock(struct task_struct *tsk)
+{
+ up_write(&tsk->signal->threadgroup_fork_lock);
+}
+#else
+static inline void threadgroup_fork_read_lock(struct task_struct *tsk) {}
+static inline void threadgroup_fork_read_unlock(struct task_struct *tsk) {}
+static inline void threadgroup_fork_write_lock(struct task_struct *tsk) {}
+static inline void threadgroup_fork_write_unlock(struct task_struct *tsk) {}
+#endif
+
#ifndef __HAVE_THREAD_FUNCTIONS
#define task_thread_info(task) ((struct thread_info *)(task)->stack)
diff --git a/kernel/fork.c b/kernel/fork.c
index 0979527..aefe61f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -905,6 +905,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
tty_audit_fork(sig);
+#ifdef CONFIG_CGROUPS
+ init_rwsem(&sig->threadgroup_fork_lock);
+#endif
+
sig->oom_adj = current->signal->oom_adj;
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -1087,6 +1091,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
monotonic_to_bootbased(&p->real_start_time);
p->io_context = NULL;
p->audit_context = NULL;
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_fork_read_lock(current);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
@@ -1294,6 +1300,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
cgroup_post_fork(p);
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_fork_read_unlock(current);
perf_event_fork(p);
return p;
@@ -1332,6 +1340,8 @@ bad_fork_cleanup_policy:
mpol_put(p->mempolicy);
bad_fork_cleanup_cgroup:
#endif
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_fork_read_unlock(current);
cgroup_exit(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
module_put(task_thread_info(p)->exec_domain->module);
Add cgroup subsystem callbacks for per-thread attachment in atomic contexts
From: Ben Blum <[email protected]>
This patch adds can_attach_task, pre_attach, and attach_task as new callbacks
in the cgroup subsystem interface. Unlike can_attach and attach, these are
per-thread operations, to be called potentially many times when attaching an
entire threadgroup; they may run under rcu_read_lock or tasklist_lock, so they
are for quick operations only.
The old "bool threadgroup" interface is removed, since these callbacks replace
it.
All subsystems are modified for the new interface. Of note is cpuset, which
requires the from/to nodemasks used during attach to be globally scoped
(per-cpuset would also work) so that they persist from pre_attach through
attach_task to attach.
This is a pre-patch for cgroup-procs-writable.patch.
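(Not part of the patch: for illustration, a minimal, made-up subsystem wired to
the new hooks might look like the sketch below. The callback signatures follow
the struct cgroup_subsys changes in this patch; the "example" subsystem and its
empty handlers are invented.)

static int example_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
                              struct task_struct *tsk)
{
        /* whole-group check; may sleep or allocate */
        return 0;
}

static int example_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
        /* per-thread check; may run under rcu_read_lock, so no sleeping */
        return 0;
}

static void example_pre_attach(struct cgroup *cgrp)
{
        /* thread-independent setup that attach_task will rely on; atomic */
}

static void example_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
        /* per-thread commit; may run under tasklist_lock, so no sleeping */
}

static void example_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
                           struct cgroup *old_cgrp, struct task_struct *tsk)
{
        /* expensive, possibly sleeping post-attachment work; runs last */
}

struct cgroup_subsys example_subsys = {
        .name            = "example",
        .can_attach      = example_can_attach,
        .can_attach_task = example_can_attach_task,
        .pre_attach      = example_pre_attach,
        .attach_task     = example_attach_task,
        .attach          = example_attach,
        /* .create, .destroy, .subsys_id, etc. omitted for brevity */
};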
Signed-off-by: Ben Blum <[email protected]>
---
Documentation/cgroups/cgroups.txt | 35 ++++++++---
Documentation/cgroups/cpusets.txt | 9 +++
block/blk-cgroup.c | 18 ++----
include/linux/cgroup.h | 10 ++-
kernel/cgroup.c | 17 ++++-
kernel/cgroup_freezer.c | 27 ++++-----
kernel/cpuset.c | 116 +++++++++++++++++++------------------
kernel/ns_cgroup.c | 23 +++----
kernel/sched.c | 38 +-----------
mm/memcontrol.c | 18 ++----
security/device_cgroup.c | 3 -
11 files changed, 149 insertions(+), 165 deletions(-)
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 190018b..341ed44 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -563,7 +563,7 @@ rmdir() will fail with it. From this behavior, pre_destroy() can be
called multiple times against a cgroup.
int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct task_struct *task, bool threadgroup)
+ struct task_struct *task)
(cgroup_mutex held by caller)
Called prior to moving a task into a cgroup; if the subsystem
@@ -572,9 +572,15 @@ task is passed, then a successful result indicates that *any*
unspecified task can be moved into the cgroup. Note that this isn't
called on a fork. If this method returns 0 (success) then this should
remain valid while the caller holds cgroup_mutex and it is ensured that either
-attach() or cancel_attach() will be called in future. If threadgroup is
-true, then a successful result indicates that all threads in the given
-thread's threadgroup can be moved together.
+attach() or cancel_attach() will be called in future.
+
+int can_attach_task(struct cgroup *cgrp, struct task_struct *tsk);
+(cgroup_mutex and rcu_read_lock held by caller)
+
+As can_attach, but for operations that must be run once per task to be
+attached (possibly many when using cgroup_attach_proc). This may be called
+with rcu_read_lock() held, so sleeping is not permitted. Expensive operations,
+such as dealing with the shared mm, should run in can_attach.
void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct task_struct *task, bool threadgroup)
@@ -587,15 +593,26 @@ This will be called only about subsystems whose can_attach() operation have
succeeded.
void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cgrp, struct task_struct *task,
- bool threadgroup)
+ struct cgroup *old_cgrp, struct task_struct *task)
(cgroup_mutex held by caller)
Called after the task has been attached to the cgroup, to allow any
post-attachment activity that requires memory allocations or blocking.
-If threadgroup is true, the subsystem should take care of all threads
-in the specified thread's threadgroup. Currently does not support any
-subsystem that might need the old_cgrp for every thread in the group.
+
+void pre_attach(struct cgroup *cgrp);
+(cgroup_mutex and tasklist_lock held by caller)
+
+See description of attach_task.
+
+void attach_task(struct cgroup *cgrp, struct task_struct *tsk);
+(cgroup_mutex and possibly tasklist_lock held by caller)
+
+As attach, but for operations that must be run once per task to be attached,
+like can_attach_task. Sometimes called with tasklist_lock taken for reading,
+so may not sleep. Currently does not support any subsystem that might need the
+old_cgrp for every thread in the group. Note: unlike can_attach_task, this
+runs before attach, so use pre_attach for non-per-thread operations that must
+happen before attach_task.
void fork(struct cgroup_subsy *ss, struct task_struct *task)
diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt
index 5d0d569..1f0868d 100644
--- a/Documentation/cgroups/cpusets.txt
+++ b/Documentation/cgroups/cpusets.txt
@@ -659,6 +659,15 @@ the current task's cpuset, then we relax the cpuset, and look for
memory anywhere we can find it. It's better to violate the cpuset
than stress the kernel.
+There is a third exception to the above. When using the cgroup.procs file
+to move all tasks in a threadgroup at once, the per-task attachment code
+must run in an atomic context, but as currently implemented, changing the
+nodemasks for a task's memory policy may need to deschedule. So, in this
+case, the best cpusets can do is change the nodemask for the threadgroup
+leader when attaching. Thus, a multithreaded mempolicy user should first
+write to cgroup.procs (for correctness) and then write each thread's tid to
+the tasks file so that every thread's nodemask gets updated.
+
To start a new job that is to be contained within a cpuset, the steps are:
1) mkdir /dev/cpuset
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b1febd0..45b3809 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -30,10 +30,8 @@ EXPORT_SYMBOL_GPL(blkio_root_cgroup);
static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
struct cgroup *);
-static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
- struct task_struct *, bool);
-static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
- struct cgroup *, struct task_struct *, bool);
+static int blkiocg_can_attach_task(struct cgroup *, struct task_struct *);
+static void blkiocg_attach_task(struct cgroup *, struct task_struct *);
static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
@@ -46,8 +44,8 @@ static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
struct cgroup_subsys blkio_subsys = {
.name = "blkio",
.create = blkiocg_create,
- .can_attach = blkiocg_can_attach,
- .attach = blkiocg_attach,
+ .can_attach_task = blkiocg_can_attach_task,
+ .attach_task = blkiocg_attach_task,
.destroy = blkiocg_destroy,
.populate = blkiocg_populate,
#ifdef CONFIG_BLK_CGROUP
@@ -1475,9 +1473,7 @@ done:
* of the main cic data structures. For now we allow a task to change
* its cgroup only if it's the only owner of its ioc.
*/
-static int blkiocg_can_attach(struct cgroup_subsys *subsys,
- struct cgroup *cgroup, struct task_struct *tsk,
- bool threadgroup)
+static int blkiocg_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
struct io_context *ioc;
int ret = 0;
@@ -1492,9 +1488,7 @@ static int blkiocg_can_attach(struct cgroup_subsys *subsys,
return ret;
}
-static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
- struct cgroup *prev, struct task_struct *tsk,
- bool threadgroup)
+static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
struct io_context *ioc;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ce104e3..35b69b4 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -467,12 +467,14 @@ struct cgroup_subsys {
int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct task_struct *tsk, bool threadgroup);
+ struct task_struct *tsk);
+ int (*can_attach_task)(struct cgroup *cgrp, struct task_struct *tsk);
void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct task_struct *tsk, bool threadgroup);
+ struct task_struct *tsk);
+ void (*pre_attach)(struct cgroup *cgrp);
+ void (*attach_task)(struct cgroup *cgrp, struct task_struct *tsk);
void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cgrp, struct task_struct *tsk,
- bool threadgroup);
+ struct cgroup *old_cgrp, struct task_struct *tsk);
void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
void (*exit)(struct cgroup_subsys *ss, struct task_struct *task);
int (*populate)(struct cgroup_subsys *ss,
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 66a416b..616f27a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1750,7 +1750,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
for_each_subsys(root, ss) {
if (ss->can_attach) {
- retval = ss->can_attach(ss, cgrp, tsk, false);
+ retval = ss->can_attach(ss, cgrp, tsk);
if (retval) {
/*
* Remember on which subsystem the can_attach()
@@ -1762,6 +1762,13 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
goto out;
}
}
+ if (ss->can_attach_task) {
+ retval = ss->can_attach_task(cgrp, tsk);
+ if (retval) {
+ failed_ss = ss;
+ goto out;
+ }
+ }
}
task_lock(tsk);
@@ -1798,8 +1805,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
write_unlock(&css_set_lock);
for_each_subsys(root, ss) {
+ if (ss->pre_attach)
+ ss->pre_attach(cgrp);
+ if (ss->attach_task)
+ ss->attach_task(cgrp, tsk);
if (ss->attach)
- ss->attach(ss, cgrp, oldcgrp, tsk, false);
+ ss->attach(ss, cgrp, oldcgrp, tsk);
}
set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
synchronize_rcu();
@@ -1822,7 +1833,7 @@ out:
*/
break;
if (ss->cancel_attach)
- ss->cancel_attach(ss, cgrp, tsk, false);
+ ss->cancel_attach(ss, cgrp, tsk);
}
}
return retval;
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e7bebb7..e6ee70c 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -160,7 +160,7 @@ static void freezer_destroy(struct cgroup_subsys *ss,
*/
static int freezer_can_attach(struct cgroup_subsys *ss,
struct cgroup *new_cgroup,
- struct task_struct *task, bool threadgroup)
+ struct task_struct *task)
{
struct freezer *freezer;
@@ -172,26 +172,18 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
if (freezer->state != CGROUP_THAWED)
return -EBUSY;
+ return 0;
+}
+
+static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+{
+ /* rcu_read_lock allows recursive locking */
rcu_read_lock();
- if (__cgroup_freezing_or_frozen(task)) {
+ if (__cgroup_freezing_or_frozen(tsk)) {
rcu_read_unlock();
return -EBUSY;
}
rcu_read_unlock();
-
- if (threadgroup) {
- struct task_struct *c;
-
- rcu_read_lock();
- list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
- if (__cgroup_freezing_or_frozen(c)) {
- rcu_read_unlock();
- return -EBUSY;
- }
- }
- rcu_read_unlock();
- }
-
return 0;
}
@@ -390,6 +382,9 @@ struct cgroup_subsys freezer_subsys = {
.populate = freezer_populate,
.subsys_id = freezer_subsys_id,
.can_attach = freezer_can_attach,
+ .can_attach_task = freezer_can_attach_task,
+ .pre_attach = NULL,
+ .attach_task = NULL,
.attach = NULL,
.fork = freezer_fork,
.exit = NULL,
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4349935..b9fce80 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1372,14 +1372,10 @@ static int fmeter_getrate(struct fmeter *fmp)
return val;
}
-/* Protected by cgroup_lock */
-static cpumask_var_t cpus_attach;
-
/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
- struct task_struct *tsk, bool threadgroup)
+ struct task_struct *tsk)
{
- int ret;
struct cpuset *cs = cgroup_cs(cont);
if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1396,29 +1392,42 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
if (tsk->flags & PF_THREAD_BOUND)
return -EINVAL;
- ret = security_task_setscheduler(tsk);
- if (ret)
- return ret;
- if (threadgroup) {
- struct task_struct *c;
-
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- ret = security_task_setscheduler(c);
- if (ret) {
- rcu_read_unlock();
- return ret;
- }
- }
- rcu_read_unlock();
- }
return 0;
}
-static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
- struct cpuset *cs)
+static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
+{
+ return security_task_setscheduler(task);
+}
+
+/*
+ * Protected by cgroup_lock. The nodemasks must be stored globally because
+ * dynamically allocating them is not allowed in pre_attach, and they must
+ * persist among pre_attach, attach_task, and attach.
+ */
+static cpumask_var_t cpus_attach;
+static nodemask_t cpuset_attach_nodemask_from;
+static nodemask_t cpuset_attach_nodemask_to;
+
+/* Do quick set-up work before attaching each task. */
+static void cpuset_pre_attach(struct cgroup *cont)
+{
+ struct cpuset *cs = cgroup_cs(cont);
+
+ if (cs == &top_cpuset)
+ cpumask_copy(cpus_attach, cpu_possible_mask);
+ else
+ guarantee_online_cpus(cs, cpus_attach);
+
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+}
+
+/* Per-thread attachment work. */
+static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
{
int err;
+ struct cpuset *cs = cgroup_cs(cont);
+
/*
* can_attach beforehand should guarantee that this doesn't fail.
* TODO: have a better way to handle failure here
@@ -1426,56 +1435,46 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
err = set_cpus_allowed_ptr(tsk, cpus_attach);
WARN_ON_ONCE(err);
- cpuset_change_task_nodemask(tsk, to);
cpuset_update_task_spread_flag(cs, tsk);
}
static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
- struct cgroup *oldcont, struct task_struct *tsk,
- bool threadgroup)
+ struct cgroup *oldcont, struct task_struct *tsk)
{
struct mm_struct *mm;
struct cpuset *cs = cgroup_cs(cont);
struct cpuset *oldcs = cgroup_cs(oldcont);
- NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
- NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
- if (from == NULL || to == NULL)
- goto alloc_fail;
-
- if (cs == &top_cpuset) {
- cpumask_copy(cpus_attach, cpu_possible_mask);
- } else {
- guarantee_online_cpus(cs, cpus_attach);
- }
- guarantee_online_mems(cs, to);
-
- /* do per-task migration stuff possibly for each in the threadgroup */
- cpuset_attach_task(tsk, to, cs);
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- cpuset_attach_task(c, to, cs);
- }
- rcu_read_unlock();
- }
+ /*
+ * TODO: As implemented, change_task_nodemask uses yield() to
+ * synchronize with other users of the mems_allowed, which is not
+ * allowed in the atomic attach_task callback, so we can't do this for
+ * each thread in the multithreaded case. This is a performance issue,
+ * but not a correctness one.
+ *
+ * As long as change_task_nodemask can yield, a multithreaded mempolicy
+ * user should attach to a cgroup by threadgroup first (for
+ * correctness) then poke each task to get its mempolicy right.
+ *
+ * This is the "third exception" in Documentation/cgroups/cpusets.txt.
+ */
+ cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
- /* change mm; only needs to be done once even if threadgroup */
- *from = oldcs->mems_allowed;
- *to = cs->mems_allowed;
+ /*
+ * Change mm, possibly for multiple threads in a threadgroup. This is
+ * expensive and may sleep.
+ */
+ cpuset_attach_nodemask_from = oldcs->mems_allowed;
+ cpuset_attach_nodemask_to = cs->mems_allowed;
mm = get_task_mm(tsk);
if (mm) {
- mpol_rebind_mm(mm, to);
+ mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
if (is_memory_migrate(cs))
- cpuset_migrate_mm(mm, from, to);
+ cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
+ &cpuset_attach_nodemask_to);
mmput(mm);
}
-
-alloc_fail:
- NODEMASK_FREE(from);
- NODEMASK_FREE(to);
}
/* The various types of files and directories in a cpuset file system */
@@ -1928,6 +1927,9 @@ struct cgroup_subsys cpuset_subsys = {
.create = cpuset_create,
.destroy = cpuset_destroy,
.can_attach = cpuset_can_attach,
+ .can_attach_task = cpuset_can_attach_task,
+ .pre_attach = cpuset_pre_attach,
+ .attach_task = cpuset_attach_task,
.attach = cpuset_attach,
.populate = cpuset_populate,
.post_clone = cpuset_post_clone,
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 2c98ad9..1fc2b1b 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -43,7 +43,7 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
* ancestor cgroup thereof)
*/
static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
- struct task_struct *task, bool threadgroup)
+ struct task_struct *task)
{
if (current != task) {
if (!capable(CAP_SYS_ADMIN))
@@ -53,21 +53,13 @@ static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
return -EPERM;
}
- if (!cgroup_is_descendant(new_cgroup, task))
- return -EPERM;
-
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
- if (!cgroup_is_descendant(new_cgroup, c)) {
- rcu_read_unlock();
- return -EPERM;
- }
- }
- rcu_read_unlock();
- }
+ return 0;
+}
+static int ns_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+{
+ if (!cgroup_is_descendant(cgrp, tsk))
+ return -EPERM;
return 0;
}
@@ -112,6 +104,7 @@ static void ns_destroy(struct cgroup_subsys *ss,
struct cgroup_subsys ns_subsys = {
.name = "ns",
.can_attach = ns_can_attach,
+ .can_attach_task = ns_can_attach_task,
.create = ns_create,
.destroy = ns_destroy,
.subsys_id = ns_subsys_id,
diff --git a/kernel/sched.c b/kernel/sched.c
index 218ef20..d619f1d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8655,42 +8655,10 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
return 0;
}
-static int
-cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct task_struct *tsk, bool threadgroup)
-{
- int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
- if (retval)
- return retval;
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- retval = cpu_cgroup_can_attach_task(cgrp, c);
- if (retval) {
- rcu_read_unlock();
- return retval;
- }
- }
- rcu_read_unlock();
- }
- return 0;
-}
-
static void
-cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cont, struct task_struct *tsk,
- bool threadgroup)
+cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
sched_move_task(tsk);
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- sched_move_task(c);
- }
- rcu_read_unlock();
- }
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8763,8 +8731,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.name = "cpu",
.create = cpu_cgroup_create,
.destroy = cpu_cgroup_destroy,
- .can_attach = cpu_cgroup_can_attach,
- .attach = cpu_cgroup_attach,
+ .can_attach_task = cpu_cgroup_can_attach_task,
+ .attach_task = cpu_cgroup_attach_task,
.populate = cpu_cgroup_populate,
.subsys_id = cpu_cgroup_subsys_id,
.early_init = 1,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 729beb7..995f0b9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4720,8 +4720,7 @@ static void mem_cgroup_clear_mc(void)
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
int ret = 0;
struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
@@ -4775,8 +4774,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
mem_cgroup_clear_mc();
}
@@ -4880,8 +4878,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *cont,
struct cgroup *old_cont,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
if (!mc.mm)
/* no need to move charge */
@@ -4893,22 +4890,19 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
#else /* !CONFIG_MMU */
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
return 0;
}
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
}
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *cont,
struct cgroup *old_cont,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
}
#endif
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 8d9c48f..cd1f779 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -62,8 +62,7 @@ static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
struct cgroup_subsys devices_subsys;
static int devcgroup_can_attach(struct cgroup_subsys *ss,
- struct cgroup *new_cgroup, struct task_struct *task,
- bool threadgroup)
+ struct cgroup *new_cgroup, struct task_struct *task)
{
if (current != task && !capable(CAP_SYS_ADMIN))
return -EPERM;
Makes procs file writable to move all threads by tgid at once
From: Ben Blum <[email protected]>
This patch adds functionality that enables users to move all threads in a
threadgroup at once to a cgroup by writing the tgid to the 'cgroup.procs'
file. The current implementation uses a per-threadgroup rwsem that is
taken for reading in the fork() path to prevent newly forking threads within
the threadgroup from "escaping" while the move is in progress.
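(Not part of the patch: from userspace the whole operation is a single write to
the procs file. A minimal sketch, where the cgroup mount path is only an
example, might look like:)

#include <stdio.h>
#include <sys/types.h>

/* Move every thread of the process with thread group id 'tgid' into the
 * cgroup mounted at 'cgroup_dir' by writing to its cgroup.procs file. */
static int move_process_to_cgroup(const char *cgroup_dir, pid_t tgid)
{
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path), "%s/cgroup.procs", cgroup_dir);
        f = fopen(path, "w");
        if (!f)
                return -1;
        /* one write moves all threads in the group atomically */
        if (fprintf(f, "%d\n", (int)tgid) < 0) {
                fclose(f);
                return -1;
        }
        return fclose(f);
}

/* e.g. move_process_to_cgroup("/dev/cgroup/foo", some_tgid); */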
Signed-off-by: Ben Blum <[email protected]>
---
Documentation/cgroups/cgroups.txt | 9 +
kernel/cgroup.c | 472 +++++++++++++++++++++++++++++++++----
2 files changed, 432 insertions(+), 49 deletions(-)
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 341ed44..9157e75 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -236,7 +236,8 @@ containing the following files describing that cgroup:
- cgroup.procs: list of tgids in the cgroup. This list is not
guaranteed to be sorted or free of duplicate tgids, and userspace
should sort/uniquify the list if this property is required.
- This is a read-only file, for now.
+ Writing a thread group id into this file moves all threads in that
+ group into this cgroup.
- notify_on_release flag: run the release agent on exit?
- release_agent: the path to use for release notifications (this file
exists in the top cgroup only)
@@ -426,6 +427,12 @@ You can attach the current shell task by echoing 0:
# echo 0 > tasks
+You can use the cgroup.procs file instead of the tasks file to move all
+threads in a threadgroup at once. Echoing the pid of any task in a
+threadgroup to cgroup.procs causes all tasks in that threadgroup to be
+attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
+in the writing task's threadgroup.
+
2.3 Mounting hierarchies by name
--------------------------------
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 616f27a..9361c44 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1726,6 +1726,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
}
EXPORT_SYMBOL_GPL(cgroup_path);
+/*
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ *
+ * 'guarantee' is set if the caller promises that a new css_set for the task
+ * will already exist. If not set, this function might sleep, and can fail with
+ * -ENOMEM. Otherwise, it can only fail with -ESRCH.
+ */
+static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+ struct task_struct *tsk, bool guarantee)
+{
+ struct css_set *oldcg;
+ struct css_set *newcg;
+
+ /*
+ * get old css_set. we need to take task_lock and refcount it, because
+ * an exiting task can change its css_set to init_css_set and drop its
+ * old one without taking cgroup_mutex.
+ */
+ task_lock(tsk);
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
+ task_unlock(tsk);
+
+ /* locate or allocate a new css_set for this task. */
+ if (guarantee) {
+ /* we know the css_set we want already exists. */
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(oldcg, cgrp, template);
+ BUG_ON(!newcg);
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+ } else {
+ might_sleep();
+ /* find_css_set will give us newcg already referenced. */
+ newcg = find_css_set(oldcg, cgrp);
+ if (!newcg) {
+ put_css_set(oldcg);
+ return -ENOMEM;
+ }
+ }
+ put_css_set(oldcg);
+
+ /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ task_unlock(tsk);
+ put_css_set(newcg);
+ return -ESRCH;
+ }
+ rcu_assign_pointer(tsk->cgroups, newcg);
+ task_unlock(tsk);
+
+ /* Update the css_set linked lists if we're using them */
+ write_lock(&css_set_lock);
+ if (!list_empty(&tsk->cg_list))
+ list_move(&tsk->cg_list, &newcg->tasks);
+ write_unlock(&css_set_lock);
+
+ /*
+ * We just gained a reference on oldcg by taking it from the task. As
+ * trading it for newcg is protected by cgroup_mutex, we're safe to drop
+ * it here; it will be freed under RCU.
+ */
+ put_css_set(oldcg);
+
+ set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+ return 0;
+}
+
/**
* cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
* @cgrp: the cgroup the task is attaching to
@@ -1736,11 +1806,9 @@ EXPORT_SYMBOL_GPL(cgroup_path);
*/
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
- int retval = 0;
+ int retval;
struct cgroup_subsys *ss, *failed_ss = NULL;
struct cgroup *oldcgrp;
- struct css_set *cg;
- struct css_set *newcg;
struct cgroupfs_root *root = cgrp->root;
/* Nothing to do if the task is already in that cgroup */
@@ -1771,38 +1839,9 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
}
}
- task_lock(tsk);
- cg = tsk->cgroups;
- get_css_set(cg);
- task_unlock(tsk);
- /*
- * Locate or allocate a new css_set for this task,
- * based on its final set of cgroups
- */
- newcg = find_css_set(cg, cgrp);
- put_css_set(cg);
- if (!newcg) {
- retval = -ENOMEM;
- goto out;
- }
-
- task_lock(tsk);
- if (tsk->flags & PF_EXITING) {
- task_unlock(tsk);
- put_css_set(newcg);
- retval = -ESRCH;
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
+ if (retval)
goto out;
- }
- rcu_assign_pointer(tsk->cgroups, newcg);
- task_unlock(tsk);
-
- /* Update the css_set linked lists if we're using them */
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list)) {
- list_del(&tsk->cg_list);
- list_add(&tsk->cg_list, &newcg->tasks);
- }
- write_unlock(&css_set_lock);
for_each_subsys(root, ss) {
if (ss->pre_attach)
@@ -1812,9 +1851,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
if (ss->attach)
ss->attach(ss, cgrp, oldcgrp, tsk);
}
- set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+
synchronize_rcu();
- put_css_set(cg);
/*
* wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -1864,49 +1902,387 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
/*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * held. May take task_lock of task
+ * cgroup_attach_proc works in two stages, the first of which prefetches all
+ * new css_sets needed (to make sure we have enough memory before committing
+ * to the move) and stores them in a list of entries of the following type.
+ * TODO: possible optimization: use css_set->rcu_head for chaining instead
+ */
+struct cg_list_entry {
+ struct css_set *cg;
+ struct list_head links;
+};
+
+static bool css_set_check_fetched(struct cgroup *cgrp,
+ struct task_struct *tsk, struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(cg, cgrp, template);
+ if (newcg)
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+
+ /* doesn't exist at all? */
+ if (!newcg)
+ return false;
+ /* see if it's already in the list */
+ list_for_each_entry(cg_entry, newcg_list, links) {
+ if (cg_entry->cg == newcg) {
+ put_css_set(newcg);
+ return true;
+ }
+ }
+
+ /* not found */
+ put_css_set(newcg);
+ return false;
+}
+
+/*
+ * Find the new css_set and store it in the list in preparation for moving the
+ * given task to the given cgroup. Returns 0 or -ENOMEM.
+ */
+static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+
+ /* ensure a new css_set will exist for this thread */
+ newcg = find_css_set(cg, cgrp);
+ if (!newcg)
+ return -ENOMEM;
+ /* add it to the list */
+ cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
+ if (!cg_entry) {
+ put_css_set(newcg);
+ return -ENOMEM;
+ }
+ cg_entry->cg = newcg;
+ list_add(&cg_entry->links, newcg_list);
+ return 0;
+}
+
+/**
+ * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * @cgrp: the cgroup to attach to
+ * @leader: the threadgroup leader task_struct of the group to be attached
+ *
+ * Call holding cgroup_mutex. Will take task_lock of each thread in leader's
+ * threadgroup individually in turn.
+ */
+int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+{
+ int retval;
+ struct cgroup_subsys *ss, *failed_ss = NULL;
+ struct cgroup *oldcgrp;
+ struct css_set *oldcg;
+ struct cgroupfs_root *root = cgrp->root;
+ /* threadgroup list cursor */
+ struct task_struct *tsk;
+ /*
+ * we need to make sure we have css_sets for all the tasks we're
+ * going to move -before- we actually start moving them, so that in
+ * case we get an ENOMEM we can bail out before making any changes.
+ */
+ struct list_head newcg_list;
+ struct cg_list_entry *cg_entry, *temp_nobe;
+
+ /* check that we can legitimately attach to the cgroup. */
+ for_each_subsys(root, ss) {
+ if (ss->can_attach) {
+ retval = ss->can_attach(ss, cgrp, leader);
+ if (retval) {
+ failed_ss = ss;
+ goto out;
+ }
+ }
+ /* a callback to be run on every thread in the threadgroup. */
+ if (ss->can_attach_task) {
+ /* run callback on the leader first. */
+ retval = ss->can_attach_task(cgrp, leader);
+ if (retval) {
+ failed_ss = ss;
+ goto out;
+ }
+
+ /* run on each task in the threadgroup. */
+ rcu_read_lock();
+ /* sanity check - racing de_thread may cause this. */
+ if (!thread_group_leader(leader)) {
+ rcu_read_unlock();
+ retval = -EAGAIN;
+ failed_ss = ss;
+ goto out;
+ }
+ list_for_each_entry_rcu(tsk, &leader->thread_group,
+ thread_group) {
+ retval = ss->can_attach_task(cgrp, tsk);
+ if (retval) {
+ rcu_read_unlock();
+ failed_ss = ss;
+ goto out;
+ }
+ }
+ rcu_read_unlock();
+ }
+ }
+
+ /*
+ * step 1: make sure css_sets exist for all threads to be migrated.
+ * we use find_css_set, which allocates a new one if necessary.
+ */
+ INIT_LIST_HEAD(&newcg_list);
+ oldcgrp = task_cgroup_from_root(leader, root);
+ if (cgrp != oldcgrp) {
+ /* get old css_set */
+ task_lock(leader);
+ if (leader->flags & PF_EXITING) {
+ task_unlock(leader);
+ goto prefetch_loop;
+ }
+ oldcg = leader->cgroups;
+ get_css_set(oldcg);
+ task_unlock(leader);
+ /* acquire new one */
+ retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+ put_css_set(oldcg);
+ if (retval)
+ goto list_teardown;
+ }
+prefetch_loop:
+ rcu_read_lock();
+ /* sanity check - if we raced with de_thread, we must abort */
+ if (!thread_group_leader(leader)) {
+ rcu_read_unlock();
+ retval = -EAGAIN;
+ goto list_teardown;
+ }
+ /*
+ * if we need to fetch a new css_set for this task, we must exit the
+ * rcu_read section because allocating it can sleep. afterwards, we'll
+ * need to restart iteration on the threadgroup list - the whole thing
+ * will be O(nm) in the number of threads and css_sets; as the typical
+ * case has only one css_set for all of them, usually O(n). which ones
+ * we need allocated won't change as long as we hold cgroup_mutex.
+ */
+ list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
+ /* nothing to do if this task is already in the cgroup */
+ oldcgrp = task_cgroup_from_root(tsk, root);
+ if (cgrp == oldcgrp)
+ continue;
+ /* get old css_set pointer */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ /* ignore this task if it's going away */
+ task_unlock(tsk);
+ continue;
+ }
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
+ task_unlock(tsk);
+ /* see if the new one for us is already in the list? */
+ if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
+ /* was already there, nothing to do. */
+ put_css_set(oldcg);
+ } else {
+ /* we don't already have it. get new one. */
+ rcu_read_unlock();
+ retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+ put_css_set(oldcg);
+ if (retval)
+ goto list_teardown;
+ /* begin iteration again. */
+ goto prefetch_loop;
+ }
+ }
+ rcu_read_unlock();
+
+ /*
+ * step 2: now that we're guaranteed success wrt the css_sets, proceed
+ * to move all tasks to the new cgroup. we need to lock against possible
+ * races with fork(). note: we can safely take the threadgroup_fork_lock
+ * of leader since attach_task_by_pid took a reference.
+ * threadgroup_fork_lock must be taken outside of tasklist_lock to match
+ * the order in the fork path.
+ */
+ threadgroup_fork_write_lock(leader);
+ read_lock(&tasklist_lock);
+ /* sanity check - if we raced with de_thread, we must abort */
+ if (!thread_group_leader(leader)) {
+ retval = -EAGAIN;
+ read_unlock(&tasklist_lock);
+ threadgroup_fork_write_unlock(leader);
+ goto list_teardown;
+ }
+ /*
+ * No failure cases left, so this is the commit point.
+ *
+ * Start by calling pre_attach for each subsystem.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->pre_attach)
+ ss->pre_attach(cgrp);
+ }
+ /*
+ * Move each thread, calling ss->attach_task for each one along the way.
+ *
+ * If the leader is already there, skip moving him. Note: even if the
+ * leader is PF_EXITING, we still move all other threads; if everybody
+ * is PF_EXITING, we end up doing nothing, which is ok.
+ */
+ oldcgrp = task_cgroup_from_root(leader, root);
+ if (cgrp != oldcgrp) {
+ /* attach the leader */
+ for_each_subsys(root, ss) {
+ if (ss->attach_task)
+ ss->attach_task(cgrp, leader);
+ }
+ retval = cgroup_task_migrate(cgrp, oldcgrp, leader, true);
+ BUG_ON(retval != 0 && retval != -ESRCH);
+ }
+ /* Now iterate over each thread in the group. */
+ list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
+ /* leave current thread as it is if it's already there */
+ oldcgrp = task_cgroup_from_root(tsk, root);
+ if (cgrp == oldcgrp)
+ continue;
+ /* attach each task to each subsystem */
+ for_each_subsys(root, ss) {
+ if (ss->attach_task)
+ ss->attach_task(cgrp, tsk);
+ }
+ /* we don't care whether these threads are exiting */
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
+ BUG_ON(retval != 0 && retval != -ESRCH);
+ }
+ /* nothing is sensitive to fork() or exec() after this point. */
+ read_unlock(&tasklist_lock);
+ threadgroup_fork_write_unlock(leader);
+
+ /*
+ * step 3: do expensive, non-thread-specific subsystem callbacks.
+ * TODO: if ever a subsystem needs to know the oldcgrp for each task
+ * being moved, this call will need to be reworked to communicate that.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->attach)
+ ss->attach(ss, cgrp, oldcgrp, leader);
+ }
+
+ /*
+ * step 4: success! and cleanup
+ */
+ synchronize_rcu();
+ cgroup_wakeup_rmdir_waiter(cgrp);
+ retval = 0;
+list_teardown:
+ /* clean up the list of prefetched css_sets. */
+ list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
+ list_del(&cg_entry->links);
+ put_css_set(cg_entry->cg);
+ kfree(cg_entry);
+ }
+out:
+ if (retval) {
+ /* same deal as in cgroup_attach_task */
+ for_each_subsys(root, ss) {
+ if (ss == failed_ss)
+ break;
+ if (ss->cancel_attach)
+ ss->cancel_attach(ss, cgrp, leader);
+ }
+ }
+ return retval;
+}
+
+/*
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will take
+ * cgroup_mutex; may take task_lock of task.
*/
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
{
struct task_struct *tsk;
const struct cred *cred = current_cred(), *tcred;
int ret;
+ if (!cgroup_lock_live_group(cgrp))
+ return -ENODEV;
+
if (pid) {
rcu_read_lock();
tsk = find_task_by_vpid(pid);
- if (!tsk || tsk->flags & PF_EXITING) {
+ if (!tsk) {
rcu_read_unlock();
+ cgroup_unlock();
+ return -ESRCH;
+ }
+ if (threadgroup) {
+ /*
+ * it is safe to find group_leader because tsk was found
+ * in the tid map, meaning it can't have been unhashed
+ * by someone in de_thread changing the leadership.
+ */
+ tsk = tsk->group_leader;
+ BUG_ON(!thread_group_leader(tsk));
+ } else if (tsk->flags & PF_EXITING) {
+ /* optimization for the single-task-only case */
+ rcu_read_unlock();
+ cgroup_unlock();
return -ESRCH;
}
+ /*
+ * even if we're attaching all tasks in the thread group, we
+ * only need to check permissions on one of them.
+ */
tcred = __task_cred(tsk);
if (cred->euid &&
cred->euid != tcred->uid &&
cred->euid != tcred->suid) {
rcu_read_unlock();
+ cgroup_unlock();
return -EACCES;
}
get_task_struct(tsk);
rcu_read_unlock();
} else {
- tsk = current;
+ if (threadgroup)
+ tsk = current->group_leader;
+ else
+ tsk = current;
get_task_struct(tsk);
}
- ret = cgroup_attach_task(cgrp, tsk);
+ if (threadgroup)
+ ret = cgroup_attach_proc(cgrp, tsk);
+ else
+ ret = cgroup_attach_task(cgrp, tsk);
put_task_struct(tsk);
+ cgroup_unlock();
return ret;
}
static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
{
+ return attach_task_by_pid(cgrp, pid, false);
+}
+
+static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+{
int ret;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
- ret = attach_task_by_pid(cgrp, pid);
- cgroup_unlock();
+ do {
+ /*
+ * attach_proc fails with -EAGAIN if threadgroup leadership
+ * changes in the middle of the operation, in which case we need
+ * to find the task_struct for the new leader and start over.
+ */
+ ret = attach_task_by_pid(cgrp, tgid, true);
+ } while (ret == -EAGAIN);
return ret;
}
@@ -3260,9 +3636,9 @@ static struct cftype files[] = {
{
.name = CGROUP_FILE_GENERIC_PREFIX "procs",
.open = cgroup_procs_open,
- /* .write_u64 = cgroup_procs_write, TODO */
+ .write_u64 = cgroup_procs_write,
.release = cgroup_pidlist_release,
- .mode = S_IRUGO,
+ .mode = S_IRUGO | S_IWUSR,
},
{
.name = "notify_on_release",
On Mon, Jan 24, 2011 at 01:05:29PM -0800, Andrew Morton wrote:
> On Sun, 26 Dec 2010 07:09:51 -0500
> Ben Blum <[email protected]> wrote:
>
> > Adds functionality to read/write lock CLONE_THREAD fork()ing per-threadgroup
> >
> > From: Ben Blum <[email protected]>
> >
> > This patch adds an rwsem that lives in a threadgroup's signal_struct that's
> > taken for reading in the fork path, under CONFIG_CGROUPS. If another part of
> > the kernel later wants to use such a locking mechanism, the CONFIG_CGROUPS
> > ifdefs should be changed to a higher-up flag that CGROUPS and the other system
> > would both depend on.
> >
> > This is a pre-patch for cgroup-procs-write.patch.
> >
> > ...
> >
> > +/* See the declaration of threadgroup_fork_lock in signal_struct. */
> > +#ifdef CONFIG_CGROUPS
> > +static inline void threadgroup_fork_read_lock(struct task_struct *tsk)
> > +{
> > + down_read(&tsk->signal->threadgroup_fork_lock);
> > +}
> > +static inline void threadgroup_fork_read_unlock(struct task_struct *tsk)
> > +{
> > + up_read(&tsk->signal->threadgroup_fork_lock);
> > +}
> > +static inline void threadgroup_fork_write_lock(struct task_struct *tsk)
> > +{
> > + down_write(&tsk->signal->threadgroup_fork_lock);
> > +}
> > +static inline void threadgroup_fork_write_unlock(struct task_struct *tsk)
> > +{
> > + up_write(&tsk->signal->threadgroup_fork_lock);
> > +}
> > +#else
>
> Risky. sched.h doesn't include rwsem.h.
>
> We could make it do so, but almost every compilation unit in the kernel
> includes sched.h. It would be nicer to make the kernel build
> finer-grained, rather than blunter-grained. Don't be afraid to add new
> header files if that is one way of doing this!
Hmm, good point. But there's also:
+#ifdef CONFIG_CGROUPS
+ struct rw_semaphore threadgroup_fork_lock;
+#endif
in the signal_struct, also in sched.h, which needs to be there. Or I
could change it to a struct pointer with a forward incomplete
declaration above, and kmalloc/kfree it? I don't like adding more
alloc/free calls but don't know if it's more or less important than
header granularity.
-- Ben
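(For reference, not proposed code: the pointer-indirection alternative
mentioned above might look roughly like the sketch below. sched.h would carry
only an incomplete 'struct threadgroup_fork_lock;' plus a pointer member in
signal_struct, while the definition and the extra alloc/free would live in a
.c file. The struct and helper names here are invented.)

#include <linux/rwsem.h>
#include <linux/slab.h>

struct threadgroup_fork_lock {
        struct rw_semaphore sem;
};

/* would be called from copy_signal() when setting up a new signal_struct */
static struct threadgroup_fork_lock *threadgroup_fork_lock_alloc(void)
{
        struct threadgroup_fork_lock *tfl;

        tfl = kmalloc(sizeof(*tfl), GFP_KERNEL);
        if (tfl)
                init_rwsem(&tfl->sem);
        return tfl;
}

/* would be called when the signal_struct is freed */
static void threadgroup_fork_lock_free(struct threadgroup_fork_lock *tfl)
{
        kfree(tfl);
}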
On Fri, 4 Feb 2011 16:25:15 -0500
Ben Blum <[email protected]> wrote:
> On Mon, Jan 24, 2011 at 01:05:29PM -0800, Andrew Morton wrote:
> > On Sun, 26 Dec 2010 07:09:51 -0500
> > Ben Blum <[email protected]> wrote:
> >
> > > Adds functionality to read/write lock CLONE_THREAD fork()ing per-threadgroup
> > >
> > > From: Ben Blum <[email protected]>
> > >
> > > This patch adds an rwsem that lives in a threadgroup's signal_struct that's
> > > taken for reading in the fork path, under CONFIG_CGROUPS. If another part of
> > > the kernel later wants to use such a locking mechanism, the CONFIG_CGROUPS
> > > ifdefs should be changed to a higher-up flag that CGROUPS and the other system
> > > would both depend on.
> > >
> > > This is a pre-patch for cgroup-procs-write.patch.
> > >
> > > ...
> > >
> > > +/* See the declaration of threadgroup_fork_lock in signal_struct. */
> > > +#ifdef CONFIG_CGROUPS
> > > +static inline void threadgroup_fork_read_lock(struct task_struct *tsk)
> > > +{
> > > + down_read(&tsk->signal->threadgroup_fork_lock);
> > > +}
> > > +static inline void threadgroup_fork_read_unlock(struct task_struct *tsk)
> > > +{
> > > + up_read(&tsk->signal->threadgroup_fork_lock);
> > > +}
> > > +static inline void threadgroup_fork_write_lock(struct task_struct *tsk)
> > > +{
> > > + down_write(&tsk->signal->threadgroup_fork_lock);
> > > +}
> > > +static inline void threadgroup_fork_write_unlock(struct task_struct *tsk)
> > > +{
> > > + up_write(&tsk->signal->threadgroup_fork_lock);
> > > +}
> > > +#else
> >
> > Risky. sched.h doesn't include rwsem.h.
> >
> > We could make it do so, but almost every compilation unit in the kernel
> > includes sched.h. It would be nicer to make the kernel build
> > finer-grained, rather than blunter-grained. Don't be afraid to add new
> > header files if that is one way of doing this!
>
> Hmm, good point. But there's also:
>
> +#ifdef CONFIG_CGROUPS
> + struct rw_semaphore threadgroup_fork_lock;
> +#endif
>
> in the signal_struct, also in sched.h, which needs to be there. Or I
> could change it to a struct pointer with a forward incomplete
> declaration above, and kmalloc/kfree it? I don't like adding more
> alloc/free calls but don't know if it's more or less important than
> header granularity.
What about adding a new header file which includes rwsem.h and sched.h
and then defines the new interfaces?
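(Concretely, such a header might look like the sketch below; the file name
include/linux/threadgroup_fork.h is hypothetical, and the helper bodies are
just the ones from the patch moved out of sched.h. As the follow-up below
notes, signal_struct itself still embeds a struct rw_semaphore, so this alone
doesn't remove sched.h's need for the rwsem definition.)

#ifndef _LINUX_THREADGROUP_FORK_H
#define _LINUX_THREADGROUP_FORK_H

#include <linux/rwsem.h>
#include <linux/sched.h>

#ifdef CONFIG_CGROUPS
static inline void threadgroup_fork_read_lock(struct task_struct *tsk)
{
        down_read(&tsk->signal->threadgroup_fork_lock);
}
static inline void threadgroup_fork_read_unlock(struct task_struct *tsk)
{
        up_read(&tsk->signal->threadgroup_fork_lock);
}
static inline void threadgroup_fork_write_lock(struct task_struct *tsk)
{
        down_write(&tsk->signal->threadgroup_fork_lock);
}
static inline void threadgroup_fork_write_unlock(struct task_struct *tsk)
{
        up_write(&tsk->signal->threadgroup_fork_lock);
}
#else
static inline void threadgroup_fork_read_lock(struct task_struct *tsk) {}
static inline void threadgroup_fork_read_unlock(struct task_struct *tsk) {}
static inline void threadgroup_fork_write_lock(struct task_struct *tsk) {}
static inline void threadgroup_fork_write_unlock(struct task_struct *tsk) {}
#endif /* CONFIG_CGROUPS */

#endif /* _LINUX_THREADGROUP_FORK_H */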
On Fri, Feb 04, 2011 at 01:36:57PM -0800, Andrew Morton wrote:
> On Fri, 4 Feb 2011 16:25:15 -0500
> Ben Blum <[email protected]> wrote:
>
> > On Mon, Jan 24, 2011 at 01:05:29PM -0800, Andrew Morton wrote:
> > > On Sun, 26 Dec 2010 07:09:51 -0500
> > > Ben Blum <[email protected]> wrote:
> > >
> > > > Adds functionality to read/write lock CLONE_THREAD fork()ing per-threadgroup
> > > >
> > > > From: Ben Blum <[email protected]>
> > > >
> > > > This patch adds an rwsem that lives in a threadgroup's signal_struct that's
> > > > taken for reading in the fork path, under CONFIG_CGROUPS. If another part of
> > > > the kernel later wants to use such a locking mechanism, the CONFIG_CGROUPS
> > > > ifdefs should be changed to a higher-up flag that CGROUPS and the other system
> > > > would both depend on.
> > > >
> > > > This is a pre-patch for cgroup-procs-write.patch.
> > > >
> > > > ...
> > > >
> > > > +/* See the declaration of threadgroup_fork_lock in signal_struct. */
> > > > +#ifdef CONFIG_CGROUPS
> > > > +static inline void threadgroup_fork_read_lock(struct task_struct *tsk)
> > > > +{
> > > > + down_read(&tsk->signal->threadgroup_fork_lock);
> > > > +}
> > > > +static inline void threadgroup_fork_read_unlock(struct task_struct *tsk)
> > > > +{
> > > > + up_read(&tsk->signal->threadgroup_fork_lock);
> > > > +}
> > > > +static inline void threadgroup_fork_write_lock(struct task_struct *tsk)
> > > > +{
> > > > + down_write(&tsk->signal->threadgroup_fork_lock);
> > > > +}
> > > > +static inline void threadgroup_fork_write_unlock(struct task_struct *tsk)
> > > > +{
> > > > + up_write(&tsk->signal->threadgroup_fork_lock);
> > > > +}
> > > > +#else
> > >
> > > Risky. sched.h doesn't include rwsem.h.
> > >
> > > We could make it do so, but almost every compilation unit in the kernel
> > > includes sched.h. It would be nicer to make the kernel build
> > > finer-grained, rather than blunter-grained. Don't be afraid to add new
> > > header files if that is one way of doing this!
> >
> > Hmm, good point. But there's also:
> >
> > +#ifdef CONFIG_CGROUPS
> > + struct rw_semaphore threadgroup_fork_lock;
> > +#endif
> >
> > in the signal_struct, also in sched.h, which needs to be there. Or I
> > could change it to a struct pointer with a forward incomplete
> > declaration above, and kmalloc/kfree it? I don't like adding more
> > alloc/free calls but don't know if it's more or less important than
> > header granularity.
>
> What about adding a new header file which includes rwsem.h and sched.h
> and then defines the new interfaces?
Er, I mean the definition of signal_struct needs rwsem.h as well, not
just the threadgroup_fork_* functions. (And I suspect moving
signal_struct somewhere else would give bigger problems...)
On Sun, Dec 26, 2010 at 07:09:19AM -0500, Ben Blum wrote:
> On Fri, Dec 24, 2010 at 03:22:26AM -0500, Ben Blum wrote:
> > On Wed, Aug 11, 2010 at 01:46:04AM -0400, Ben Blum wrote:
> > > On Fri, Jul 30, 2010 at 07:56:49PM -0400, Ben Blum wrote:
> > > > This patch series is a revision of http://lkml.org/lkml/2010/6/25/11 .
> > > >
> > > > This patch series implements a write function for the 'cgroup.procs'
> > > > per-cgroup file, which enables atomic movement of multithreaded
> > > > applications between cgroups. Writing the thread-ID of any thread in a
> > > > threadgroup to a cgroup's procs file causes all threads in the group to
> > > > be moved to that cgroup safely with respect to threads forking/exiting.
> > > > (Possible usage scenario: If running a multithreaded build system that
> > > > sucks up system resources, this lets you restrict it all at once into a
> > > > new cgroup to keep it under control.)
> > > >
> > > > Example: Suppose pid 31337 clones new threads 31338 and 31339.
> > > >
> > > > # cat /dev/cgroup/tasks
> > > > ...
> > > > 31337
> > > > 31338
> > > > 31339
> > > > # mkdir /dev/cgroup/foo
> > > > # echo 31337 > /dev/cgroup/foo/cgroup.procs
> > > > # cat /dev/cgroup/foo/tasks
> > > > 31337
> > > > 31338
> > > > 31339
> > > >
> > > > A new lock, called threadgroup_fork_lock and living in signal_struct, is
> > > > introduced to ensure atomicity when moving threads between cgroups. It's
> > > > taken for writing during the operation, and taking for reading in fork()
> > > > around the calls to cgroup_fork() and cgroup_post_fork().
>
> Well this time everything here is actually safe and correct, as far as
> my best efforts and keen eyes can tell. I dropped the per_thread call
> from the last series in favour of revising the subsystem callback
> interface. It now looks like this:
>
> ss->can_attach()
> - Thread-independent, possibly expensive/sleeping.
>
> ss->can_attach_task()
> - Called per-thread, run with rcu_read so must not sleep.
>
> ss->pre_attach()
> - Thread independent, must be atomic, happens before attach_task.
>
> ss->attach_task()
> - Called per-thread, run with tasklist_lock so must not sleep.
>
> ss->attach()
> - Thread independent, possibly expensive/sleeping, called last.
Okay, so.
I've revamped the cgroup_attach_proc implementation a bunch and this
version should be a lot easier on the eyes (and brains). Issues that are
addressed:
1) cgroup_attach_proc now iterates over leader->thread_group once, at
the very beginning, and puts each task_struct that we want to move
into an array, using get_task_struct to make sure they stick around (a
rough sketch of this step appears after this list).
- threadgroup_fork_lock ensures no threads not in the array can
appear, and allows us to use signal->nr_threads to determine the
size of the array when kmallocing it.
- This simplifies the rest of the function a bunch, since now we
never need to do rcu_read_lock after building the array. All the
subsystem callbacks are the same as described just above, but the
"can't sleep" restriction is gone, so it's nice and clean.
- Checking for a race with de_thread (the manoeuvre I refer to as
"double-double-toil-and-trouble-check locking") now needs to be
done only once, at the beginning (before building the array).
2) The nodemask allocation problem in cpuset is fixed the same way as
before - the masks are shared between the three attach callbacks, so
are made as static global variables.
3) The introduction of threadgroup_fork_lock in sched.h (specifically,
in signal_struct) requires rwsem.h; the new include appears in the
first patch. (An alternate plan would be to make it a struct pointer
with an incomplete forward declaration and kmalloc/kfree it during
housekeeping, but adding an include seems better than that particular
complication.) In light of this, the definitions for
threadgroup_fork_{read,write}_{un,}lock are also in sched.h.
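(A rough sketch of the collection step in point 1, not the actual patch code:
the helper name is made up, and the caller is assumed to hold
threadgroup_fork_lock for writing so signal->nr_threads can't grow underneath
it.)

#include <linux/sched.h>
#include <linux/rculist.h>
#include <linux/slab.h>

static struct task_struct **collect_threadgroup(struct task_struct *leader,
                                                int *group_size)
{
        struct task_struct **group, *tsk;
        int i = 0;

        /* fork lock held for writing: the group can only shrink from here */
        *group_size = leader->signal->nr_threads;
        group = kmalloc(*group_size * sizeof(*group), GFP_KERNEL);
        if (!group)
                return NULL;

        rcu_read_lock();
        get_task_struct(leader);
        group[i++] = leader;
        list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
                if (i >= *group_size)
                        break;
                get_task_struct(tsk);   /* keep it around after rcu unlock */
                group[i++] = tsk;
        }
        rcu_read_unlock();

        *group_size = i;
        return group;
}

The caller would later drop each reference with put_task_struct() and kfree()
the array once every thread has been migrated.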
-- Ben
---
Documentation/cgroups/cgroups.txt | 39 ++-
block/blk-cgroup.c | 18 -
include/linux/cgroup.h | 10
include/linux/init_task.h | 9
include/linux/sched.h | 37 +++
kernel/cgroup.c | 454 +++++++++++++++++++++++++++++++++-----
kernel/cgroup_freezer.c | 26 --
kernel/cpuset.c | 105 +++-----
kernel/fork.c | 10
kernel/ns_cgroup.c | 23 -
kernel/sched.c | 38 ---
mm/memcontrol.c | 18 -
security/device_cgroup.c | 3
13 files changed, 575 insertions(+), 215 deletions(-)
Adds functionality to read/write lock CLONE_THREAD fork()ing per-threadgroup
From: Ben Blum <[email protected]>
This patch adds an rwsem that lives in a threadgroup's signal_struct that's
taken for reading in the fork path, under CONFIG_CGROUPS. If another part of
the kernel later wants to use such a locking mechanism, the CONFIG_CGROUPS
ifdefs should be changed to a higher-up flag that CGROUPS and the other system
would both depend on.
This is a pre-patch for cgroup-procs-write.patch.
Signed-off-by: Ben Blum <[email protected]>
---
include/linux/init_task.h | 9 +++++++++
include/linux/sched.h | 37 +++++++++++++++++++++++++++++++++++++
kernel/fork.c | 10 ++++++++++
3 files changed, 56 insertions(+), 0 deletions(-)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 6b281fa..b560381 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -15,6 +15,14 @@
extern struct files_struct init_files;
extern struct fs_struct init_fs;
+#ifdef CONFIG_CGROUPS
+#define INIT_THREADGROUP_FORK_LOCK(sig) \
+ .threadgroup_fork_lock = \
+ __RWSEM_INITIALIZER(sig.threadgroup_fork_lock),
+#else
+#define INIT_THREADGROUP_FORK_LOCK(sig)
+#endif
+
#define INIT_SIGNALS(sig) { \
.nr_threads = 1, \
.wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\
@@ -31,6 +39,7 @@ extern struct fs_struct init_fs;
}, \
.cred_guard_mutex = \
__MUTEX_INITIALIZER(sig.cred_guard_mutex), \
+ INIT_THREADGROUP_FORK_LOCK(sig) \
}
extern struct nsproxy init_nsproxy;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8580dc6..2fdbeb1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -509,6 +509,8 @@ struct thread_group_cputimer {
spinlock_t lock;
};
+#include <linux/rwsem.h>
+
/*
* NOTE! "signal_struct" does not have it's own
* locking, because a shared signal_struct always
@@ -623,6 +625,16 @@ struct signal_struct {
unsigned audit_tty;
struct tty_audit_buf *tty_audit_buf;
#endif
+#ifdef CONFIG_CGROUPS
+ /*
+ * The threadgroup_fork_lock prevents threads from forking with
+ * CLONE_THREAD while held for writing. Use this for fork-sensitive
+ * threadgroup-wide operations. It's taken for reading in fork.c in
+ * copy_process().
+ * Currently only needed write-side by cgroups.
+ */
+ struct rw_semaphore threadgroup_fork_lock;
+#endif
int oom_adj; /* OOM kill score adjustment (bit shift) */
int oom_score_adj; /* OOM kill score adjustment */
@@ -2270,6 +2282,31 @@ static inline void unlock_task_sighand(struct task_struct *tsk,
spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
}
+/* See the declaration of threadgroup_fork_lock in signal_struct. */
+#ifdef CONFIG_CGROUPS
+static inline void threadgroup_fork_read_lock(struct task_struct *tsk)
+{
+ down_read(&tsk->signal->threadgroup_fork_lock);
+}
+static inline void threadgroup_fork_read_unlock(struct task_struct *tsk)
+{
+ up_read(&tsk->signal->threadgroup_fork_lock);
+}
+static inline void threadgroup_fork_write_lock(struct task_struct *tsk)
+{
+ down_write(&tsk->signal->threadgroup_fork_lock);
+}
+static inline void threadgroup_fork_write_unlock(struct task_struct *tsk)
+{
+ up_write(&tsk->signal->threadgroup_fork_lock);
+}
+#else
+static inline void threadgroup_fork_read_lock(struct task_struct *tsk) {}
+static inline void threadgroup_fork_read_unlock(struct task_struct *tsk) {}
+static inline void threadgroup_fork_write_lock(struct task_struct *tsk) {}
+static inline void threadgroup_fork_write_unlock(struct task_struct *tsk) {}
+#endif
+
#ifndef __HAVE_THREAD_FUNCTIONS
#define task_thread_info(task) ((struct thread_info *)(task)->stack)
diff --git a/kernel/fork.c b/kernel/fork.c
index 0979527..aefe61f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -905,6 +905,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
tty_audit_fork(sig);
+#ifdef CONFIG_CGROUPS
+ init_rwsem(&sig->threadgroup_fork_lock);
+#endif
+
sig->oom_adj = current->signal->oom_adj;
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -1087,6 +1091,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
monotonic_to_bootbased(&p->real_start_time);
p->io_context = NULL;
p->audit_context = NULL;
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_fork_read_lock(current);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
@@ -1294,6 +1300,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
cgroup_post_fork(p);
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_fork_read_unlock(current);
perf_event_fork(p);
return p;
@@ -1332,6 +1340,8 @@ bad_fork_cleanup_policy:
mpol_put(p->mempolicy);
bad_fork_cleanup_cgroup:
#endif
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_fork_read_unlock(current);
cgroup_exit(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
module_put(task_thread_info(p)->exec_domain->module);
Add cgroup subsystem callbacks for per-thread attachment
From: Ben Blum <[email protected]>
This patch adds can_attach_task, pre_attach, and attach_task as new callbacks
in the cgroup subsystem interface. Unlike can_attach and attach, these are
per-thread operations, to be called potentially many times when attaching an
entire threadgroup.
The old "bool threadgroup" interface is removed, since these callbacks replace
it.
All subsystems are modified for the new interface. Of note is cpuset, which
requires the from/to nodemasks used during attach to be globally scoped
(per-cpuset would also work) so that they persist from pre_attach through
attach_task to attach.
This is a pre-patch for cgroup-procs-writable.patch.
Signed-off-by: Ben Blum <[email protected]>
---
Documentation/cgroups/cgroups.txt | 30 ++++++++---
block/blk-cgroup.c | 18 ++----
include/linux/cgroup.h | 10 ++--
kernel/cgroup.c | 17 +++++-
kernel/cgroup_freezer.c | 26 ++++-----
kernel/cpuset.c | 105 ++++++++++++++++---------------------
kernel/ns_cgroup.c | 23 +++-----
kernel/sched.c | 38 +------------
mm/memcontrol.c | 18 ++----
security/device_cgroup.c | 3 -
10 files changed, 122 insertions(+), 166 deletions(-)
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 190018b..d3c9a24 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -563,7 +563,7 @@ rmdir() will fail with it. From this behavior, pre_destroy() can be
called multiple times against a cgroup.
int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct task_struct *task, bool threadgroup)
+ struct task_struct *task)
(cgroup_mutex held by caller)
Called prior to moving a task into a cgroup; if the subsystem
@@ -572,9 +572,14 @@ task is passed, then a successful result indicates that *any*
unspecified task can be moved into the cgroup. Note that this isn't
called on a fork. If this method returns 0 (success) then this should
remain valid while the caller holds cgroup_mutex and it is ensured that either
-attach() or cancel_attach() will be called in future. If threadgroup is
-true, then a successful result indicates that all threads in the given
-thread's threadgroup can be moved together.
+attach() or cancel_attach() will be called in future.
+
+int can_attach_task(struct cgroup *cgrp, struct task_struct *tsk);
+(cgroup_mutex held by caller)
+
+As can_attach, but for operations that must be run once per task to be
+attached (possibly many when using cgroup_attach_proc). Called after
+can_attach.
void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct task_struct *task, bool threadgroup)
@@ -586,15 +591,24 @@ function, so that the subsystem can implement a rollback. If not, not necessary.
This will be called only for subsystems whose can_attach() operation has
succeeded.
+void pre_attach(struct cgroup *cgrp);
+(cgroup_mutex held by caller)
+
+For any non-per-thread attachment work that needs to happen before
+attach_task. Needed by cpuset.
+
void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cgrp, struct task_struct *task,
- bool threadgroup)
+ struct cgroup *old_cgrp, struct task_struct *task)
(cgroup_mutex held by caller)
Called after the task has been attached to the cgroup, to allow any
post-attachment activity that requires memory allocations or blocking.
-If threadgroup is true, the subsystem should take care of all threads
-in the specified thread's threadgroup. Currently does not support any
+
+void attach_task(struct cgroup *cgrp, struct task_struct *tsk);
+(cgroup_mutex held by caller)
+
+As attach, but for operations that must be run once per task to be attached,
+like can_attach_task. Called before attach. Currently does not support any
subsystem that might need the old_cgrp for every thread in the group.
void fork(struct cgroup_subsys *ss, struct task_struct *task)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b1febd0..45b3809 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -30,10 +30,8 @@ EXPORT_SYMBOL_GPL(blkio_root_cgroup);
static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
struct cgroup *);
-static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
- struct task_struct *, bool);
-static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
- struct cgroup *, struct task_struct *, bool);
+static int blkiocg_can_attach_task(struct cgroup *, struct task_struct *);
+static void blkiocg_attach_task(struct cgroup *, struct task_struct *);
static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
@@ -46,8 +44,8 @@ static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
struct cgroup_subsys blkio_subsys = {
.name = "blkio",
.create = blkiocg_create,
- .can_attach = blkiocg_can_attach,
- .attach = blkiocg_attach,
+ .can_attach_task = blkiocg_can_attach_task,
+ .attach_task = blkiocg_attach_task,
.destroy = blkiocg_destroy,
.populate = blkiocg_populate,
#ifdef CONFIG_BLK_CGROUP
@@ -1475,9 +1473,7 @@ done:
* of the main cic data structures. For now we allow a task to change
* its cgroup only if it's the only owner of its ioc.
*/
-static int blkiocg_can_attach(struct cgroup_subsys *subsys,
- struct cgroup *cgroup, struct task_struct *tsk,
- bool threadgroup)
+static int blkiocg_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
struct io_context *ioc;
int ret = 0;
@@ -1492,9 +1488,7 @@ static int blkiocg_can_attach(struct cgroup_subsys *subsys,
return ret;
}
-static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
- struct cgroup *prev, struct task_struct *tsk,
- bool threadgroup)
+static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
struct io_context *ioc;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ce104e3..35b69b4 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -467,12 +467,14 @@ struct cgroup_subsys {
int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct task_struct *tsk, bool threadgroup);
+ struct task_struct *tsk);
+ int (*can_attach_task)(struct cgroup *cgrp, struct task_struct *tsk);
void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct task_struct *tsk, bool threadgroup);
+ struct task_struct *tsk);
+ void (*pre_attach)(struct cgroup *cgrp);
+ void (*attach_task)(struct cgroup *cgrp, struct task_struct *tsk);
void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cgrp, struct task_struct *tsk,
- bool threadgroup);
+ struct cgroup *old_cgrp, struct task_struct *tsk);
void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
void (*exit)(struct cgroup_subsys *ss, struct task_struct *task);
int (*populate)(struct cgroup_subsys *ss,
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 66a416b..616f27a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1750,7 +1750,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
for_each_subsys(root, ss) {
if (ss->can_attach) {
- retval = ss->can_attach(ss, cgrp, tsk, false);
+ retval = ss->can_attach(ss, cgrp, tsk);
if (retval) {
/*
* Remember on which subsystem the can_attach()
@@ -1762,6 +1762,13 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
goto out;
}
}
+ if (ss->can_attach_task) {
+ retval = ss->can_attach_task(cgrp, tsk);
+ if (retval) {
+ failed_ss = ss;
+ goto out;
+ }
+ }
}
task_lock(tsk);
@@ -1798,8 +1805,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
write_unlock(&css_set_lock);
for_each_subsys(root, ss) {
+ if (ss->pre_attach)
+ ss->pre_attach(cgrp);
+ if (ss->attach_task)
+ ss->attach_task(cgrp, tsk);
if (ss->attach)
- ss->attach(ss, cgrp, oldcgrp, tsk, false);
+ ss->attach(ss, cgrp, oldcgrp, tsk);
}
set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
synchronize_rcu();
@@ -1822,7 +1833,7 @@ out:
*/
break;
if (ss->cancel_attach)
- ss->cancel_attach(ss, cgrp, tsk, false);
+ ss->cancel_attach(ss, cgrp, tsk);
}
}
return retval;
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e7bebb7..e691818 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -160,7 +160,7 @@ static void freezer_destroy(struct cgroup_subsys *ss,
*/
static int freezer_can_attach(struct cgroup_subsys *ss,
struct cgroup *new_cgroup,
- struct task_struct *task, bool threadgroup)
+ struct task_struct *task)
{
struct freezer *freezer;
@@ -172,26 +172,17 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
if (freezer->state != CGROUP_THAWED)
return -EBUSY;
+ return 0;
+}
+
+static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+{
rcu_read_lock();
- if (__cgroup_freezing_or_frozen(task)) {
+ if (__cgroup_freezing_or_frozen(tsk)) {
rcu_read_unlock();
return -EBUSY;
}
rcu_read_unlock();
-
- if (threadgroup) {
- struct task_struct *c;
-
- rcu_read_lock();
- list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
- if (__cgroup_freezing_or_frozen(c)) {
- rcu_read_unlock();
- return -EBUSY;
- }
- }
- rcu_read_unlock();
- }
-
return 0;
}
@@ -390,6 +381,9 @@ struct cgroup_subsys freezer_subsys = {
.populate = freezer_populate,
.subsys_id = freezer_subsys_id,
.can_attach = freezer_can_attach,
+ .can_attach_task = freezer_can_attach_task,
+ .pre_attach = NULL,
+ .attach_task = NULL,
.attach = NULL,
.fork = freezer_fork,
.exit = NULL,
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4349935..5f71ca2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1372,14 +1372,10 @@ static int fmeter_getrate(struct fmeter *fmp)
return val;
}
-/* Protected by cgroup_lock */
-static cpumask_var_t cpus_attach;
-
/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
- struct task_struct *tsk, bool threadgroup)
+ struct task_struct *tsk)
{
- int ret;
struct cpuset *cs = cgroup_cs(cont);
if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1396,29 +1392,42 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
if (tsk->flags & PF_THREAD_BOUND)
return -EINVAL;
- ret = security_task_setscheduler(tsk);
- if (ret)
- return ret;
- if (threadgroup) {
- struct task_struct *c;
-
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- ret = security_task_setscheduler(c);
- if (ret) {
- rcu_read_unlock();
- return ret;
- }
- }
- rcu_read_unlock();
- }
return 0;
}
-static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
- struct cpuset *cs)
+static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
+{
+ return security_task_setscheduler(task);
+}
+
+/*
+ * Protected by cgroup_lock. The nodemasks must be stored globally because
+ * dynamically allocating them is not allowed in pre_attach, and they must
+ * persist among pre_attach, attach_task, and attach.
+ */
+static cpumask_var_t cpus_attach;
+static nodemask_t cpuset_attach_nodemask_from;
+static nodemask_t cpuset_attach_nodemask_to;
+
+/* Set-up work for before attaching each task. */
+static void cpuset_pre_attach(struct cgroup *cont)
+{
+ struct cpuset *cs = cgroup_cs(cont);
+
+ if (cs == &top_cpuset)
+ cpumask_copy(cpus_attach, cpu_possible_mask);
+ else
+ guarantee_online_cpus(cs, cpus_attach);
+
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+}
+
+/* Per-thread attachment work. */
+static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
{
int err;
+ struct cpuset *cs = cgroup_cs(cont);
+
/*
* can_attach beforehand should guarantee that this doesn't fail.
* TODO: have a better way to handle failure here
@@ -1426,56 +1435,31 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
err = set_cpus_allowed_ptr(tsk, cpus_attach);
WARN_ON_ONCE(err);
- cpuset_change_task_nodemask(tsk, to);
+ cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
cpuset_update_task_spread_flag(cs, tsk);
-
}
static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
- struct cgroup *oldcont, struct task_struct *tsk,
- bool threadgroup)
+ struct cgroup *oldcont, struct task_struct *tsk)
{
struct mm_struct *mm;
struct cpuset *cs = cgroup_cs(cont);
struct cpuset *oldcs = cgroup_cs(oldcont);
- NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
- NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
-
- if (from == NULL || to == NULL)
- goto alloc_fail;
- if (cs == &top_cpuset) {
- cpumask_copy(cpus_attach, cpu_possible_mask);
- } else {
- guarantee_online_cpus(cs, cpus_attach);
- }
- guarantee_online_mems(cs, to);
-
- /* do per-task migration stuff possibly for each in the threadgroup */
- cpuset_attach_task(tsk, to, cs);
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- cpuset_attach_task(c, to, cs);
- }
- rcu_read_unlock();
- }
-
- /* change mm; only needs to be done once even if threadgroup */
- *from = oldcs->mems_allowed;
- *to = cs->mems_allowed;
+ /*
+ * Change mm, possibly for multiple threads in a threadgroup. This is
+ * expensive and may sleep.
+ */
+ cpuset_attach_nodemask_from = oldcs->mems_allowed;
+ cpuset_attach_nodemask_to = cs->mems_allowed;
mm = get_task_mm(tsk);
if (mm) {
- mpol_rebind_mm(mm, to);
+ mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
if (is_memory_migrate(cs))
- cpuset_migrate_mm(mm, from, to);
+ cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
+ &cpuset_attach_nodemask_to);
mmput(mm);
}
-
-alloc_fail:
- NODEMASK_FREE(from);
- NODEMASK_FREE(to);
}
/* The various types of files and directories in a cpuset file system */
@@ -1928,6 +1912,9 @@ struct cgroup_subsys cpuset_subsys = {
.create = cpuset_create,
.destroy = cpuset_destroy,
.can_attach = cpuset_can_attach,
+ .can_attach_task = cpuset_can_attach_task,
+ .pre_attach = cpuset_pre_attach,
+ .attach_task = cpuset_attach_task,
.attach = cpuset_attach,
.populate = cpuset_populate,
.post_clone = cpuset_post_clone,
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 2c98ad9..1fc2b1b 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -43,7 +43,7 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
* ancestor cgroup thereof)
*/
static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
- struct task_struct *task, bool threadgroup)
+ struct task_struct *task)
{
if (current != task) {
if (!capable(CAP_SYS_ADMIN))
@@ -53,21 +53,13 @@ static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
return -EPERM;
}
- if (!cgroup_is_descendant(new_cgroup, task))
- return -EPERM;
-
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
- if (!cgroup_is_descendant(new_cgroup, c)) {
- rcu_read_unlock();
- return -EPERM;
- }
- }
- rcu_read_unlock();
- }
+ return 0;
+}
+static int ns_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+{
+ if (!cgroup_is_descendant(cgrp, tsk))
+ return -EPERM;
return 0;
}
@@ -112,6 +104,7 @@ static void ns_destroy(struct cgroup_subsys *ss,
struct cgroup_subsys ns_subsys = {
.name = "ns",
.can_attach = ns_can_attach,
+ .can_attach_task = ns_can_attach_task,
.create = ns_create,
.destroy = ns_destroy,
.subsys_id = ns_subsys_id,
diff --git a/kernel/sched.c b/kernel/sched.c
index 218ef20..d619f1d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8655,42 +8655,10 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
return 0;
}
-static int
-cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct task_struct *tsk, bool threadgroup)
-{
- int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
- if (retval)
- return retval;
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- retval = cpu_cgroup_can_attach_task(cgrp, c);
- if (retval) {
- rcu_read_unlock();
- return retval;
- }
- }
- rcu_read_unlock();
- }
- return 0;
-}
-
static void
-cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cont, struct task_struct *tsk,
- bool threadgroup)
+cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
sched_move_task(tsk);
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- sched_move_task(c);
- }
- rcu_read_unlock();
- }
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8763,8 +8731,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.name = "cpu",
.create = cpu_cgroup_create,
.destroy = cpu_cgroup_destroy,
- .can_attach = cpu_cgroup_can_attach,
- .attach = cpu_cgroup_attach,
+ .can_attach_task = cpu_cgroup_can_attach_task,
+ .attach_task = cpu_cgroup_attach_task,
.populate = cpu_cgroup_populate,
.subsys_id = cpu_cgroup_subsys_id,
.early_init = 1,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 729beb7..995f0b9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4720,8 +4720,7 @@ static void mem_cgroup_clear_mc(void)
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
int ret = 0;
struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
@@ -4775,8 +4774,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
mem_cgroup_clear_mc();
}
@@ -4880,8 +4878,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *cont,
struct cgroup *old_cont,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
if (!mc.mm)
/* no need to move charge */
@@ -4893,22 +4890,19 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
#else /* !CONFIG_MMU */
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
return 0;
}
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
}
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *cont,
struct cgroup *old_cont,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
}
#endif
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 8d9c48f..cd1f779 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -62,8 +62,7 @@ static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
struct cgroup_subsys devices_subsys;
static int devcgroup_can_attach(struct cgroup_subsys *ss,
- struct cgroup *new_cgroup, struct task_struct *task,
- bool threadgroup)
+ struct cgroup *new_cgroup, struct task_struct *task)
{
if (current != task && !capable(CAP_SYS_ADMIN))
return -EPERM;
Makes procs file writable to move all threads by tgid at once
From: Ben Blum <[email protected]>
This patch adds functionality that enables users to move all threads in a
threadgroup at once to a cgroup by writing the tgid to the 'cgroup.procs'
file. The current implementation makes use of a per-threadgroup rwsem that's
taken for reading in the fork() path to prevent newly forking threads within
the threadgroup from "escaping" while the move is in progress.
Signed-off-by: Ben Blum <[email protected]>
---
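(Condensed from the diffs in this series, no new code here: the writer side wraps the whole group move in the threadgroup's rwsem, while CLONE_THREAD forks only take it for reading around the cgroup callbacks.)

        /* writer side: cgroup_procs_write() -> attach_task_by_pid(cgrp, tgid, true) */
        threadgroup_fork_write_lock(leader);
        retval = cgroup_attach_proc(cgrp, leader);      /* moves every thread in the group */
        threadgroup_fork_write_unlock(leader);

        /* reader side: copy_process() in kernel/fork.c */
        if (clone_flags & CLONE_THREAD)
                threadgroup_fork_read_lock(current);
        cgroup_fork(p);
        /* ... rest of copy_process() ... */
        cgroup_post_fork(p);
        if (clone_flags & CLONE_THREAD)
                threadgroup_fork_read_unlock(current);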
Documentation/cgroups/cgroups.txt | 9 +
kernel/cgroup.c | 437 +++++++++++++++++++++++++++++++++----
2 files changed, 397 insertions(+), 49 deletions(-)
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index d3c9a24..92d93d6 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -236,7 +236,8 @@ containing the following files describing that cgroup:
- cgroup.procs: list of tgids in the cgroup. This list is not
guaranteed to be sorted or free of duplicate tgids, and userspace
should sort/uniquify the list if this property is required.
- This is a read-only file, for now.
+ Writing a thread group id into this file moves all threads in that
+ group into this cgroup.
- notify_on_release flag: run the release agent on exit?
- release_agent: the path to use for release notifications (this file
exists in the top cgroup only)
@@ -426,6 +427,12 @@ You can attach the current shell task by echoing 0:
# echo 0 > tasks
+You can use the cgroup.procs file instead of the tasks file to move all
+threads in a threadgroup at once. Echoing the pid of any task in a
+threadgroup to cgroup.procs causes all tasks in that threadgroup to be
+attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
+in the writing task's threadgroup.
+
2.3 Mounting hierarchies by name
--------------------------------
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 616f27a..58b364a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1726,6 +1726,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
}
EXPORT_SYMBOL_GPL(cgroup_path);
+/*
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ *
+ * 'guarantee' is set if the caller promises that a new css_set for the task
+ * will already exist. If not set, this function might sleep, and can fail with
+ * -ENOMEM. Otherwise, it can only fail with -ESRCH.
+ */
+static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+ struct task_struct *tsk, bool guarantee)
+{
+ struct css_set *oldcg;
+ struct css_set *newcg;
+
+ /*
+ * get old css_set. we need to take task_lock and refcount it, because
+ * an exiting task can change its css_set to init_css_set and drop its
+ * old one without taking cgroup_mutex.
+ */
+ task_lock(tsk);
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
+ task_unlock(tsk);
+
+ /* locate or allocate a new css_set for this task. */
+ if (guarantee) {
+ /* we know the css_set we want already exists. */
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(oldcg, cgrp, template);
+ BUG_ON(!newcg);
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+ } else {
+ might_sleep();
+ /* find_css_set will give us newcg already referenced. */
+ newcg = find_css_set(oldcg, cgrp);
+ if (!newcg) {
+ put_css_set(oldcg);
+ return -ENOMEM;
+ }
+ }
+ put_css_set(oldcg);
+
+ /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ task_unlock(tsk);
+ put_css_set(newcg);
+ return -ESRCH;
+ }
+ rcu_assign_pointer(tsk->cgroups, newcg);
+ task_unlock(tsk);
+
+ /* Update the css_set linked lists if we're using them */
+ write_lock(&css_set_lock);
+ if (!list_empty(&tsk->cg_list))
+ list_move(&tsk->cg_list, &newcg->tasks);
+ write_unlock(&css_set_lock);
+
+ /*
+ * We just gained a reference on oldcg by taking it from the task. As
+ * trading it for newcg is protected by cgroup_mutex, we're safe to drop
+ * it here; it will be freed under RCU.
+ */
+ put_css_set(oldcg);
+
+ set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+ return 0;
+}
+
/**
* cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
* @cgrp: the cgroup the task is attaching to
@@ -1736,11 +1806,9 @@ EXPORT_SYMBOL_GPL(cgroup_path);
*/
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
- int retval = 0;
+ int retval;
struct cgroup_subsys *ss, *failed_ss = NULL;
struct cgroup *oldcgrp;
- struct css_set *cg;
- struct css_set *newcg;
struct cgroupfs_root *root = cgrp->root;
/* Nothing to do if the task is already in that cgroup */
@@ -1771,38 +1839,9 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
}
}
- task_lock(tsk);
- cg = tsk->cgroups;
- get_css_set(cg);
- task_unlock(tsk);
- /*
- * Locate or allocate a new css_set for this task,
- * based on its final set of cgroups
- */
- newcg = find_css_set(cg, cgrp);
- put_css_set(cg);
- if (!newcg) {
- retval = -ENOMEM;
- goto out;
- }
-
- task_lock(tsk);
- if (tsk->flags & PF_EXITING) {
- task_unlock(tsk);
- put_css_set(newcg);
- retval = -ESRCH;
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
+ if (retval)
goto out;
- }
- rcu_assign_pointer(tsk->cgroups, newcg);
- task_unlock(tsk);
-
- /* Update the css_set linked lists if we're using them */
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list)) {
- list_del(&tsk->cg_list);
- list_add(&tsk->cg_list, &newcg->tasks);
- }
- write_unlock(&css_set_lock);
for_each_subsys(root, ss) {
if (ss->pre_attach)
@@ -1812,9 +1851,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
if (ss->attach)
ss->attach(ss, cgrp, oldcgrp, tsk);
}
- set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+
synchronize_rcu();
- put_css_set(cg);
/*
* wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -1864,49 +1902,352 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
/*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * held. May take task_lock of task
+ * cgroup_attach_proc works in two stages, the first of which prefetches all
+ * new css_sets needed (to make sure we have enough memory before committing
+ * to the move) and stores them in a list of entries of the following type.
+ * TODO: possible optimization: use css_set->rcu_head for chaining instead
+ */
+struct cg_list_entry {
+ struct css_set *cg;
+ struct list_head links;
+};
+
+static bool css_set_check_fetched(struct cgroup *cgrp,
+ struct task_struct *tsk, struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(cg, cgrp, template);
+ if (newcg)
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+
+ /* doesn't exist at all? */
+ if (!newcg)
+ return false;
+ /* see if it's already in the list */
+ list_for_each_entry(cg_entry, newcg_list, links) {
+ if (cg_entry->cg == newcg) {
+ put_css_set(newcg);
+ return true;
+ }
+ }
+
+ /* not found */
+ put_css_set(newcg);
+ return false;
+}
+
+/*
+ * Find the new css_set and store it in the list in preparation for moving the
+ * given task to the given cgroup. Returns 0 or -ENOMEM.
+ */
+static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+
+ /* ensure a new css_set will exist for this thread */
+ newcg = find_css_set(cg, cgrp);
+ if (!newcg)
+ return -ENOMEM;
+ /* add it to the list */
+ cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
+ if (!cg_entry) {
+ put_css_set(newcg);
+ return -ENOMEM;
+ }
+ cg_entry->cg = newcg;
+ list_add(&cg_entry->links, newcg_list);
+ return 0;
+}
+
+/**
+ * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * @cgrp: the cgroup to attach to
+ * @leader: the threadgroup leader task_struct of the group to be attached
+ *
+ * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
+ * take task_lock of each thread in leader's threadgroup individually in turn.
+ */
+int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+{
+ int retval, i, group_size;
+ struct cgroup_subsys *ss, *failed_ss = NULL;
+ /* guaranteed to be initialized later, but the compiler needs this */
+ struct cgroup *oldcgrp = NULL;
+ struct css_set *oldcg;
+ struct cgroupfs_root *root = cgrp->root;
+ /* threadgroup list cursor and array */
+ struct task_struct *tsk;
+ struct task_struct **group;
+ /*
+ * we need to make sure we have css_sets for all the tasks we're
+ * going to move -before- we actually start moving them, so that in
+ * case we get an ENOMEM we can bail out before making any changes.
+ */
+ struct list_head newcg_list;
+ struct cg_list_entry *cg_entry, *temp_nobe;
+
+ /*
+ * step 0: in order to do expensive, possibly blocking operations for
+ * every thread, we cannot iterate the thread group list, since it needs
+ * rcu or tasklist locked. instead, build an array of all threads in the
+ * group - threadgroup_fork_lock prevents new threads from appearing,
+ * and if threads exit, this will just be an over-estimate.
+ */
+ group_size = get_nr_threads(leader);
+ group = kmalloc(group_size * sizeof(*group), GFP_KERNEL);
+ if (!group)
+ return -ENOMEM;
+
+ /* prevent changes to the threadgroup list while we take a snapshot. */
+ rcu_read_lock();
+ if (!thread_group_leader(leader)) {
+ /*
+ * a race with de_thread from another thread's exec() may strip
+ * us of our leadership, making while_each_thread unsafe to use
+ * on this task. if this happens, there is no choice but to
+ * throw this task away and try again (from cgroup_procs_write);
+ * this is "double-double-toil-and-trouble-check locking".
+ */
+ rcu_read_unlock();
+ retval = -EAGAIN;
+ goto out_free_group_list;
+ }
+ /* take a reference on each task in the group to go in the array. */
+ tsk = leader;
+ i = 0;
+ do {
+ /* as per above, nr_threads may decrease, but not increase. */
+ BUG_ON(i >= group_size);
+ get_task_struct(tsk);
+ group[i] = tsk;
+ i++;
+ } while_each_thread(leader, tsk);
+ /* remember the number of threads in the array for later. */
+ BUG_ON(i == 0);
+ group_size = i;
+ rcu_read_unlock();
+
+ /*
+ * step 1: check that we can legitimately attach to the cgroup.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->can_attach) {
+ retval = ss->can_attach(ss, cgrp, leader);
+ if (retval) {
+ failed_ss = ss;
+ goto out_cancel_attach;
+ }
+ }
+ /* a callback to be run on every thread in the threadgroup. */
+ if (ss->can_attach_task) {
+ /* run on each task in the threadgroup. */
+ for (i = 0; i < group_size; i++) {
+ retval = ss->can_attach_task(cgrp, group[i]);
+ if (retval) {
+ failed_ss = ss;
+ goto out_cancel_attach;
+ }
+ }
+ }
+ }
+
+ /*
+ * step 2: make sure css_sets exist for all threads to be migrated.
+ * we use find_css_set, which allocates a new one if necessary.
+ */
+ INIT_LIST_HEAD(&newcg_list);
+ for (i = 0; i < group_size; i++) {
+ tsk = group[i];
+ /* nothing to do if this task is already in the cgroup */
+ oldcgrp = task_cgroup_from_root(tsk, root);
+ if (cgrp == oldcgrp)
+ continue;
+ /* get old css_set pointer */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ /* ignore this task if it's going away */
+ task_unlock(tsk);
+ continue;
+ }
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
+ task_unlock(tsk);
+ /* see if the new one for us is already in the list? */
+ if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
+ /* was already there, nothing to do. */
+ put_css_set(oldcg);
+ } else {
+ /* we don't already have it. get new one. */
+ retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+ put_css_set(oldcg);
+ if (retval)
+ goto out_list_teardown;
+ }
+ }
+
+ /*
+ * step 3: now that we're guaranteed success wrt the css_sets, proceed
+ * to move all tasks to the new cgroup, calling ss->attach_task for each
+ * one along the way. there are no failure cases after here, so this is
+ * the commit point.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->pre_attach)
+ ss->pre_attach(cgrp);
+ }
+ for (i = 0; i < group_size; i++) {
+ tsk = group[i];
+ /* leave current thread as it is if it's already there */
+ oldcgrp = task_cgroup_from_root(tsk, root);
+ if (cgrp == oldcgrp)
+ continue;
+ /* attach each task to each subsystem */
+ for_each_subsys(root, ss) {
+ if (ss->attach_task)
+ ss->attach_task(cgrp, tsk);
+ }
+ /* if the thread is PF_EXITING, it can just get skipped. */
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
+ BUG_ON(retval != 0 && retval != -ESRCH);
+ }
+ /* nothing is sensitive to fork() after this point. */
+
+ /*
+ * step 4: do expensive, non-thread-specific subsystem callbacks.
+ * TODO: if ever a subsystem needs to know the oldcgrp for each task
+ * being moved, this call will need to be reworked to communicate that.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->attach)
+ ss->attach(ss, cgrp, oldcgrp, leader);
+ }
+
+ /*
+ * step 5: success! and cleanup
+ */
+ synchronize_rcu();
+ cgroup_wakeup_rmdir_waiter(cgrp);
+ retval = 0;
+out_list_teardown:
+ /* clean up the list of prefetched css_sets. */
+ list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
+ list_del(&cg_entry->links);
+ put_css_set(cg_entry->cg);
+ kfree(cg_entry);
+ }
+out_cancel_attach:
+ /* same deal as in cgroup_attach_task */
+ if (retval) {
+ for_each_subsys(root, ss) {
+ if (ss == failed_ss)
+ break;
+ if (ss->cancel_attach)
+ ss->cancel_attach(ss, cgrp, leader);
+ }
+ }
+ /* clean up the array of referenced threads in the group. */
+ for (i = 0; i < group_size; i++)
+ put_task_struct(group[i]);
+out_free_group_list:
+ kfree(group);
+ return retval;
+}
+
+/*
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will take
+ * cgroup_mutex; may take task_lock of task.
*/
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
{
struct task_struct *tsk;
const struct cred *cred = current_cred(), *tcred;
int ret;
+ if (!cgroup_lock_live_group(cgrp))
+ return -ENODEV;
+
if (pid) {
rcu_read_lock();
tsk = find_task_by_vpid(pid);
- if (!tsk || tsk->flags & PF_EXITING) {
+ if (!tsk) {
rcu_read_unlock();
+ cgroup_unlock();
+ return -ESRCH;
+ }
+ if (threadgroup) {
+ /*
+ * it is safe to find group_leader because tsk was found
+ * in the tid map, meaning it can't have been unhashed
+ * by someone in de_thread changing the leadership.
+ */
+ tsk = tsk->group_leader;
+ BUG_ON(!thread_group_leader(tsk));
+ } else if (tsk->flags & PF_EXITING) {
+ /* optimization for the single-task-only case */
+ rcu_read_unlock();
+ cgroup_unlock();
return -ESRCH;
}
+ /*
+ * even if we're attaching all tasks in the thread group, we
+ * only need to check permissions on one of them.
+ */
tcred = __task_cred(tsk);
if (cred->euid &&
cred->euid != tcred->uid &&
cred->euid != tcred->suid) {
rcu_read_unlock();
+ cgroup_unlock();
return -EACCES;
}
get_task_struct(tsk);
rcu_read_unlock();
} else {
- tsk = current;
+ if (threadgroup)
+ tsk = current->group_leader;
+ else
+ tsk = current;
get_task_struct(tsk);
}
- ret = cgroup_attach_task(cgrp, tsk);
+ if (threadgroup) {
+ threadgroup_fork_write_lock(tsk);
+ ret = cgroup_attach_proc(cgrp, tsk);
+ threadgroup_fork_write_unlock(tsk);
+ } else {
+ ret = cgroup_attach_task(cgrp, tsk);
+ }
put_task_struct(tsk);
+ cgroup_unlock();
return ret;
}
static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
{
+ return attach_task_by_pid(cgrp, pid, false);
+}
+
+static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+{
int ret;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
- ret = attach_task_by_pid(cgrp, pid);
- cgroup_unlock();
+ do {
+ /*
+ * attach_proc fails with -EAGAIN if threadgroup leadership
+ * changes in the middle of the operation, in which case we need
+ * to find the task_struct for the new leader and start over.
+ */
+ ret = attach_task_by_pid(cgrp, tgid, true);
+ } while (ret == -EAGAIN);
return ret;
}
@@ -3260,9 +3601,9 @@ static struct cftype files[] = {
{
.name = CGROUP_FILE_GENERIC_PREFIX "procs",
.open = cgroup_procs_open,
- /* .write_u64 = cgroup_procs_write, TODO */
+ .write_u64 = cgroup_procs_write,
.release = cgroup_pidlist_release,
- .mode = S_IRUGO,
+ .mode = S_IRUGO | S_IWUSR,
},
{
.name = "notify_on_release",
On Mon, 7 Feb 2011 20:35:42 -0500
Ben Blum <[email protected]> wrote:
> On Sun, Dec 26, 2010 at 07:09:19AM -0500, Ben Blum wrote:
> > On Fri, Dec 24, 2010 at 03:22:26AM -0500, Ben Blum wrote:
> > > On Wed, Aug 11, 2010 at 01:46:04AM -0400, Ben Blum wrote:
> > > > On Fri, Jul 30, 2010 at 07:56:49PM -0400, Ben Blum wrote:
> > > > > This patch series is a revision of http://lkml.org/lkml/2010/6/25/11 .
> > > > >
> > > > > This patch series implements a write function for the 'cgroup.procs'
> > > > > per-cgroup file, which enables atomic movement of multithreaded
> > > > > applications between cgroups. Writing the thread-ID of any thread in a
> > > > > threadgroup to a cgroup's procs file causes all threads in the group to
> > > > > be moved to that cgroup safely with respect to threads forking/exiting.
> > > > > (Possible usage scenario: If running a multithreaded build system that
> > > > > sucks up system resources, this lets you restrict it all at once into a
> > > > > new cgroup to keep it under control.)
> > > > >
> > > > > Example: Suppose pid 31337 clones new threads 31338 and 31339.
> > > > >
> > > > > # cat /dev/cgroup/tasks
> > > > > ...
> > > > > 31337
> > > > > 31338
> > > > > 31339
> > > > > # mkdir /dev/cgroup/foo
> > > > > # echo 31337 > /dev/cgroup/foo/cgroup.procs
> > > > > # cat /dev/cgroup/foo/tasks
> > > > > 31337
> > > > > 31338
> > > > > 31339
> > > > >
> > > > > A new lock, called threadgroup_fork_lock and living in signal_struct, is
> > > > > introduced to ensure atomicity when moving threads between cgroups. It's
> > > > > taken for writing during the operation, and taking for reading in fork()
> > > > > around the calls to cgroup_fork() and cgroup_post_fork().
The above six month old text is the best (and almost the only)
explanation of the rationale for the entire patch series. Is
it still correct and complete?
Assuming "yes", then... how do we determine whether the feature is
sufficiently useful to justify merging and maintaining it? Will people
actually use it?
Was there some particular operational situation which led you to think
that the kernel should have this capability? If so, please help us out here
and lavishly describe it.
On Wed, 9 Feb 2011 15:10:46 -0800
Andrew Morton <[email protected]> wrote:
> On Mon, 7 Feb 2011 20:35:42 -0500
> Ben Blum <[email protected]> wrote:
>
> > On Sun, Dec 26, 2010 at 07:09:19AM -0500, Ben Blum wrote:
> > > On Fri, Dec 24, 2010 at 03:22:26AM -0500, Ben Blum wrote:
> > > > On Wed, Aug 11, 2010 at 01:46:04AM -0400, Ben Blum wrote:
> > > > > On Fri, Jul 30, 2010 at 07:56:49PM -0400, Ben Blum wrote:
> > > > > > This patch series is a revision of http://lkml.org/lkml/2010/6/25/11 .
> > > > > >
> > > > > > This patch series implements a write function for the 'cgroup.procs'
> > > > > > per-cgroup file, which enables atomic movement of multithreaded
> > > > > > applications between cgroups. Writing the thread-ID of any thread in a
> > > > > > threadgroup to a cgroup's procs file causes all threads in the group to
> > > > > > be moved to that cgroup safely with respect to threads forking/exiting.
> > > > > > (Possible usage scenario: If running a multithreaded build system that
> > > > > > sucks up system resources, this lets you restrict it all at once into a
> > > > > > new cgroup to keep it under control.)
> > > > > >
> > > > > > Example: Suppose pid 31337 clones new threads 31338 and 31339.
> > > > > >
> > > > > > # cat /dev/cgroup/tasks
> > > > > > ...
> > > > > > 31337
> > > > > > 31338
> > > > > > 31339
> > > > > > # mkdir /dev/cgroup/foo
> > > > > > # echo 31337 > /dev/cgroup/foo/cgroup.procs
> > > > > > # cat /dev/cgroup/foo/tasks
> > > > > > 31337
> > > > > > 31338
> > > > > > 31339
> > > > > >
> > > > > > A new lock, called threadgroup_fork_lock and living in signal_struct, is
> > > > > > introduced to ensure atomicity when moving threads between cgroups. It's
> > > > > > taken for writing during the operation, and taking for reading in fork()
> > > > > > around the calls to cgroup_fork() and cgroup_post_fork().
>
> The above six month old text is the best (and almost the only)
> explanation of the rationale for the entire patch series. Is
> it still correct and complete?
>
>
> Assuming "yes", then... how do we determine whether the feature is
> sufficiently useful to justify merging and maintaining it? Will people
> actually use it?
>
> Was there some particular operational situation which led you to think
> that the kernel should have this capability? If so, please help us out here
> and lavishly describe it.
>
In these months, I have seen questions like the following:
==
Q. I think I put qemu to xxxx cgroup but it never works!
A. You need to put all threads in qemu to cgroup.
==
The 'tasks' file is not a useful interface for users, I think.
(Even if users tend to use the put-task-before-exec scheme.)
IMHO, from the user's point of view, the 'tasks' file is a mystery.
TID (thread-ID) is one of the secrets of Linux + the pthread library. For example,
on RHEL6, to use gettid(), users have to use syscall() directly. And end-users
may not know about thread-IDs, which are hidden under pthreads.
IIRC, there is no interface other than /proc/<pid>/tasks that shows all
the thread IDs of a process. But it's not atomic.
So, I think it's OK to have a 'procs' interface for cgroups if the
overhead/impact of the patch is not heavy.
Thanks,
-Kame
On Thu, Feb 10, 2011 at 10:02:10AM +0900, KAMEZAWA Hiroyuki wrote:
> On Wed, 9 Feb 2011 15:10:46 -0800
> Andrew Morton <[email protected]> wrote:
>
> > On Mon, 7 Feb 2011 20:35:42 -0500
> > Ben Blum <[email protected]> wrote:
> >
> > > On Sun, Dec 26, 2010 at 07:09:19AM -0500, Ben Blum wrote:
> > > > On Fri, Dec 24, 2010 at 03:22:26AM -0500, Ben Blum wrote:
> > > > > On Wed, Aug 11, 2010 at 01:46:04AM -0400, Ben Blum wrote:
> > > > > > On Fri, Jul 30, 2010 at 07:56:49PM -0400, Ben Blum wrote:
> > > > > > > This patch series is a revision of http://lkml.org/lkml/2010/6/25/11 .
> > > > > > >
> > > > > > > This patch series implements a write function for the 'cgroup.procs'
> > > > > > > per-cgroup file, which enables atomic movement of multithreaded
> > > > > > > applications between cgroups. Writing the thread-ID of any thread in a
> > > > > > > threadgroup to a cgroup's procs file causes all threads in the group to
> > > > > > > be moved to that cgroup safely with respect to threads forking/exiting.
> > > > > > > (Possible usage scenario: If running a multithreaded build system that
> > > > > > > sucks up system resources, this lets you restrict it all at once into a
> > > > > > > new cgroup to keep it under control.)
> > > > > > >
> > > > > > > Example: Suppose pid 31337 clones new threads 31338 and 31339.
> > > > > > >
> > > > > > > # cat /dev/cgroup/tasks
> > > > > > > ...
> > > > > > > 31337
> > > > > > > 31338
> > > > > > > 31339
> > > > > > > # mkdir /dev/cgroup/foo
> > > > > > > # echo 31337 > /dev/cgroup/foo/cgroup.procs
> > > > > > > # cat /dev/cgroup/foo/tasks
> > > > > > > 31337
> > > > > > > 31338
> > > > > > > 31339
> > > > > > >
> > > > > > > A new lock, called threadgroup_fork_lock and living in signal_struct, is
> > > > > > > introduced to ensure atomicity when moving threads between cgroups. It's
> > > > > > > taken for writing during the operation, and taking for reading in fork()
> > > > > > > around the calls to cgroup_fork() and cgroup_post_fork().
> >
> > The above six month old text is the best (and almost the only)
> > explanation of the rationale for the entire patch series. Is
> > it still correct and complete?
Yep, it's still fresh. (That's why I kept it around!)
> >
> >
> > Assuming "yes", then... how do we determine whether the feature is
> > sufficiently useful to justify merging and maintaining it? Will people
> > actually use it?
> >
> > Was there some particular operational situation which led you to think
> > that the kernel should have this capability? If so, please help us out here
> > and lavishly describe it.
> >
>
> In these months, I have seen questions like the following:
> ==
> Q. I think I put qemu to xxxx cgroup but it never works!
> A. You need to put all threads in qemu to cgroup.
> ==
>
> The 'tasks' file is not a useful interface for users, I think.
> (Even if users tend to use the put-task-before-exec scheme.)
>
>
> IMHO, from the user's point of view, the 'tasks' file is a mystery.
>
> TID (thread-ID) is one of the secrets of Linux + the pthread library. For example,
> on RHEL6, to use gettid(), users have to use syscall() directly. And end-users
> may not know about thread-IDs, which are hidden under pthreads.
I think glibc in general is to blame for the fact that you need to
syscall(__NR_gettid)? Regardless - yes, exposing an interface dealing
with task_structs can be less than perfect for a world that deals in
userland applications.
> IIRC, there is no interface other than /proc/<pid>/tasks that shows all
> the thread IDs of a process. But it's not atomic.
I tend to use pgrep, which is a bit of a hassle.
Also, as in the six-month-old text, many resource-sucking programs
nowadays (web browsers) are multithreaded.
> So, I think it's OK to have a 'procs' interface for cgroups if the
> overhead/impact of the patch is not heavy.
>
> Thanks,
> -Kame
Thanks for the reasoning. ;)
-- Ben
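(For reference, the syscall() dance mentioned above is just the following -- a minimal userspace example, assuming a glibc that does not export a gettid() wrapper.)

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        /* no library wrapper for gettid() here, so invoke the syscall directly */
        pid_t tid = syscall(SYS_gettid);

        printf("tid = %d\n", (int)tid);
        return 0;
}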
On Mon, Jan 24, 2011 at 1:05 PM, Andrew Morton
<[email protected]> wrote:
>
> Risky. sched.h doesn't include rwsem.h.
>
> We could make it do so, but almost every compilation unit in the kernel
> includes sched.h.  It would be nicer to make the kernel build
> finer-grained, rather than blunter-grained.  Don't be afraid to add new
> header files if that is one way of doing this!
>
The only header files included by rwsem.h that aren't directly
included in sched.h already are linux/linkage.h and asm/atomic.h.
Since sighand_struct in sched.h has an atomic_t field, sched.h is
clearly including atomic.h somewhere indirectly. And there are mutex
fields in sched.h, which means it's indirectly including
linux/mutex.h, which includes linux/linkage.h. So I think that it's
hard to argue that this change would make the kernel build any more
heavyweight.
Paul
On Wed, Feb 9, 2011 at 5:02 PM, KAMEZAWA Hiroyuki
<[email protected]> wrote:
>
> So, I think it's ok to have 'procs' interface for cgroup if
> overhead/impact of patch is not heavy.
>
Agreed - it's definitely an operation that comes up as either
confusing or annoying for users, depending on whether or not they
understand how threads and cgroups interact. (We've been getting
people wanting to do this internally at Google, and I'm guessing that
we're one of the bigger users of cgroups.)
In theory it's something that could be handled in userspace, in one of two ways:
- repeatedly scan the old cgroup's tasks file and sweep any threads
from the given process into the destination cgroup, until you complete
a clean sweep finding none. (Possibly even this is racy if a thread is
being slow to fork)
- use a process event notifier to catch thread fork events and keep
track of any newly created threads that appear after your first sweep
of threads, and be prepared to handle them for some reasonable length
of time (tens of milliseconds?) after the last thread has been
apparently moved.
(The alternative approach, of course, is to give up and never try to
move a process into a cgroup except right when you're in the middle of
forking it, before the exec(), when you know that it has only a single
thread and you're in control of it.)
These are both painful procedures, compared to the very simple
approach of letting the kernel move the entire process atomically.
It's true that it's a pretty heavyweight operation, but that weight is
only paid when you actually use it on a very large process (and which
would be even more expensive to do in userspace). For the rest of the
kernel, it's just an extra read lock in the fork path on a semaphore
in a structure that's pretty much guaranteed to be in cache.
Paul
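(To make the first of those userspace workarounds concrete, here is a rough sketch of the scan-and-sweep loop. Illustrative only: the /dev/cgroup paths are the hypothetical ones from the examples earlier in the thread, and as noted above it is still racy against slowly-forking threads.)

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical mount point, matching the examples earlier in the thread. */
#define SRC_TASKS "/dev/cgroup/tasks"
#define DST_TASKS "/dev/cgroup/foo/tasks"

/* Return the tgid of a thread, or -1 if it has already exited. */
static int tgid_of(int tid)
{
        char path[64], line[128];
        int tgid = -1;
        FILE *f;

        snprintf(path, sizeof(path), "/proc/%d/status", tid);
        f = fopen(path, "r");
        if (!f)
                return -1;
        while (fgets(line, sizeof(line), f))
                if (sscanf(line, "Tgid: %d", &tgid) == 1)
                        break;
        fclose(f);
        return tgid;
}

int main(int argc, char **argv)
{
        int pid, moved;

        if (argc < 2)
                return 1;
        pid = atoi(argv[1]);    /* the process whose threads we sweep */

        do {
                FILE *src = fopen(SRC_TASKS, "r");
                int tid;

                moved = 0;
                while (src && fscanf(src, "%d", &tid) == 1) {
                        if (tgid_of(tid) != pid)
                                continue;
                        FILE *dst = fopen(DST_TASKS, "w");

                        if (dst) {
                                fprintf(dst, "%d\n", tid);
                                fclose(dst);
                                moved++;
                        }
                }
                if (src)
                        fclose(src);
        } while (moved);        /* repeat until a pass finds nothing left to move */
        return 0;
}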
On Wed, Feb 9, 2011 at 3:10 PM, Andrew Morton <[email protected]> wrote:
> On Mon, 7 Feb 2011 20:35:42 -0500
> Ben Blum <[email protected]> wrote:
>
>> On Sun, Dec 26, 2010 at 07:09:19AM -0500, Ben Blum wrote:
>> > On Fri, Dec 24, 2010 at 03:22:26AM -0500, Ben Blum wrote:
>> > > On Wed, Aug 11, 2010 at 01:46:04AM -0400, Ben Blum wrote:
>> > > > On Fri, Jul 30, 2010 at 07:56:49PM -0400, Ben Blum wrote:
>> > > > > This patch series is a revision of http://lkml.org/lkml/2010/6/25/11 .
>> > > > >
>> > > > > This patch series implements a write function for the 'cgroup.procs'
>> > > > > per-cgroup file, which enables atomic movement of multithreaded
>> > > > > applications between cgroups. Writing the thread-ID of any thread in a
>> > > > > threadgroup to a cgroup's procs file causes all threads in the group to
>> > > > > be moved to that cgroup safely with respect to threads forking/exiting.
>> > > > > (Possible usage scenario: If running a multithreaded build system that
>> > > > > sucks up system resources, this lets you restrict it all at once into a
>> > > > > new cgroup to keep it under control.)
>> > > > >
>
> The above six month old text is the best (and almost the only)
> explanation of the rationale for the entire patch series.  Is
> it still correct and complete?
>
It's still correct, but I'm sure we could come up with a more detailed
justification if necessary.
Paul
Convert cgroup_attach_proc to use flex_array.
From: Ben Blum <[email protected]>
The cgroup_attach_proc implementation requires a pre-allocated array to store
task pointers to atomically move a thread-group, but asking for a monolithic
array with kmalloc() may be unreliable for very large groups. Using flex_array
provides the same functionality with less risk of failure.
This is a post-patch for cgroup-procs-write.patch.
Signed-off-by: Ben Blum <[email protected]>
---
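(For reviewers who haven't used the API: the flex_array calling pattern relied on below, condensed into one place. This is a sketch, not code from the patch.)

#include <linux/flex_array.h>
#include <linux/sched.h>
#include <linux/slab.h>

/* Minimal sketch of the pattern cgroup_attach_proc uses below. */
static int flex_array_pattern(int nr_slots)
{
        struct flex_array *arr;
        struct task_struct *tsk;
        int retval;

        /* sized up front; internally chunked, so a large nr_slots is fine */
        arr = flex_array_alloc(sizeof(struct task_struct *), nr_slots,
                               GFP_KERNEL);
        if (!arr)
                return -ENOMEM;
        /* preallocate every slot so later puts cannot fail (e.g. under RCU) */
        retval = flex_array_prealloc(arr, 0, nr_slots - 1, GFP_KERNEL);
        if (retval)
                goto out;

        retval = flex_array_put_ptr(arr, 0, current, GFP_ATOMIC);  /* store */
        BUG_ON(retval);
        tsk = flex_array_get_ptr(arr, 0);                          /* load */
        BUG_ON(tsk != current);
out:
        flex_array_free(arr);
        return retval;
}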
kernel/cgroup.c | 37 ++++++++++++++++++++++++++++---------
1 files changed, 28 insertions(+), 9 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 58b364a..feba784 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,6 +57,7 @@
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/eventfd.h>
#include <linux/poll.h>
+#include <linux/flex_array.h> /* used in cgroup_attach_proc */
#include <asm/atomic.h>
@@ -1985,7 +1986,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
struct cgroupfs_root *root = cgrp->root;
/* threadgroup list cursor and array */
struct task_struct *tsk;
- struct task_struct **group;
+ struct flex_array *group;
/*
* we need to make sure we have css_sets for all the tasks we're
* going to move -before- we actually start moving them, so that in
@@ -2002,9 +2003,15 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
* and if threads exit, this will just be an over-estimate.
*/
group_size = get_nr_threads(leader);
- group = kmalloc(group_size * sizeof(*group), GFP_KERNEL);
+ /* flex_array supports very large thread-groups better than kmalloc. */
+ group = flex_array_alloc(sizeof(struct task_struct *), group_size,
+ GFP_KERNEL);
if (!group)
return -ENOMEM;
+ /* pre-allocate to guarantee space while iterating in rcu read-side. */
+ retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
+ if (retval)
+ goto out_free_group_list;
/* prevent changes to the threadgroup list while we take a snapshot. */
rcu_read_lock();
@@ -2027,7 +2034,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
/* as per above, nr_threads may decrease, but not increase. */
BUG_ON(i >= group_size);
get_task_struct(tsk);
- group[i] = tsk;
+ /*
+ * saying GFP_ATOMIC has no effect here because we did prealloc
+ * earlier, but it's good form to communicate our expectations.
+ */
+ retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
+ BUG_ON(retval != 0);
i++;
} while_each_thread(leader, tsk);
/* remember the number of threads in the array for later. */
@@ -2050,7 +2062,9 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
if (ss->can_attach_task) {
/* run on each task in the threadgroup. */
for (i = 0; i < group_size; i++) {
- retval = ss->can_attach_task(cgrp, group[i]);
+ tsk = flex_array_get_ptr(group, i);
+ BUG_ON(tsk == NULL);
+ retval = ss->can_attach_task(cgrp, tsk);
if (retval) {
failed_ss = ss;
goto out_cancel_attach;
@@ -2065,7 +2079,8 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
*/
INIT_LIST_HEAD(&newcg_list);
for (i = 0; i < group_size; i++) {
- tsk = group[i];
+ tsk = flex_array_get_ptr(group, i);
+ BUG_ON(tsk == NULL);
/* nothing to do if this task is already in the cgroup */
oldcgrp = task_cgroup_from_root(tsk, root);
if (cgrp == oldcgrp)
@@ -2104,7 +2119,8 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
ss->pre_attach(cgrp);
}
for (i = 0; i < group_size; i++) {
- tsk = group[i];
+ tsk = flex_array_get_ptr(group, i);
+ BUG_ON(tsk == NULL);
/* leave current thread as it is if it's already there */
oldcgrp = task_cgroup_from_root(tsk, root);
if (cgrp == oldcgrp)
@@ -2154,10 +2170,13 @@ out_cancel_attach:
}
}
/* clean up the array of referenced threads in the group. */
- for (i = 0; i < group_size; i++)
- put_task_struct(group[i]);
+ for (i = 0; i < group_size; i++) {
+ tsk = flex_array_get_ptr(group, i);
+ BUG_ON(tsk == NULL);
+ put_task_struct(tsk);
+ }
out_free_group_list:
- kfree(group);
+ flex_array_free(group);
return retval;
}
On Wed, Feb 16, 2011 at 11:22 AM, Ben Blum <[email protected]> wrote:
> Convert cgroup_attach_proc to use flex_array.
>
> From: Ben Blum <[email protected]>
>
> The cgroup_attach_proc implementation requires a pre-allocated array to store
> task pointers to atomically move a thread-group, but asking for a monolithic
> array with kmalloc() may be unreliable for very large groups. Using flex_array
> provides the same functionality with less risk of failure.
>
> This is a post-patch for cgroup-procs-write.patch.
>
> Signed-off-by: Ben Blum <[email protected]>
Reviewed-by: Paul Menage <[email protected]>
Looks fine from a correctness point of view, but I'd be inclined to
reduce the verbosity - rather than
tsk = flex_array_get_ptr(group, i);
BUG_ON(tsk == NULL);
retval = ss->can_attach_task(cgrp, tsk);
I'd just have
retval = ss->can_attach_task(cgrp, flex_array_get_ptr(group, i));
I don't think you need to be so defensive about flex_array's behaviour.
Paul
On Mon, Feb 7, 2011 at 5:37 PM, Ben Blum <[email protected]> wrote:
> Adds functionality to read/write lock CLONE_THREAD fork()ing per-threadgroup
>
> From: Ben Blum <[email protected]>
>
> This patch adds an rwsem that lives in a threadgroup's signal_struct that's
> taken for reading in the fork path, under CONFIG_CGROUPS. If another part of
> the kernel later wants to use such a locking mechanism, the CONFIG_CGROUPS
> ifdefs should be changed to a higher-up flag that CGROUPS and the other system
> would both depend on.
>
> This is a pre-patch for cgroup-procs-write.patch.
>
> Signed-off-by: Ben Blum <[email protected]>
Reviewed-by: Paul Menage <[email protected]>
AFAICS, the only change from the previous version of this patch is the
addition of including linux/rwsem.h in sched.h, so I think it's fair
to assume my previous Reviewed-by: tag still holds.
(Incidentally, does anyone have any handy tools for tracking diffs
between things you've previously tagged as Acked or Reviewed-by, and
newer versions?)
Paul
> ---
>  include/linux/init_task.h |    9 +++++++++
>  include/linux/sched.h     |   37 +++++++++++++++++++++++++++++++++++++
>  kernel/fork.c             |   10 ++++++++++
>  3 files changed, 56 insertions(+), 0 deletions(-)
>
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index 6b281fa..b560381 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -15,6 +15,14 @@
>  extern struct files_struct init_files;
>  extern struct fs_struct init_fs;
>
> +#ifdef CONFIG_CGROUPS
> +#define INIT_THREADGROUP_FORK_LOCK(sig)                                        \
> +       .threadgroup_fork_lock =                                        \
> +               __RWSEM_INITIALIZER(sig.threadgroup_fork_lock),
> +#else
> +#define INIT_THREADGROUP_FORK_LOCK(sig)
> +#endif
> +
>  #define INIT_SIGNALS(sig) {                                            \
>         .nr_threads     = 1,                                            \
>         .wait_chldexit  = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\
> @@ -31,6 +39,7 @@ extern struct fs_struct init_fs;
>         },                                                              \
>         .cred_guard_mutex =                                             \
>                 __MUTEX_INITIALIZER(sig.cred_guard_mutex),              \
> +       INIT_THREADGROUP_FORK_LOCK(sig)                                 \
>  }
>
>  extern struct nsproxy init_nsproxy;
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 8580dc6..2fdbeb1 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -509,6 +509,8 @@ struct thread_group_cputimer {
>         spinlock_t lock;
>  };
>
> +#include <linux/rwsem.h>
>
>  /*
>   * NOTE! "signal_struct" does not have it's own
>   * locking, because a shared signal_struct always
> @@ -623,6 +625,16 @@ struct signal_struct {
>         unsigned audit_tty;
>         struct tty_audit_buf *tty_audit_buf;
>  #endif
> +#ifdef CONFIG_CGROUPS
> +       /*
> +        * The threadgroup_fork_lock prevents threads from forking with
> +        * CLONE_THREAD while held for writing. Use this for fork-sensitive
> +        * threadgroup-wide operations. It's taken for reading in fork.c in
> +        * copy_process().
> +        * Currently only needed write-side by cgroups.
> +        */
> +       struct rw_semaphore threadgroup_fork_lock;
> +#endif
>
>         int oom_adj;            /* OOM kill score adjustment (bit shift) */
>         int oom_score_adj;      /* OOM kill score adjustment */
> @@ -2270,6 +2282,31 @@ static inline void unlock_task_sighand(struct task_struct *tsk,
>         spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
>  }
>
> +/* See the declaration of threadgroup_fork_lock in signal_struct. */
> +#ifdef CONFIG_CGROUPS
> +static inline void threadgroup_fork_read_lock(struct task_struct *tsk)
> +{
> +       down_read(&tsk->signal->threadgroup_fork_lock);
> +}
> +static inline void threadgroup_fork_read_unlock(struct task_struct *tsk)
> +{
> +       up_read(&tsk->signal->threadgroup_fork_lock);
> +}
> +static inline void threadgroup_fork_write_lock(struct task_struct *tsk)
> +{
> +       down_write(&tsk->signal->threadgroup_fork_lock);
> +}
> +static inline void threadgroup_fork_write_unlock(struct task_struct *tsk)
> +{
> +       up_write(&tsk->signal->threadgroup_fork_lock);
> +}
> +#else
> +static inline void threadgroup_fork_read_lock(struct task_struct *tsk) {}
> +static inline void threadgroup_fork_read_unlock(struct task_struct *tsk) {}
> +static inline void threadgroup_fork_write_lock(struct task_struct *tsk) {}
> +static inline void threadgroup_fork_write_unlock(struct task_struct *tsk) {}
> +#endif
> +
>  #ifndef __HAVE_THREAD_FUNCTIONS
>
>  #define task_thread_info(task) ((struct thread_info *)(task)->stack)
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 0979527..aefe61f 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -905,6 +905,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
>
>         tty_audit_fork(sig);
>
> +#ifdef CONFIG_CGROUPS
> +       init_rwsem(&sig->threadgroup_fork_lock);
> +#endif
> +
>         sig->oom_adj = current->signal->oom_adj;
>         sig->oom_score_adj = current->signal->oom_score_adj;
>         sig->oom_score_adj_min = current->signal->oom_score_adj_min;
> @@ -1087,6 +1091,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>         monotonic_to_bootbased(&p->real_start_time);
>         p->io_context = NULL;
>         p->audit_context = NULL;
> +       if (clone_flags & CLONE_THREAD)
> +               threadgroup_fork_read_lock(current);
>         cgroup_fork(p);
>  #ifdef CONFIG_NUMA
>         p->mempolicy = mpol_dup(p->mempolicy);
> @@ -1294,6 +1300,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>         write_unlock_irq(&tasklist_lock);
>         proc_fork_connector(p);
>         cgroup_post_fork(p);
> +       if (clone_flags & CLONE_THREAD)
> +               threadgroup_fork_read_unlock(current);
>         perf_event_fork(p);
>         return p;
>
> @@ -1332,6 +1340,8 @@ bad_fork_cleanup_policy:
>         mpol_put(p->mempolicy);
>  bad_fork_cleanup_cgroup:
>  #endif
> +       if (clone_flags & CLONE_THREAD)
> +               threadgroup_fork_read_unlock(current);
>         cgroup_exit(p, cgroup_callbacks_done);
>         delayacct_tsk_free(p);
>         module_put(task_thread_info(p)->exec_domain->module);
>
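For reference, a minimal sketch of how the write side of this lock is meant
to be used by the cgroup code later in this series (cgroup_attach_proc is
the function added by the procs-write patch; cgroup_mutex is also held by
the caller, and error handling is omitted):

	/* sketch only: exclude CLONE_THREAD forks while moving the whole group */
	threadgroup_fork_write_lock(leader);
	retval = cgroup_attach_proc(cgrp, leader);
	threadgroup_fork_write_unlock(leader);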
On Mon, Feb 7, 2011 at 5:39 PM, Ben Blum <[email protected]> wrote:
> Add cgroup subsystem callbacks for per-thread attachment
>
> From: Ben Blum <[email protected]>
>
> This patch adds can_attach_task, pre_attach, and attach_task as new callbacks
> for cgroups's subsystem interface. Unlike can_attach and attach, these are for
> per-thread operations, to be called potentially many times when attaching an
> entire threadgroup.
>
> Also, the old "bool threadgroup" interface is removed, as it is replaced by this.
> All subsystems are modified for the new interface - of note is cpuset, which
> requires from/to nodemasks for attach to be globally scoped (though per-cpuset
> would work too) to persist from its pre_attach to attach_task and attach.
>
> This is a pre-patch for cgroup-procs-writable.patch.
>
> Signed-off-by: Ben Blum <[email protected]>
Reviewed-by: Paul Menage <[email protected]>
Paul
> ---
>  Documentation/cgroups/cgroups.txt |   30 ++++++++---
>  block/blk-cgroup.c                |   18 ++----
>  include/linux/cgroup.h            |   10 ++--
>  kernel/cgroup.c                   |   17 +++++-
>  kernel/cgroup_freezer.c           |   26 ++++-----
>  kernel/cpuset.c                   |  105 ++++++++++++++++---------------------
>  kernel/ns_cgroup.c                |   23 +++-----
>  kernel/sched.c                    |   38 +------------
>  mm/memcontrol.c                   |   18 ++----
>  security/device_cgroup.c          |    3 -
>  10 files changed, 122 insertions(+), 166 deletions(-)
>
> diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
> index 190018b..d3c9a24 100644
> --- a/Documentation/cgroups/cgroups.txt
> +++ b/Documentation/cgroups/cgroups.txt
> @@ -563,7 +563,7 @@ rmdir() will fail with it. From this behavior, pre_destroy() can be
> ?called multiple times against a cgroup.
>
> ?int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
> - ? ? ? ? ? ? ?struct task_struct *task, bool threadgroup)
> + ? ? ? ? ? ? ?struct task_struct *task)
> ?(cgroup_mutex held by caller)
>
> ?Called prior to moving a task into a cgroup; if the subsystem
> @@ -572,9 +572,14 @@ task is passed, then a successful result indicates that *any*
> ?unspecified task can be moved into the cgroup. Note that this isn't
> ?called on a fork. If this method returns 0 (success) then this should
> ?remain valid while the caller holds cgroup_mutex and it is ensured that either
> -attach() or cancel_attach() will be called in future. If threadgroup is
> -true, then a successful result indicates that all threads in the given
> -thread's threadgroup can be moved together.
> +attach() or cancel_attach() will be called in future.
> +
> +int can_attach_task(struct cgroup *cgrp, struct task_struct *tsk);
> +(cgroup_mutex held by caller)
> +
> +As can_attach, but for operations that must be run once per task to be
> +attached (possibly many when using cgroup_attach_proc). Called after
> +can_attach.
>
> ?void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
> ? ? ? ? ? ? ? struct task_struct *task, bool threadgroup)
> @@ -586,15 +591,24 @@ function, so that the subsystem can implement a rollback. If not, not necessary.
> ?This will be called only about subsystems whose can_attach() operation have
> ?succeeded.
>
> +void pre_attach(struct cgroup *cgrp);
> +(cgroup_mutex held by caller)
> +
> +For any non-per-thread attachment work that needs to happen before
> +attach_task. Needed by cpuset.
> +
> ?void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
> - ? ? ? ? ? struct cgroup *old_cgrp, struct task_struct *task,
> - ? ? ? ? ? bool threadgroup)
> + ? ? ? ? ? struct cgroup *old_cgrp, struct task_struct *task)
> ?(cgroup_mutex held by caller)
>
> ?Called after the task has been attached to the cgroup, to allow any
> ?post-attachment activity that requires memory allocations or blocking.
> -If threadgroup is true, the subsystem should take care of all threads
> -in the specified thread's threadgroup. Currently does not support any
> +
> +void attach_task(struct cgroup *cgrp, struct task_struct *tsk);
> +(cgroup_mutex held by caller)
> +
> +As attach, but for operations that must be run once per task to be attached,
> +like can_attach_task. Called before attach. Currently does not support any
> ?subsystem that might need the old_cgrp for every thread in the group.
>
> ?void fork(struct cgroup_subsy *ss, struct task_struct *task)
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index b1febd0..45b3809 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -30,10 +30,8 @@ EXPORT_SYMBOL_GPL(blkio_root_cgroup);
>
> ?static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?struct cgroup *);
> -static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct task_struct *, bool);
> -static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
> - ? ? ? ? ? ? ? ? ? ? ? ? ?struct cgroup *, struct task_struct *, bool);
> +static int blkiocg_can_attach_task(struct cgroup *, struct task_struct *);
> +static void blkiocg_attach_task(struct cgroup *, struct task_struct *);
> ?static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
> ?static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
>
> @@ -46,8 +44,8 @@ static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
> ?struct cgroup_subsys blkio_subsys = {
> ? ? ? ?.name = "blkio",
> ? ? ? ?.create = blkiocg_create,
> - ? ? ? .can_attach = blkiocg_can_attach,
> - ? ? ? .attach = blkiocg_attach,
> + ? ? ? .can_attach_task = blkiocg_can_attach_task,
> + ? ? ? .attach_task = blkiocg_attach_task,
> ? ? ? ?.destroy = blkiocg_destroy,
> ? ? ? ?.populate = blkiocg_populate,
> ?#ifdef CONFIG_BLK_CGROUP
> @@ -1475,9 +1473,7 @@ done:
> ?* of the main cic data structures. ?For now we allow a task to change
> ?* its cgroup only if it's the only owner of its ioc.
> ?*/
> -static int blkiocg_can_attach(struct cgroup_subsys *subsys,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct cgroup *cgroup, struct task_struct *tsk,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? bool threadgroup)
> +static int blkiocg_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
> ?{
> ? ? ? ?struct io_context *ioc;
> ? ? ? ?int ret = 0;
> @@ -1492,9 +1488,7 @@ static int blkiocg_can_attach(struct cgroup_subsys *subsys,
> ? ? ? ?return ret;
> ?}
>
> -static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct cgroup *prev, struct task_struct *tsk,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? bool threadgroup)
> +static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
> ?{
> ? ? ? ?struct io_context *ioc;
>
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index ce104e3..35b69b4 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -467,12 +467,14 @@ struct cgroup_subsys {
> ? ? ? ?int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
> ? ? ? ?void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
> ? ? ? ?int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
> - ? ? ? ? ? ? ? ? ? ? ? ? struct task_struct *tsk, bool threadgroup);
> + ? ? ? ? ? ? ? ? ? ? ? ? struct task_struct *tsk);
> + ? ? ? int (*can_attach_task)(struct cgroup *cgrp, struct task_struct *tsk);
> ? ? ? ?void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
> - ? ? ? ? ? ? ? ? ? ? ? ? struct task_struct *tsk, bool threadgroup);
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct task_struct *tsk);
> + ? ? ? void (*pre_attach)(struct cgroup *cgrp);
> + ? ? ? void (*attach_task)(struct cgroup *cgrp, struct task_struct *tsk);
> ? ? ? ?void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
> - ? ? ? ? ? ? ? ? ? ? ? struct cgroup *old_cgrp, struct task_struct *tsk,
> - ? ? ? ? ? ? ? ? ? ? ? bool threadgroup);
> + ? ? ? ? ? ? ? ? ? ? ?struct cgroup *old_cgrp, struct task_struct *tsk);
> ? ? ? ?void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
> ? ? ? ?void (*exit)(struct cgroup_subsys *ss, struct task_struct *task);
> ? ? ? ?int (*populate)(struct cgroup_subsys *ss,
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 66a416b..616f27a 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -1750,7 +1750,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
>
> ? ? ? ?for_each_subsys(root, ss) {
> ? ? ? ? ? ? ? ?if (ss->can_attach) {
> - ? ? ? ? ? ? ? ? ? ? ? retval = ss->can_attach(ss, cgrp, tsk, false);
> + ? ? ? ? ? ? ? ? ? ? ? retval = ss->can_attach(ss, cgrp, tsk);
> ? ? ? ? ? ? ? ? ? ? ? ?if (retval) {
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?/*
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? * Remember on which subsystem the can_attach()
> @@ -1762,6 +1762,13 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?goto out;
> ? ? ? ? ? ? ? ? ? ? ? ?}
> ? ? ? ? ? ? ? ?}
> + ? ? ? ? ? ? ? if (ss->can_attach_task) {
> + ? ? ? ? ? ? ? ? ? ? ? retval = ss->can_attach_task(cgrp, tsk);
> + ? ? ? ? ? ? ? ? ? ? ? if (retval) {
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? failed_ss = ss;
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? goto out;
> + ? ? ? ? ? ? ? ? ? ? ? }
> + ? ? ? ? ? ? ? }
> ? ? ? ?}
>
> ? ? ? ?task_lock(tsk);
> @@ -1798,8 +1805,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
> ? ? ? ?write_unlock(&css_set_lock);
>
> ? ? ? ?for_each_subsys(root, ss) {
> + ? ? ? ? ? ? ? if (ss->pre_attach)
> + ? ? ? ? ? ? ? ? ? ? ? ss->pre_attach(cgrp);
> + ? ? ? ? ? ? ? if (ss->attach_task)
> + ? ? ? ? ? ? ? ? ? ? ? ss->attach_task(cgrp, tsk);
> ? ? ? ? ? ? ? ?if (ss->attach)
> - ? ? ? ? ? ? ? ? ? ? ? ss->attach(ss, cgrp, oldcgrp, tsk, false);
> + ? ? ? ? ? ? ? ? ? ? ? ss->attach(ss, cgrp, oldcgrp, tsk);
> ? ? ? ?}
> ? ? ? ?set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
> ? ? ? ?synchronize_rcu();
> @@ -1822,7 +1833,7 @@ out:
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? */
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?break;
> ? ? ? ? ? ? ? ? ? ? ? ?if (ss->cancel_attach)
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ss->cancel_attach(ss, cgrp, tsk, false);
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ss->cancel_attach(ss, cgrp, tsk);
> ? ? ? ? ? ? ? ?}
> ? ? ? ?}
> ? ? ? ?return retval;
> diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
> index e7bebb7..e691818 100644
> --- a/kernel/cgroup_freezer.c
> +++ b/kernel/cgroup_freezer.c
> @@ -160,7 +160,7 @@ static void freezer_destroy(struct cgroup_subsys *ss,
> ?*/
> ?static int freezer_can_attach(struct cgroup_subsys *ss,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?struct cgroup *new_cgroup,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct task_struct *task, bool threadgroup)
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct task_struct *task)
> ?{
> ? ? ? ?struct freezer *freezer;
>
> @@ -172,26 +172,17 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
> ? ? ? ?if (freezer->state != CGROUP_THAWED)
> ? ? ? ? ? ? ? ?return -EBUSY;
>
> + ? ? ? return 0;
> +}
> +
> +static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
> +{
> ? ? ? ?rcu_read_lock();
> - ? ? ? if (__cgroup_freezing_or_frozen(task)) {
> + ? ? ? if (__cgroup_freezing_or_frozen(tsk)) {
> ? ? ? ? ? ? ? ?rcu_read_unlock();
> ? ? ? ? ? ? ? ?return -EBUSY;
> ? ? ? ?}
> ? ? ? ?rcu_read_unlock();
> -
> - ? ? ? if (threadgroup) {
> - ? ? ? ? ? ? ? struct task_struct *c;
> -
> - ? ? ? ? ? ? ? rcu_read_lock();
> - ? ? ? ? ? ? ? list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
> - ? ? ? ? ? ? ? ? ? ? ? if (__cgroup_freezing_or_frozen(c)) {
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? rcu_read_unlock();
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? return -EBUSY;
> - ? ? ? ? ? ? ? ? ? ? ? }
> - ? ? ? ? ? ? ? }
> - ? ? ? ? ? ? ? rcu_read_unlock();
> - ? ? ? }
> -
> ? ? ? ?return 0;
> ?}
>
> @@ -390,6 +381,9 @@ struct cgroup_subsys freezer_subsys = {
> ? ? ? ?.populate ? ? ? = freezer_populate,
> ? ? ? ?.subsys_id ? ? ?= freezer_subsys_id,
> ? ? ? ?.can_attach ? ? = freezer_can_attach,
> + ? ? ? .can_attach_task = freezer_can_attach_task,
> + ? ? ? .pre_attach ? ? = NULL,
> + ? ? ? .attach_task ? ?= NULL,
> ? ? ? ?.attach ? ? ? ? = NULL,
> ? ? ? ?.fork ? ? ? ? ? = freezer_fork,
> ? ? ? ?.exit ? ? ? ? ? = NULL,
> diff --git a/kernel/cpuset.c b/kernel/cpuset.c
> index 4349935..5f71ca2 100644
> --- a/kernel/cpuset.c
> +++ b/kernel/cpuset.c
> @@ -1372,14 +1372,10 @@ static int fmeter_getrate(struct fmeter *fmp)
> ? ? ? ?return val;
> ?}
>
> -/* Protected by cgroup_lock */
> -static cpumask_var_t cpus_attach;
> -
> ?/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
> ?static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ?struct task_struct *tsk, bool threadgroup)
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ?struct task_struct *tsk)
> ?{
> - ? ? ? int ret;
> ? ? ? ?struct cpuset *cs = cgroup_cs(cont);
>
> ? ? ? ?if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
> @@ -1396,29 +1392,42 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
> ? ? ? ?if (tsk->flags & PF_THREAD_BOUND)
> ? ? ? ? ? ? ? ?return -EINVAL;
>
> - ? ? ? ret = security_task_setscheduler(tsk);
> - ? ? ? if (ret)
> - ? ? ? ? ? ? ? return ret;
> - ? ? ? if (threadgroup) {
> - ? ? ? ? ? ? ? struct task_struct *c;
> -
> - ? ? ? ? ? ? ? rcu_read_lock();
> - ? ? ? ? ? ? ? list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
> - ? ? ? ? ? ? ? ? ? ? ? ret = security_task_setscheduler(c);
> - ? ? ? ? ? ? ? ? ? ? ? if (ret) {
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? rcu_read_unlock();
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? return ret;
> - ? ? ? ? ? ? ? ? ? ? ? }
> - ? ? ? ? ? ? ? }
> - ? ? ? ? ? ? ? rcu_read_unlock();
> - ? ? ? }
> ? ? ? ?return 0;
> ?}
>
> -static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?struct cpuset *cs)
> +static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
> +{
> + ? ? ? return security_task_setscheduler(task);
> +}
> +
> +/*
> + * Protected by cgroup_lock. The nodemasks must be stored globally because
> + * dynamically allocating them is not allowed in pre_attach, and they must
> + * persist among pre_attach, attach_task, and attach.
> + */
> +static cpumask_var_t cpus_attach;
> +static nodemask_t cpuset_attach_nodemask_from;
> +static nodemask_t cpuset_attach_nodemask_to;
> +
> +/* Set-up work for before attaching each task. */
> +static void cpuset_pre_attach(struct cgroup *cont)
> +{
> + ? ? ? struct cpuset *cs = cgroup_cs(cont);
> +
> + ? ? ? if (cs == &top_cpuset)
> + ? ? ? ? ? ? ? cpumask_copy(cpus_attach, cpu_possible_mask);
> + ? ? ? else
> + ? ? ? ? ? ? ? guarantee_online_cpus(cs, cpus_attach);
> +
> + ? ? ? guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
> +}
> +
> +/* Per-thread attachment work. */
> +static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
> ?{
> ? ? ? ?int err;
> + ? ? ? struct cpuset *cs = cgroup_cs(cont);
> +
> ? ? ? ?/*
> ? ? ? ? * can_attach beforehand should guarantee that this doesn't fail.
> ? ? ? ? * TODO: have a better way to handle failure here
> @@ -1426,56 +1435,31 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
> ? ? ? ?err = set_cpus_allowed_ptr(tsk, cpus_attach);
> ? ? ? ?WARN_ON_ONCE(err);
>
> - ? ? ? cpuset_change_task_nodemask(tsk, to);
> + ? ? ? cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
> ? ? ? ?cpuset_update_task_spread_flag(cs, tsk);
> -
> ?}
>
> ?static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
> - ? ? ? ? ? ? ? ? ? ? ? ? struct cgroup *oldcont, struct task_struct *tsk,
> - ? ? ? ? ? ? ? ? ? ? ? ? bool threadgroup)
> + ? ? ? ? ? ? ? ? ? ? ? ? struct cgroup *oldcont, struct task_struct *tsk)
> ?{
> ? ? ? ?struct mm_struct *mm;
> ? ? ? ?struct cpuset *cs = cgroup_cs(cont);
> ? ? ? ?struct cpuset *oldcs = cgroup_cs(oldcont);
> - ? ? ? NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
> - ? ? ? NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
> -
> - ? ? ? if (from == NULL || to == NULL)
> - ? ? ? ? ? ? ? goto alloc_fail;
>
> - ? ? ? if (cs == &top_cpuset) {
> - ? ? ? ? ? ? ? cpumask_copy(cpus_attach, cpu_possible_mask);
> - ? ? ? } else {
> - ? ? ? ? ? ? ? guarantee_online_cpus(cs, cpus_attach);
> - ? ? ? }
> - ? ? ? guarantee_online_mems(cs, to);
> -
> - ? ? ? /* do per-task migration stuff possibly for each in the threadgroup */
> - ? ? ? cpuset_attach_task(tsk, to, cs);
> - ? ? ? if (threadgroup) {
> - ? ? ? ? ? ? ? struct task_struct *c;
> - ? ? ? ? ? ? ? rcu_read_lock();
> - ? ? ? ? ? ? ? list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
> - ? ? ? ? ? ? ? ? ? ? ? cpuset_attach_task(c, to, cs);
> - ? ? ? ? ? ? ? }
> - ? ? ? ? ? ? ? rcu_read_unlock();
> - ? ? ? }
> -
> - ? ? ? /* change mm; only needs to be done once even if threadgroup */
> - ? ? ? *from = oldcs->mems_allowed;
> - ? ? ? *to = cs->mems_allowed;
> + ? ? ? /*
> + ? ? ? ?* Change mm, possibly for multiple threads in a threadgroup. This is
> + ? ? ? ?* expensive and may sleep.
> + ? ? ? ?*/
> + ? ? ? cpuset_attach_nodemask_from = oldcs->mems_allowed;
> + ? ? ? cpuset_attach_nodemask_to = cs->mems_allowed;
> ? ? ? ?mm = get_task_mm(tsk);
> ? ? ? ?if (mm) {
> - ? ? ? ? ? ? ? mpol_rebind_mm(mm, to);
> + ? ? ? ? ? ? ? mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
> ? ? ? ? ? ? ? ?if (is_memory_migrate(cs))
> - ? ? ? ? ? ? ? ? ? ? ? cpuset_migrate_mm(mm, from, to);
> + ? ? ? ? ? ? ? ? ? ? ? cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? &cpuset_attach_nodemask_to);
> ? ? ? ? ? ? ? ?mmput(mm);
> ? ? ? ?}
> -
> -alloc_fail:
> - ? ? ? NODEMASK_FREE(from);
> - ? ? ? NODEMASK_FREE(to);
> ?}
>
> ?/* The various types of files and directories in a cpuset file system */
> @@ -1928,6 +1912,9 @@ struct cgroup_subsys cpuset_subsys = {
> ? ? ? ?.create = cpuset_create,
> ? ? ? ?.destroy = cpuset_destroy,
> ? ? ? ?.can_attach = cpuset_can_attach,
> + ? ? ? .can_attach_task = cpuset_can_attach_task,
> + ? ? ? .pre_attach = cpuset_pre_attach,
> + ? ? ? .attach_task = cpuset_attach_task,
> ? ? ? ?.attach = cpuset_attach,
> ? ? ? ?.populate = cpuset_populate,
> ? ? ? ?.post_clone = cpuset_post_clone,
> diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
> index 2c98ad9..1fc2b1b 100644
> --- a/kernel/ns_cgroup.c
> +++ b/kernel/ns_cgroup.c
> @@ -43,7 +43,7 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
> ?* ? ? ? ?ancestor cgroup thereof)
> ?*/
> ?static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
> - ? ? ? ? ? ? ? ? ? ? ? ?struct task_struct *task, bool threadgroup)
> + ? ? ? ? ? ? ? ? ? ? ? ?struct task_struct *task)
> ?{
> ? ? ? ?if (current != task) {
> ? ? ? ? ? ? ? ?if (!capable(CAP_SYS_ADMIN))
> @@ -53,21 +53,13 @@ static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
> ? ? ? ? ? ? ? ? ? ? ? ?return -EPERM;
> ? ? ? ?}
>
> - ? ? ? if (!cgroup_is_descendant(new_cgroup, task))
> - ? ? ? ? ? ? ? return -EPERM;
> -
> - ? ? ? if (threadgroup) {
> - ? ? ? ? ? ? ? struct task_struct *c;
> - ? ? ? ? ? ? ? rcu_read_lock();
> - ? ? ? ? ? ? ? list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
> - ? ? ? ? ? ? ? ? ? ? ? if (!cgroup_is_descendant(new_cgroup, c)) {
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? rcu_read_unlock();
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? return -EPERM;
> - ? ? ? ? ? ? ? ? ? ? ? }
> - ? ? ? ? ? ? ? }
> - ? ? ? ? ? ? ? rcu_read_unlock();
> - ? ? ? }
> + ? ? ? return 0;
> +}
>
> +static int ns_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
> +{
> + ? ? ? if (!cgroup_is_descendant(cgrp, tsk))
> + ? ? ? ? ? ? ? return -EPERM;
> ? ? ? ?return 0;
> ?}
>
> @@ -112,6 +104,7 @@ static void ns_destroy(struct cgroup_subsys *ss,
> ?struct cgroup_subsys ns_subsys = {
> ? ? ? ?.name = "ns",
> ? ? ? ?.can_attach = ns_can_attach,
> + ? ? ? .can_attach_task = ns_can_attach_task,
> ? ? ? ?.create = ns_create,
> ? ? ? ?.destroy ?= ns_destroy,
> ? ? ? ?.subsys_id = ns_subsys_id,
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 218ef20..d619f1d 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -8655,42 +8655,10 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
> ? ? ? ?return 0;
> ?}
>
> -static int
> -cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
> - ? ? ? ? ? ? ? ? ? ? struct task_struct *tsk, bool threadgroup)
> -{
> - ? ? ? int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
> - ? ? ? if (retval)
> - ? ? ? ? ? ? ? return retval;
> - ? ? ? if (threadgroup) {
> - ? ? ? ? ? ? ? struct task_struct *c;
> - ? ? ? ? ? ? ? rcu_read_lock();
> - ? ? ? ? ? ? ? list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
> - ? ? ? ? ? ? ? ? ? ? ? retval = cpu_cgroup_can_attach_task(cgrp, c);
> - ? ? ? ? ? ? ? ? ? ? ? if (retval) {
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? rcu_read_unlock();
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? return retval;
> - ? ? ? ? ? ? ? ? ? ? ? }
> - ? ? ? ? ? ? ? }
> - ? ? ? ? ? ? ? rcu_read_unlock();
> - ? ? ? }
> - ? ? ? return 0;
> -}
> -
> ?static void
> -cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
> - ? ? ? ? ? ? ? ? struct cgroup *old_cont, struct task_struct *tsk,
> - ? ? ? ? ? ? ? ? bool threadgroup)
> +cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
> ?{
> ? ? ? ?sched_move_task(tsk);
> - ? ? ? if (threadgroup) {
> - ? ? ? ? ? ? ? struct task_struct *c;
> - ? ? ? ? ? ? ? rcu_read_lock();
> - ? ? ? ? ? ? ? list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
> - ? ? ? ? ? ? ? ? ? ? ? sched_move_task(c);
> - ? ? ? ? ? ? ? }
> - ? ? ? ? ? ? ? rcu_read_unlock();
> - ? ? ? }
> ?}
>
> ?#ifdef CONFIG_FAIR_GROUP_SCHED
> @@ -8763,8 +8731,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
> ? ? ? ?.name ? ? ? ? ? = "cpu",
> ? ? ? ?.create ? ? ? ? = cpu_cgroup_create,
> ? ? ? ?.destroy ? ? ? ?= cpu_cgroup_destroy,
> - ? ? ? .can_attach ? ? = cpu_cgroup_can_attach,
> - ? ? ? .attach ? ? ? ? = cpu_cgroup_attach,
> + ? ? ? .can_attach_task = cpu_cgroup_can_attach_task,
> + ? ? ? .attach_task ? ?= cpu_cgroup_attach_task,
> ? ? ? ?.populate ? ? ? = cpu_cgroup_populate,
> ? ? ? ?.subsys_id ? ? ?= cpu_cgroup_subsys_id,
> ? ? ? ?.early_init ? ? = 1,
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 729beb7..995f0b9 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4720,8 +4720,7 @@ static void mem_cgroup_clear_mc(void)
>
> ?static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?struct cgroup *cgroup,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct task_struct *p,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? bool threadgroup)
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct task_struct *p)
> ?{
> ? ? ? ?int ret = 0;
> ? ? ? ?struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
> @@ -4775,8 +4774,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
>
> ?static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?struct cgroup *cgroup,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct task_struct *p,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? bool threadgroup)
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct task_struct *p)
> ?{
> ? ? ? ?mem_cgroup_clear_mc();
> ?}
> @@ -4880,8 +4878,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
> ?static void mem_cgroup_move_task(struct cgroup_subsys *ss,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?struct cgroup *cont,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?struct cgroup *old_cont,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct task_struct *p,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? bool threadgroup)
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct task_struct *p)
> ?{
> ? ? ? ?if (!mc.mm)
> ? ? ? ? ? ? ? ?/* no need to move charge */
> @@ -4893,22 +4890,19 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
> ?#else ?/* !CONFIG_MMU */
> ?static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?struct cgroup *cgroup,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct task_struct *p,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? bool threadgroup)
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct task_struct *p)
> ?{
> ? ? ? ?return 0;
> ?}
> ?static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?struct cgroup *cgroup,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct task_struct *p,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? bool threadgroup)
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct task_struct *p)
> ?{
> ?}
> ?static void mem_cgroup_move_task(struct cgroup_subsys *ss,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?struct cgroup *cont,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?struct cgroup *old_cont,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct task_struct *p,
> - ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? bool threadgroup)
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct task_struct *p)
> ?{
> ?}
> ?#endif
> diff --git a/security/device_cgroup.c b/security/device_cgroup.c
> index 8d9c48f..cd1f779 100644
> --- a/security/device_cgroup.c
> +++ b/security/device_cgroup.c
> @@ -62,8 +62,7 @@ static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
> ?struct cgroup_subsys devices_subsys;
>
> ?static int devcgroup_can_attach(struct cgroup_subsys *ss,
> - ? ? ? ? ? ? ? struct cgroup *new_cgroup, struct task_struct *task,
> - ? ? ? ? ? ? ? bool threadgroup)
> + ? ? ? ? ? ? ? struct cgroup *new_cgroup, struct task_struct *task)
> ?{
> ? ? ? ?if (current != task && !capable(CAP_SYS_ADMIN))
> ? ? ? ? ? ? ? ? ? ? ? ?return -EPERM;
>
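To make the new callback split concrete, a minimal subsystem using only the
per-thread hooks would look roughly like the sketch below. The "foo" names
are made up for illustration, and the usual create/destroy/subsys_id fields
are omitted:

	static int foo_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
	{
		/* per-thread admission check; may be called many times per attach */
		return 0;
	}

	static void foo_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
	{
		/* per-thread commit work; runs after the whole group passed checks */
	}

	struct cgroup_subsys foo_subsys = {
		.name		 = "foo",
		.can_attach_task = foo_can_attach_task,
		.attach_task	 = foo_attach_task,
		/* .can_attach, .pre_attach and .attach remain whole-group hooks */
	};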
On Mon, Feb 7, 2011 at 5:39 PM, Ben Blum <[email protected]> wrote:
> Makes procs file writable to move all threads by tgid at once
>
> From: Ben Blum <[email protected]>
>
> This patch adds functionality that enables users to move all threads in a
> threadgroup at once to a cgroup by writing the tgid to the 'cgroup.procs'
> file. This current implementation makes use of a per-threadgroup rwsem that's
> taken for reading in the fork() path to prevent newly forking threads within
> the threadgroup from "escaping" while the move is in progress.
>
> Signed-off-by: Ben Blum <[email protected]>
> ---
> + ? ? ? /* remember the number of threads in the array for later. */
> + ? ? ? BUG_ON(i == 0);
This BUG_ON() seems unnecessary, given the i++ directly above it.
> + ? ? ? group_size = i;
> + ? ? ? rcu_read_unlock();
> +
> + ? ? ? /*
> + ? ? ? ?* step 1: check that we can legitimately attach to the cgroup.
> + ? ? ? ?*/
> + ? ? ? for_each_subsys(root, ss) {
> + ? ? ? ? ? ? ? if (ss->can_attach) {
> + ? ? ? ? ? ? ? ? ? ? ? retval = ss->can_attach(ss, cgrp, leader);
> + ? ? ? ? ? ? ? ? ? ? ? if (retval) {
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? failed_ss = ss;
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? goto out_cancel_attach;
> + ? ? ? ? ? ? ? ? ? ? ? }
> + ? ? ? ? ? ? ? }
> + ? ? ? ? ? ? ? /* a callback to be run on every thread in the threadgroup. */
> + ? ? ? ? ? ? ? if (ss->can_attach_task) {
> + ? ? ? ? ? ? ? ? ? ? ? /* run on each task in the threadgroup. */
> + ? ? ? ? ? ? ? ? ? ? ? for (i = 0; i < group_size; i++) {
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? retval = ss->can_attach_task(cgrp, group[i]);
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? if (retval) {
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? failed_ss = ss;
Should we be setting failed_ss here? Doesn't that mean that if all
subsystems pass the can_attach() check but the first one fails a
can_attach_task() check, we don't call any cancel_attach() methods?
What are the rollback semantics for failing a can_attach_task() check?
> + ? ? ? ? ? ? ? if (threadgroup) {
> + ? ? ? ? ? ? ? ? ? ? ? /*
> + ? ? ? ? ? ? ? ? ? ? ? ?* it is safe to find group_leader because tsk was found
> + ? ? ? ? ? ? ? ? ? ? ? ?* in the tid map, meaning it can't have been unhashed
> + ? ? ? ? ? ? ? ? ? ? ? ?* by someone in de_thread changing the leadership.
> + ? ? ? ? ? ? ? ? ? ? ? ?*/
> + ? ? ? ? ? ? ? ? ? ? ? tsk = tsk->group_leader;
> + ? ? ? ? ? ? ? ? ? ? ? BUG_ON(!thread_group_leader(tsk));
Can this race with an exiting/execing group leader?
> + ? ? ? ? ? ? ? } else if (tsk->flags & PF_EXITING) {
The check for PF_EXITING doesn't apply to group leaders?
> + ? ? ? ? ? ? ? ? ? ? ? /* optimization for the single-task-only case */
> + ? ? ? ? ? ? ? ? ? ? ? rcu_read_unlock();
> + ? ? ? ? ? ? ? ? ? ? ? cgroup_unlock();
> ? ? ? ? ? ? ? ? ? ? ? ?return -ESRCH;
> ? ? ? ? ? ? ? ?}
>
> + ? ? ? ? ? ? ? /*
> + ? ? ? ? ? ? ? ?* even if we're attaching all tasks in the thread group, we
> + ? ? ? ? ? ? ? ?* only need to check permissions on one of them.
> + ? ? ? ? ? ? ? ?*/
> ? ? ? ? ? ? ? ?tcred = __task_cred(tsk);
> ? ? ? ? ? ? ? ?if (cred->euid &&
> ? ? ? ? ? ? ? ? ? ?cred->euid != tcred->uid &&
> ? ? ? ? ? ? ? ? ? ?cred->euid != tcred->suid) {
> ? ? ? ? ? ? ? ? ? ? ? ?rcu_read_unlock();
> + ? ? ? ? ? ? ? ? ? ? ? cgroup_unlock();
> ? ? ? ? ? ? ? ? ? ? ? ?return -EACCES;
Maybe turn these returns into "goto out;" statements and put the
unlock after the out: label?
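The restructuring being suggested would look roughly like this (the label
name is hypothetical; Ben explains further down in the thread why he prefers
the inline unlocks, since the success path drops rcu_read_lock mid-function):

	if (cred->euid &&
	    cred->euid != tcred->uid &&
	    cred->euid != tcred->suid) {
		retval = -EACCES;
		goto out_unlock;
	}
	...
out_unlock:
	rcu_read_unlock();
	cgroup_unlock();
	return retval;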
On Thu, Mar 03, 2011 at 10:38:58AM -0800, Paul Menage wrote:
> On Mon, Feb 7, 2011 at 5:39 PM, Ben Blum <[email protected]> wrote:
> > Makes procs file writable to move all threads by tgid at once
> >
> > From: Ben Blum <[email protected]>
> >
> > This patch adds functionality that enables users to move all threads in a
> > threadgroup at once to a cgroup by writing the tgid to the 'cgroup.procs'
> > file. This current implementation makes use of a per-threadgroup rwsem that's
> > taken for reading in the fork() path to prevent newly forking threads within
> > the threadgroup from "escaping" while the move is in progress.
> >
> > Signed-off-by: Ben Blum <[email protected]>
> > ---
> > + ? ? ? /* remember the number of threads in the array for later. */
> > + ? ? ? BUG_ON(i == 0);
>
> This BUG_ON() seems unnecessary, given the i++ directly above it.
It's meant to communicate that the loop must go through at least once,
so that 'struct cgroup *oldcgrp' will be initialised within a loop later
(setting it to NULL in the beginning is just to shut up the compiler.)
>
> > + ? ? ? group_size = i;
> > + ? ? ? rcu_read_unlock();
> > +
> > + ? ? ? /*
> > + ? ? ? ?* step 1: check that we can legitimately attach to the cgroup.
> > + ? ? ? ?*/
> > + ? ? ? for_each_subsys(root, ss) {
> > + ? ? ? ? ? ? ? if (ss->can_attach) {
> > + ? ? ? ? ? ? ? ? ? ? ? retval = ss->can_attach(ss, cgrp, leader);
> > + ? ? ? ? ? ? ? ? ? ? ? if (retval) {
> > + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? failed_ss = ss;
> > + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? goto out_cancel_attach;
> > + ? ? ? ? ? ? ? ? ? ? ? }
> > + ? ? ? ? ? ? ? }
> > + ? ? ? ? ? ? ? /* a callback to be run on every thread in the threadgroup. */
> > + ? ? ? ? ? ? ? if (ss->can_attach_task) {
> > + ? ? ? ? ? ? ? ? ? ? ? /* run on each task in the threadgroup. */
> > + ? ? ? ? ? ? ? ? ? ? ? for (i = 0; i < group_size; i++) {
> > + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? retval = ss->can_attach_task(cgrp, group[i]);
> > + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? if (retval) {
> > + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? failed_ss = ss;
>
> Should we be setting failed_ss here? Doesn't that mean that if all
> subsystems pass the can_attach() check but the first one fails a
> can_attach_task() check, we don't call any cancel_attach() methods?
>
> What are the rollback semantics for failing a can_attach_task() check?
They are not called in that order - it's for_each_subsys { can_attach();
can_attach_task(); }. Although if the deal is that cancel_attach reverts
the things that can_attach does (and can_attach_task is separate) (is
this the case? it should probably go in the documentation), then passing
a can_attach and failing a can_attach_task should cause cancel_attach to
get called for that subsystem, which in this code it doesn't. Something
like:
    retval = ss->can_attach();
    if (retval) {
        failed_ss = ss;
        goto out_cancel_attach;
    }
    retval = ss->can_attach_task();
    if (retval) {
        failed_ss = ss;
        cancel_extra_ss = true;
        goto out_cancel_attach;
    }
    ...
out_cancel_attach:
    if (retval) {
        for_each_subsys(root, ss) {
            if (ss == failed_ss) {
                if (cancel_extra_ss)
                    ss->cancel_attach();
                break;
            }
            ss->cancel_attach();
        }
    }
>
> > + ? ? ? ? ? ? ? if (threadgroup) {
> > + ? ? ? ? ? ? ? ? ? ? ? /*
> > + ? ? ? ? ? ? ? ? ? ? ? ?* it is safe to find group_leader because tsk was found
> > + ? ? ? ? ? ? ? ? ? ? ? ?* in the tid map, meaning it can't have been unhashed
> > + ? ? ? ? ? ? ? ? ? ? ? ?* by someone in de_thread changing the leadership.
> > + ? ? ? ? ? ? ? ? ? ? ? ?*/
> > + ? ? ? ? ? ? ? ? ? ? ? tsk = tsk->group_leader;
> > + ? ? ? ? ? ? ? ? ? ? ? BUG_ON(!thread_group_leader(tsk));
>
> Can this race with an exiting/execing group leader?
No, rcu_read_lock() is held.
>
> > + ? ? ? ? ? ? ? } else if (tsk->flags & PF_EXITING) {
>
> The check for PF_EXITING doesn't apply to group leaders?
I remember discussing this bit a while back - the point being that if the
leader is PF_EXITING, we should still iterate over its group list.
(However, I did try to test it, and it looks like if a leader calls
sys_exit() then the whole group goes away; is this actually guaranteed?)
>
> > + ? ? ? ? ? ? ? ? ? ? ? /* optimization for the single-task-only case */
> > + ? ? ? ? ? ? ? ? ? ? ? rcu_read_unlock();
> > + ? ? ? ? ? ? ? ? ? ? ? cgroup_unlock();
> > ? ? ? ? ? ? ? ? ? ? ? ?return -ESRCH;
> > ? ? ? ? ? ? ? ?}
> >
> > + ? ? ? ? ? ? ? /*
> > + ? ? ? ? ? ? ? ?* even if we're attaching all tasks in the thread group, we
> > + ? ? ? ? ? ? ? ?* only need to check permissions on one of them.
> > + ? ? ? ? ? ? ? ?*/
> > ? ? ? ? ? ? ? ?tcred = __task_cred(tsk);
> > ? ? ? ? ? ? ? ?if (cred->euid &&
> > ? ? ? ? ? ? ? ? ? ?cred->euid != tcred->uid &&
> > ? ? ? ? ? ? ? ? ? ?cred->euid != tcred->suid) {
> > ? ? ? ? ? ? ? ? ? ? ? ?rcu_read_unlock();
> > + ? ? ? ? ? ? ? ? ? ? ? cgroup_unlock();
> > ? ? ? ? ? ? ? ? ? ? ? ?return -EACCES;
>
> Maybe turn these returns into "goto out;" statements and put the
> unlock after the out: label?
>
Maybe; I didn't look too hard at that function. If I revise the patch I
can do this, though.
Thanks,
Ben
On Wed, Mar 9, 2011 at 10:18 PM, Ben Blum <[email protected]> wrote:
>> This BUG_ON() seems unnecessary, given the i++ directly above it.
>
> It's meant to communicate that the loop must go through at least once,
> so that 'struct cgroup *oldcgrp' will be initialised within a loop later
> (setting it to NULL in the beginning is just to shut up the compiler.)
Right, but it's a do {} while() loop with no break in it - it's
impossible to not go through at least once...
>> Should we be setting failed_ss here? Doesn't that mean that if all
>> subsystems pass the can_attach() check but the first one fails a
>> can_attach_task() check, we don't call any cancel_attach() methods?
>>
>> What are the rollback semantics for failing a can_attach_task() check?
>
> They are not called in that order - it's for_each_subsys { can_attach();
> can_attach_task(); }.
Oh, fair point - I misread that.
> Although if the deal is that cancel_attach reverts
> the things that can_attach does (and can_attach_task is separate) (is
> this the case? it should probably go in the documentation), then passing
> a can_attach and failing a can_attach_task should cause cancel_attach to
> get called for that subsystem, which in this code it doesn't. Something
> like:
>
>    retval = ss->can_attach();
>    if (retval) {
>        failed_ss = ss;
>        goto out_cancel_attach;
>    }
>    retval = ss->can_attach_task();
>    if (retval) {
>        failed_ss = ss;
>        cancel_extra_ss = true;
>        goto out_cancel_attach;
>    }
Yes, but maybe call the flag cancel_failed_ss? Slightly more obvious,
to me at least.
>> > + ? ? ? ? ? ? ? ? ? ? ? BUG_ON(!thread_group_leader(tsk));
>>
>> Can this race with an exiting/execing group leader?
>
> No, rcu_read_lock() is held.
>
But rcu_read_lock() doesn't stop any actions - it just stops the data
structures from going away. Can't leadership change during an
execve()?
> (However, I did try to test it, and it looks like if a leader calls
> sys_exit() then the whole group goes away; is this actually guaranteed?)
I think so, but maybe not instantaneously.
Paul
On Thu, Mar 10, 2011 at 12:01:29PM -0800, Paul Menage wrote:
> On Wed, Mar 9, 2011 at 10:18 PM, Ben Blum <[email protected]> wrote:
> >> This BUG_ON() seems unnecessary, given the i++ directly above it.
> >
> > It's meant to communicate that the loop must go through at least once,
> > so that 'struct cgroup *oldcgrp' will be initialised within a loop later
> > (setting it to NULL in the beginning is just to shut up the compiler.)
>
> Right, but it's a do {} while() loop with no break in it - it's
> impossible to not go through at least once...
OK; I guess it can go.
> > Although if the deal is that cancel_attach reverts
> > the things that can_attach does (and can_attach_task is separate) (is
> > this the case? it should probably go in the documentation), then passing
> > a can_attach and failing a can_attach_task should cause cancel_attach to
> > get called for that subsystem, which in this code it doesn't. Something
> > like:
> >
> >    retval = ss->can_attach();
> >    if (retval) {
> >        failed_ss = ss;
> >        goto out_cancel_attach;
> >    }
> >    retval = ss->can_attach_task();
> >    if (retval) {
> >        failed_ss = ss;
> >        cancel_extra_ss = true;
> >        goto out_cancel_attach;
> >    }
>
> Yes, but maybe call the flag cancel_failed_ss? Slightly more obvious,
> to me at least.
Sounds good.
> >> > + ? ? ? ? ? ? ? ? ? ? ? BUG_ON(!thread_group_leader(tsk));
> >>
> >> Can this race with an exiting/execing group leader?
> >
> > No, rcu_read_lock() is held.
> >
>
> But rcu_read_lock() doesn't stop any actions - it just stops the data
> structures from going away. Can't leadership change during an
> execve()?
Hmm, you may be right; my understanding of RCU is not complete. But
actually I think the BUG_ON should just be removed, since we're about to
drop locks before handing off to cgroup_attach_proc anyway (so at no
important part is the assertion guaranteed), which will detect and
EAGAIN if such a race happened.
> > (However, I did try to test it, and it looks like if a leader calls
> > sys_exit() then the whole group goes away; is this actually guaranteed?)
>
> I think so, but maybe not instantaneously.
>
> Paul
>
Hmm, well, should I make this assumption, then? The code would not be
more complicated either way, really. I kind of prefer it as it is...
-- Ben
On Tue, Mar 15, 2011 at 2:13 PM, Ben Blum <[email protected]> wrote:
>
> Hmm, you may be right; my understanding of RCU is not complete. But
> actually I think the BUG_ON should just be removed, since we're about to
> drop locks before handing off to cgroup_attach_proc anyway (so at no
> important part is the assertion guaranteed), which will detect and
> EAGAIN if such a race happened.
Sounds good.
>
> Hmm, well, should I make this assumption, then? The code would not be
> more complicated either way, really. I kind of prefer it as it is...
>
OK, I guess either way is OK until we can prove otherwise :-)
Paul
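For context, the -EAGAIN fallback being discussed is consumed by the
procs-write entry point. Very roughly - this is a sketch of the behaviour
described above, not the literal code, and find_leader() is a hypothetical
placeholder for re-resolving the written pid:

	do {
		/* re-find the leader; de_thread() may have changed it */
		leader = find_leader(pid);	/* hypothetical helper */
		retval = cgroup_attach_proc(cgrp, leader);
	} while (retval == -EAGAIN);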
On Thu, Mar 10, 2011 at 01:18:31AM -0500, Ben Blum wrote:
> > > + ? ? ? ? ? ? ? ? ? ? ? /* optimization for the single-task-only case */
> > > + ? ? ? ? ? ? ? ? ? ? ? rcu_read_unlock();
> > > + ? ? ? ? ? ? ? ? ? ? ? cgroup_unlock();
> > > ? ? ? ? ? ? ? ? ? ? ? ?return -ESRCH;
> > > ? ? ? ? ? ? ? ?}
> > >
> > > + ? ? ? ? ? ? ? /*
> > > + ? ? ? ? ? ? ? ?* even if we're attaching all tasks in the thread group, we
> > > + ? ? ? ? ? ? ? ?* only need to check permissions on one of them.
> > > + ? ? ? ? ? ? ? ?*/
> > > ? ? ? ? ? ? ? ?tcred = __task_cred(tsk);
> > > ? ? ? ? ? ? ? ?if (cred->euid &&
> > > ? ? ? ? ? ? ? ? ? ?cred->euid != tcred->uid &&
> > > ? ? ? ? ? ? ? ? ? ?cred->euid != tcred->suid) {
> > > ? ? ? ? ? ? ? ? ? ? ? ?rcu_read_unlock();
> > > + ? ? ? ? ? ? ? ? ? ? ? cgroup_unlock();
> > > ? ? ? ? ? ? ? ? ? ? ? ?return -EACCES;
> >
> > Maybe turn these returns into "goto out;" statements and put the
> > unlock after the out: label?
> >
>
> Maybe; I didn't look too hard at that function. If I revise the patch I
> can do this, though.
Looking back, I think I like it the way it is. Coalescing those unlock
paths would make it less clear... rcu_read_lock is dropped in the middle of
the function (on the success path), so a common bailout label would move
the failure handling far away from where it's relevant.
-- Ben
On Thu, Mar 03, 2011 at 09:48:09AM -0800, Paul Menage wrote:
> On Wed, Feb 16, 2011 at 11:22 AM, Ben Blum <[email protected]> wrote:
> > Convert cgroup_attach_proc to use flex_array.
> >
> > From: Ben Blum <[email protected]>
> >
> > The cgroup_attach_proc implementation requires a pre-allocated array to store
> > task pointers to atomically move a thread-group, but asking for a monolithic
> > array with kmalloc() may be unreliable for very large groups. Using flex_array
> > provides the same functionality with less risk of failure.
> >
> > This is a post-patch for cgroup-procs-write.patch.
> >
> > Signed-off-by: Ben Blum <[email protected]>
>
> Reviewed-by: Paul Menage <[email protected]>
>
> Looks fine from a correctness point of view, but I'd be inclined to
> reduce the verbosity - rather than
>
> tsk = flex_array_get_ptr(group, i);
> BUG_ON(tsk == NULL);
> retval = ss->can_attach_task(cgrp, tsk);
>
> I'd just have
>
> retval = ss->can_attach_task(cgrp, flex_array_get_ptr(group, i));
>
> I don't think you need to be so defensive about flex_array's behaviour.
>
> Paul
>
hmm, in this case that change would make it cross 80 columns (and I
liked consistency). ;)
I've removed the BUG_ONs, though.
-- Ben
Convert cgroup_attach_proc to use flex_array.
From: Ben Blum <[email protected]>
The cgroup_attach_proc implementation requires a pre-allocated array to store
task pointers to atomically move a thread-group, but asking for a monolithic
array with kmalloc() may be unreliable for very large groups. Using flex_array
provides the same functionality with less risk of failure.
This is a post-patch for cgroup-procs-write.patch.
Signed-off-by: Ben Blum <[email protected]>
---
kernel/cgroup.c | 33 ++++++++++++++++++++++++---------
1 files changed, 24 insertions(+), 9 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 273633c..92aa794 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,6 +57,7 @@
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/eventfd.h>
#include <linux/poll.h>
+#include <linux/flex_array.h> /* used in cgroup_attach_proc */
#include <asm/atomic.h>
@@ -1986,7 +1987,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
struct cgroupfs_root *root = cgrp->root;
/* threadgroup list cursor and array */
struct task_struct *tsk;
- struct task_struct **group;
+ struct flex_array *group;
/*
* we need to make sure we have css_sets for all the tasks we're
* going to move -before- we actually start moving them, so that in
@@ -2003,9 +2004,15 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
* and if threads exit, this will just be an over-estimate.
*/
group_size = get_nr_threads(leader);
- group = kmalloc(group_size * sizeof(*group), GFP_KERNEL);
+ /* flex_array supports very large thread-groups better than kmalloc. */
+ group = flex_array_alloc(sizeof(struct task_struct *), group_size,
+ GFP_KERNEL);
if (!group)
return -ENOMEM;
+ /* pre-allocate to guarantee space while iterating in rcu read-side. */
+ retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
+ if (retval)
+ goto out_free_group_list;
/* prevent changes to the threadgroup list while we take a snapshot. */
rcu_read_lock();
@@ -2028,7 +2035,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
/* as per above, nr_threads may decrease, but not increase. */
BUG_ON(i >= group_size);
get_task_struct(tsk);
- group[i] = tsk;
+ /*
+ * saying GFP_ATOMIC has no effect here because we did prealloc
+ * earlier, but it's good form to communicate our expectations.
+ */
+ retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
+ BUG_ON(retval != 0);
i++;
} while_each_thread(leader, tsk);
/* remember the number of threads in the array for later. */
@@ -2050,7 +2062,8 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
if (ss->can_attach_task) {
/* run on each task in the threadgroup. */
for (i = 0; i < group_size; i++) {
- retval = ss->can_attach_task(cgrp, group[i]);
+ tsk = flex_array_get_ptr(group, i);
+ retval = ss->can_attach_task(cgrp, tsk);
if (retval) {
failed_ss = ss;
cancel_failed_ss = true;
@@ -2066,7 +2079,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
*/
INIT_LIST_HEAD(&newcg_list);
for (i = 0; i < group_size; i++) {
- tsk = group[i];
+ tsk = flex_array_get_ptr(group, i);
/* nothing to do if this task is already in the cgroup */
oldcgrp = task_cgroup_from_root(tsk, root);
if (cgrp == oldcgrp)
@@ -2105,7 +2118,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
ss->pre_attach(cgrp);
}
for (i = 0; i < group_size; i++) {
- tsk = group[i];
+ tsk = flex_array_get_ptr(group, i);
/* leave current thread as it is if it's already there */
oldcgrp = task_cgroup_from_root(tsk, root);
if (cgrp == oldcgrp)
@@ -2158,10 +2171,12 @@ out_cancel_attach:
}
}
/* clean up the array of referenced threads in the group. */
- for (i = 0; i < group_size; i++)
- put_task_struct(group[i]);
+ for (i = 0; i < group_size; i++) {
+ tsk = flex_array_get_ptr(group, i);
+ put_task_struct(tsk);
+ }
out_free_group_list:
- kfree(group);
+ flex_array_free(group);
return retval;
}
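For readers unfamiliar with flex_array, the pattern used above boils down
to the outline below (return-value checks omitted; the prealloc bounds
mirror the patch's usage):

	struct flex_array *fa;

	fa = flex_array_alloc(sizeof(struct task_struct *), nr, GFP_KERNEL);
	flex_array_prealloc(fa, 0, nr - 1, GFP_KERNEL);	/* reserve every slot up front */
	flex_array_put_ptr(fa, i, tsk, GFP_ATOMIC);	/* should not fail after prealloc */
	tsk = flex_array_get_ptr(fa, i);
	flex_array_free(fa);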
Makes procs file writable to move all threads by tgid at once
From: Ben Blum <[email protected]>
This patch adds functionality that enables users to move all threads in a
threadgroup at once to a cgroup by writing the tgid to the 'cgroup.procs'
file. This current implementation makes use of a per-threadgroup rwsem that's
taken for reading in the fork() path to prevent newly forking threads within
the threadgroup from "escaping" while the move is in progress.
Signed-off-by: Ben Blum <[email protected]>
---
Documentation/cgroups/cgroups.txt | 9 +
kernel/cgroup.c | 441 +++++++++++++++++++++++++++++++++----
2 files changed, 401 insertions(+), 49 deletions(-)
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index d3c9a24..92d93d6 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -236,7 +236,8 @@ containing the following files describing that cgroup:
- cgroup.procs: list of tgids in the cgroup. This list is not
guaranteed to be sorted or free of duplicate tgids, and userspace
should sort/uniquify the list if this property is required.
- This is a read-only file, for now.
+ Writing a thread group id into this file moves all threads in that
+ group into this cgroup.
- notify_on_release flag: run the release agent on exit?
- release_agent: the path to use for release notifications (this file
exists in the top cgroup only)
@@ -426,6 +427,12 @@ You can attach the current shell task by echoing 0:
# echo 0 > tasks
+You can use the cgroup.procs file instead of the tasks file to move all
+threads in a threadgroup at once. Echoing the pid of any task in a
+threadgroup to cgroup.procs causes all tasks in that threadgroup to be
+attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
+in the writing task's threadgroup.
+
2.3 Mounting hierarchies by name
--------------------------------
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 616f27a..273633c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1726,6 +1726,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
}
EXPORT_SYMBOL_GPL(cgroup_path);
+/*
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ *
+ * 'guarantee' is set if the caller promises that a new css_set for the task
+ * will already exist. If not set, this function might sleep, and can fail with
+ * -ENOMEM. Otherwise, it can only fail with -ESRCH.
+ */
+static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+ struct task_struct *tsk, bool guarantee)
+{
+ struct css_set *oldcg;
+ struct css_set *newcg;
+
+ /*
+ * get old css_set. we need to take task_lock and refcount it, because
+ * an exiting task can change its css_set to init_css_set and drop its
+ * old one without taking cgroup_mutex.
+ */
+ task_lock(tsk);
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
+ task_unlock(tsk);
+
+ /* locate or allocate a new css_set for this task. */
+ if (guarantee) {
+ /* we know the css_set we want already exists. */
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(oldcg, cgrp, template);
+ BUG_ON(!newcg);
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+ } else {
+ might_sleep();
+ /* find_css_set will give us newcg already referenced. */
+ newcg = find_css_set(oldcg, cgrp);
+ if (!newcg) {
+ put_css_set(oldcg);
+ return -ENOMEM;
+ }
+ }
+ put_css_set(oldcg);
+
+ /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ task_unlock(tsk);
+ put_css_set(newcg);
+ return -ESRCH;
+ }
+ rcu_assign_pointer(tsk->cgroups, newcg);
+ task_unlock(tsk);
+
+ /* Update the css_set linked lists if we're using them */
+ write_lock(&css_set_lock);
+ if (!list_empty(&tsk->cg_list))
+ list_move(&tsk->cg_list, &newcg->tasks);
+ write_unlock(&css_set_lock);
+
+ /*
+ * We just gained a reference on oldcg by taking it from the task. As
+ * trading it for newcg is protected by cgroup_mutex, we're safe to drop
+ * it here; it will be freed under RCU.
+ */
+ put_css_set(oldcg);
+
+ set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+ return 0;
+}
+
/**
* cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
* @cgrp: the cgroup the task is attaching to
@@ -1736,11 +1806,9 @@ EXPORT_SYMBOL_GPL(cgroup_path);
*/
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
- int retval = 0;
+ int retval;
struct cgroup_subsys *ss, *failed_ss = NULL;
struct cgroup *oldcgrp;
- struct css_set *cg;
- struct css_set *newcg;
struct cgroupfs_root *root = cgrp->root;
/* Nothing to do if the task is already in that cgroup */
@@ -1771,38 +1839,9 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
}
}
- task_lock(tsk);
- cg = tsk->cgroups;
- get_css_set(cg);
- task_unlock(tsk);
- /*
- * Locate or allocate a new css_set for this task,
- * based on its final set of cgroups
- */
- newcg = find_css_set(cg, cgrp);
- put_css_set(cg);
- if (!newcg) {
- retval = -ENOMEM;
- goto out;
- }
-
- task_lock(tsk);
- if (tsk->flags & PF_EXITING) {
- task_unlock(tsk);
- put_css_set(newcg);
- retval = -ESRCH;
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
+ if (retval)
goto out;
- }
- rcu_assign_pointer(tsk->cgroups, newcg);
- task_unlock(tsk);
-
- /* Update the css_set linked lists if we're using them */
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list)) {
- list_del(&tsk->cg_list);
- list_add(&tsk->cg_list, &newcg->tasks);
- }
- write_unlock(&css_set_lock);
for_each_subsys(root, ss) {
if (ss->pre_attach)
@@ -1812,9 +1851,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
if (ss->attach)
ss->attach(ss, cgrp, oldcgrp, tsk);
}
- set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+
synchronize_rcu();
- put_css_set(cg);
/*
* wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -1864,49 +1902,356 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
/*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * held. May take task_lock of task
+ * cgroup_attach_proc works in two stages, the first of which prefetches all
+ * new css_sets needed (to make sure we have enough memory before committing
+ * to the move) and stores them in a list of entries of the following type.
+ * TODO: possible optimization: use css_set->rcu_head for chaining instead
+ */
+struct cg_list_entry {
+ struct css_set *cg;
+ struct list_head links;
+};
+
+static bool css_set_check_fetched(struct cgroup *cgrp,
+ struct task_struct *tsk, struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(cg, cgrp, template);
+ if (newcg)
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+
+ /* doesn't exist at all? */
+ if (!newcg)
+ return false;
+ /* see if it's already in the list */
+ list_for_each_entry(cg_entry, newcg_list, links) {
+ if (cg_entry->cg == newcg) {
+ put_css_set(newcg);
+ return true;
+ }
+ }
+
+ /* not found */
+ put_css_set(newcg);
+ return false;
+}
+
+/*
+ * Find the new css_set and store it in the list in preparation for moving the
+ * given task to the given cgroup. Returns 0 or -ENOMEM.
+ */
+static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+
+ /* ensure a new css_set will exist for this thread */
+ newcg = find_css_set(cg, cgrp);
+ if (!newcg)
+ return -ENOMEM;
+ /* add it to the list */
+ cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
+ if (!cg_entry) {
+ put_css_set(newcg);
+ return -ENOMEM;
+ }
+ cg_entry->cg = newcg;
+ list_add(&cg_entry->links, newcg_list);
+ return 0;
+}
+
+/**
+ * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * @cgrp: the cgroup to attach to
+ * @leader: the threadgroup leader task_struct of the group to be attached
+ *
+ * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
+ * take task_lock of each thread in leader's threadgroup individually in turn.
+ */
+int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+{
+ int retval, i, group_size;
+ struct cgroup_subsys *ss, *failed_ss = NULL;
+ bool cancel_failed_ss = false;
+ /* guaranteed to be initialized later, but the compiler needs this */
+ struct cgroup *oldcgrp = NULL;
+ struct css_set *oldcg;
+ struct cgroupfs_root *root = cgrp->root;
+ /* threadgroup list cursor and array */
+ struct task_struct *tsk;
+ struct task_struct **group;
+ /*
+ * we need to make sure we have css_sets for all the tasks we're
+ * going to move -before- we actually start moving them, so that in
+ * case we get an ENOMEM we can bail out before making any changes.
+ */
+ struct list_head newcg_list;
+ struct cg_list_entry *cg_entry, *temp_nobe;
+
+ /*
+ * step 0: in order to do expensive, possibly blocking operations for
+ * every thread, we cannot iterate the thread group list, since it needs
+ * rcu or tasklist locked. instead, build an array of all threads in the
+ * group - threadgroup_fork_lock prevents new threads from appearing,
+ * and if threads exit, this will just be an over-estimate.
+ */
+ group_size = get_nr_threads(leader);
+ group = kmalloc(group_size * sizeof(*group), GFP_KERNEL);
+ if (!group)
+ return -ENOMEM;
+
+ /* prevent changes to the threadgroup list while we take a snapshot. */
+ rcu_read_lock();
+ if (!thread_group_leader(leader)) {
+ /*
+ * a race with de_thread from another thread's exec() may strip
+ * us of our leadership, making while_each_thread unsafe to use
+ * on this task. if this happens, there is no choice but to
+ * throw this task away and try again (from cgroup_procs_write);
+ * this is "double-double-toil-and-trouble-check locking".
+ */
+ rcu_read_unlock();
+ retval = -EAGAIN;
+ goto out_free_group_list;
+ }
+ /* take a reference on each task in the group to go in the array. */
+ tsk = leader;
+ i = 0;
+ do {
+ /* as per above, nr_threads may decrease, but not increase. */
+ BUG_ON(i >= group_size);
+ get_task_struct(tsk);
+ group[i] = tsk;
+ i++;
+ } while_each_thread(leader, tsk);
+ /* remember the number of threads in the array for later. */
+ group_size = i;
+ rcu_read_unlock();
+
+ /*
+ * step 1: check that we can legitimately attach to the cgroup.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->can_attach) {
+ retval = ss->can_attach(ss, cgrp, leader);
+ if (retval) {
+ failed_ss = ss;
+ goto out_cancel_attach;
+ }
+ }
+ /* a callback to be run on every thread in the threadgroup. */
+ if (ss->can_attach_task) {
+ /* run on each task in the threadgroup. */
+ for (i = 0; i < group_size; i++) {
+ retval = ss->can_attach_task(cgrp, group[i]);
+ if (retval) {
+ failed_ss = ss;
+ cancel_failed_ss = true;
+ goto out_cancel_attach;
+ }
+ }
+ }
+ }
+
+ /*
+ * step 2: make sure css_sets exist for all threads to be migrated.
+ * we use find_css_set, which allocates a new one if necessary.
+ */
+ INIT_LIST_HEAD(&newcg_list);
+ for (i = 0; i < group_size; i++) {
+ tsk = group[i];
+ /* nothing to do if this task is already in the cgroup */
+ oldcgrp = task_cgroup_from_root(tsk, root);
+ if (cgrp == oldcgrp)
+ continue;
+ /* get old css_set pointer */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ /* ignore this task if it's going away */
+ task_unlock(tsk);
+ continue;
+ }
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
+ task_unlock(tsk);
+ /* see if the new one for us is already in the list? */
+ if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
+ /* was already there, nothing to do. */
+ put_css_set(oldcg);
+ } else {
+ /* we don't already have it. get new one. */
+ retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+ put_css_set(oldcg);
+ if (retval)
+ goto out_list_teardown;
+ }
+ }
+
+ /*
+ * step 3: now that we're guaranteed success wrt the css_sets, proceed
+ * to move all tasks to the new cgroup, calling ss->attach_task for each
+ * one along the way. there are no failure cases after here, so this is
+ * the commit point.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->pre_attach)
+ ss->pre_attach(cgrp);
+ }
+ for (i = 0; i < group_size; i++) {
+ tsk = group[i];
+ /* leave current thread as it is if it's already there */
+ oldcgrp = task_cgroup_from_root(tsk, root);
+ if (cgrp == oldcgrp)
+ continue;
+ /* attach each task to each subsystem */
+ for_each_subsys(root, ss) {
+ if (ss->attach_task)
+ ss->attach_task(cgrp, tsk);
+ }
+ /* if the thread is PF_EXITING, it can just get skipped. */
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
+ BUG_ON(retval != 0 && retval != -ESRCH);
+ }
+ /* nothing is sensitive to fork() after this point. */
+
+ /*
+ * step 4: do expensive, non-thread-specific subsystem callbacks.
+ * TODO: if ever a subsystem needs to know the oldcgrp for each task
+ * being moved, this call will need to be reworked to communicate that.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->attach)
+ ss->attach(ss, cgrp, oldcgrp, leader);
+ }
+
+ /*
+ * step 5: success! and cleanup
+ */
+ synchronize_rcu();
+ cgroup_wakeup_rmdir_waiter(cgrp);
+ retval = 0;
+out_list_teardown:
+ /* clean up the list of prefetched css_sets. */
+ list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
+ list_del(&cg_entry->links);
+ put_css_set(cg_entry->cg);
+ kfree(cg_entry);
+ }
+out_cancel_attach:
+ /* same deal as in cgroup_attach_task */
+ if (retval) {
+ for_each_subsys(root, ss) {
+ if (ss == failed_ss) {
+ if (cancel_failed_ss && ss->cancel_attach)
+ ss->cancel_attach(ss, cgrp, leader);
+ break;
+ }
+ if (ss->cancel_attach)
+ ss->cancel_attach(ss, cgrp, leader);
+ }
+ }
+ /* clean up the array of referenced threads in the group. */
+ for (i = 0; i < group_size; i++)
+ put_task_struct(group[i]);
+out_free_group_list:
+ kfree(group);
+ return retval;
+}
+
+/*
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will take
+ * cgroup_mutex; may take task_lock of task.
*/
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
{
struct task_struct *tsk;
const struct cred *cred = current_cred(), *tcred;
int ret;
+ if (!cgroup_lock_live_group(cgrp))
+ return -ENODEV;
+
if (pid) {
rcu_read_lock();
tsk = find_task_by_vpid(pid);
- if (!tsk || tsk->flags & PF_EXITING) {
+ if (!tsk) {
rcu_read_unlock();
+ cgroup_unlock();
+ return -ESRCH;
+ }
+ if (threadgroup) {
+ /*
+ * RCU protects this access, since tsk was found in the
+ * tid map. a race with de_thread may cause group_leader
+ * to stop being the leader, but cgroup_attach_proc will
+ * detect it later.
+ */
+ tsk = tsk->group_leader;
+ } else if (tsk->flags & PF_EXITING) {
+ /* optimization for the single-task-only case */
+ rcu_read_unlock();
+ cgroup_unlock();
return -ESRCH;
}
+ /*
+ * even if we're attaching all tasks in the thread group, we
+ * only need to check permissions on one of them.
+ */
tcred = __task_cred(tsk);
if (cred->euid &&
cred->euid != tcred->uid &&
cred->euid != tcred->suid) {
rcu_read_unlock();
+ cgroup_unlock();
return -EACCES;
}
get_task_struct(tsk);
rcu_read_unlock();
} else {
- tsk = current;
+ if (threadgroup)
+ tsk = current->group_leader;
+ else
+ tsk = current;
get_task_struct(tsk);
}
- ret = cgroup_attach_task(cgrp, tsk);
+ if (threadgroup) {
+ threadgroup_fork_write_lock(tsk);
+ ret = cgroup_attach_proc(cgrp, tsk);
+ threadgroup_fork_write_unlock(tsk);
+ } else {
+ ret = cgroup_attach_task(cgrp, tsk);
+ }
put_task_struct(tsk);
+ cgroup_unlock();
return ret;
}
static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
{
+ return attach_task_by_pid(cgrp, pid, false);
+}
+
+static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+{
int ret;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
- ret = attach_task_by_pid(cgrp, pid);
- cgroup_unlock();
+ do {
+ /*
+ * attach_proc fails with -EAGAIN if threadgroup leadership
+ * changes in the middle of the operation, in which case we need
+ * to find the task_struct for the new leader and start over.
+ */
+ ret = attach_task_by_pid(cgrp, tgid, true);
+ } while (ret == -EAGAIN);
return ret;
}
@@ -3260,9 +3605,9 @@ static struct cftype files[] = {
{
.name = CGROUP_FILE_GENERIC_PREFIX "procs",
.open = cgroup_procs_open,
- /* .write_u64 = cgroup_procs_write, TODO */
+ .write_u64 = cgroup_procs_write,
.release = cgroup_pidlist_release,
- .mode = S_IRUGO,
+ .mode = S_IRUGO | S_IWUSR,
},
{
.name = "notify_on_release",
On Mon, Mar 21, 2011 at 10:18 PM, Ben Blum <[email protected]> wrote:
> Makes procs file writable to move all threads by tgid at once
>
> From: Ben Blum <[email protected]>
>
> This patch adds functionality that enables users to move all threads in a
> threadgroup at once to a cgroup by writing the tgid to the 'cgroup.procs'
> file. This current implementation makes use of a per-threadgroup rwsem that's
> taken for reading in the fork() path to prevent newly forking threads within
> the threadgroup from "escaping" while the move is in progress.
>
> Signed-off-by: Ben Blum <[email protected]>
Reviewed-by: Paul Menage <[email protected]>
OK, I guess this is ready to go in :-)
Paul
On Tue, 29 Mar 2011 16:27:19 -0700
Paul Menage <[email protected]> wrote:
> On Mon, Mar 21, 2011 at 10:18 PM, Ben Blum <[email protected]> wrote:
> > Makes procs file writable to move all threads by tgid at once
> >
> > From: Ben Blum <[email protected]>
> >
> > This patch adds functionality that enables users to move all threads in a
> > threadgroup at once to a cgroup by writing the tgid to the 'cgroup.procs'
> > file. This current implementation makes use of a per-threadgroup rwsem that's
> > taken for reading in the fork() path to prevent newly forking threads within
> > the threadgroup from "escaping" while the move is in progress.
> >
> > Signed-off-by: Ben Blum <[email protected]>
>
> Reviewed-by: Paul Menage <[email protected]>
>
> OK, I guess this is ready to go in :-)
It all needs a refresh, retest and resend please.
On Mon, Feb 07, 2011 at 08:35:42PM -0500, Ben Blum wrote:
> On Sun, Dec 26, 2010 at 07:09:19AM -0500, Ben Blum wrote:
> > On Fri, Dec 24, 2010 at 03:22:26AM -0500, Ben Blum wrote:
> > > On Wed, Aug 11, 2010 at 01:46:04AM -0400, Ben Blum wrote:
> > > > On Fri, Jul 30, 2010 at 07:56:49PM -0400, Ben Blum wrote:
> >
> > Well this time everything here is actually safe and correct, as far as
> > my best efforts and keen eyes can tell. I dropped the per_thread call
> > from the last series in favour of revising the subsystem callback
> > interface. It now looks like this:
> >
> > ss->can_attach()
> > - Thread-independent, possibly expensive/sleeping.
> >
> > ss->can_attach_task()
> > - Called per-thread, run with rcu_read so must not sleep.
> >
> > ss->pre_attach()
> > - Thread independent, must be atomic, happens before attach_task.
> >
> > ss->attach_task()
> > - Called per-thread, run with tasklist_lock so must not sleep.
> >
> > ss->attach()
> > - Thread independent, possibly expensive/sleeping, called last.
>
> Okay, so.
>
> I've revamped the cgroup_attach_proc implementation a bunch and this
> version should be a lot easier on the eyes (and brains). Issues that are
> addressed:
>
> 1) cgroup_attach_proc now iterates over leader->thread_group once, at
> the very beginning, and puts each task_struct that we want to move
> into an array, using get_task_struct to make sure they stick around.
> - threadgroup_fork_lock ensures no threads not in the array can
> appear, and allows us to use signal->nr_threads to determine the
> size of the array when kmallocing it.
> - This simplifies the rest of the function a bunch, since now we
> never need to do rcu_read_lock after building the array. All the
> subsystem callbacks are the same as described just above, but the
> "can't sleep" restriction is gone, so it's nice and clean.
> - Checking for a race with de_thread (the manoeuvre I refer to as
> "double-double-toil-and-trouble-check locking") now needs to be
> done only once, at the beginning (before building the array).
>
> 2) The nodemask allocation problem in cpuset is fixed the same way as
> before - the masks are shared between the three attach callbacks, so
> are made as static global variables.
>
> 3) The introduction of threadgroup_fork_lock in sched.h (specifically,
> in signal_struct) requires rwsem.h; the new include appears in the
> first patch. (An alternate plan would be to make it a struct pointer
> with an incomplete forward declaration and kmalloc/kfree it during
> housekeeping, but adding an include seems better than that particular
> complication.) In light of this, the definitions for
> threadgroup_fork_{read,write}_{un,}lock are also in sched.h.
Same as before; using flex_array in attach_proc (thanks Kame).
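For illustration, here is a rough sketch of what the flex_array-based snapshot
step could look like; this is not text from the series, and the helper name
snapshot_threadgroup plus its error handling are assumptions made for the
example. The caller would allocate the array with
flex_array_alloc(sizeof(struct task_struct *), group_size, GFP_KERNEL) and
release it with flex_array_free() once the task references are dropped.

#include <linux/flex_array.h>
#include <linux/sched.h>

/*
 * Sketch only: snapshot every thread of 'leader' into 'group', taking a
 * reference on each so that later per-thread work may sleep. Returns the
 * number of threads captured, -EAGAIN if leadership was lost to exec(),
 * or -ENOMEM.
 */
static int snapshot_threadgroup(struct task_struct *leader,
				struct flex_array *group, int group_size)
{
	struct task_struct *tsk = leader;
	struct task_struct **stored;
	int i = 0;

	rcu_read_lock();
	if (!thread_group_leader(leader)) {
		/* raced with de_thread(); the caller should retry */
		rcu_read_unlock();
		return -EAGAIN;
	}
	do {
		/* with threadgroup_fork_lock held, nr_threads cannot grow */
		BUG_ON(i >= group_size);
		get_task_struct(tsk);
		/* GFP_ATOMIC: no sleeping under rcu_read_lock */
		if (flex_array_put(group, i, &tsk, GFP_ATOMIC)) {
			put_task_struct(tsk);
			goto out_unref;
		}
		i++;
	} while_each_thread(leader, tsk);
	rcu_read_unlock();
	return i;

out_unref:
	rcu_read_unlock();
	/* drop the references already taken */
	while (i--) {
		stored = flex_array_get(group, i);
		put_task_struct(*stored);
	}
	return -ENOMEM;
}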
-- Ben
---
Documentation/cgroups/cgroups.txt | 39 ++-
block/blk-cgroup.c | 18 -
include/linux/cgroup.h | 10
include/linux/init_task.h | 9
include/linux/sched.h | 36 ++
kernel/cgroup.c | 489 +++++++++++++++++++++++++++++++++-----
kernel/cgroup_freezer.c | 26 --
kernel/cpuset.c | 96 +++----
kernel/fork.c | 10
kernel/sched.c | 38 --
mm/memcontrol.c | 18 -
security/device_cgroup.c | 3
12 files changed, 594 insertions(+), 198 deletions(-)
Adds functionality to read/write lock CLONE_THREAD fork()ing per-threadgroup
From: Ben Blum <[email protected]>
This patch adds an rwsem that lives in a threadgroup's signal_struct that's
taken for reading in the fork path, under CONFIG_CGROUPS. If another part of
the kernel later wants to use such a locking mechanism, the CONFIG_CGROUPS
ifdefs should be changed to a higher-up flag that CGROUPS and the other system
would both depend on.
This is a pre-patch for cgroup-procs-write.patch.
Signed-off-by: Ben Blum <[email protected]>
---
include/linux/init_task.h | 9 +++++++++
include/linux/sched.h | 36 ++++++++++++++++++++++++++++++++++++
kernel/fork.c | 10 ++++++++++
3 files changed, 55 insertions(+), 0 deletions(-)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index caa151f..7bf5257 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -22,6 +22,14 @@
extern struct files_struct init_files;
extern struct fs_struct init_fs;
+#ifdef CONFIG_CGROUPS
+#define INIT_THREADGROUP_FORK_LOCK(sig) \
+ .threadgroup_fork_lock = \
+ __RWSEM_INITIALIZER(sig.threadgroup_fork_lock),
+#else
+#define INIT_THREADGROUP_FORK_LOCK(sig)
+#endif
+
#define INIT_SIGNALS(sig) { \
.nr_threads = 1, \
.wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\
@@ -38,6 +46,7 @@ extern struct fs_struct init_fs;
}, \
.cred_guard_mutex = \
__MUTEX_INITIALIZER(sig.cred_guard_mutex), \
+ INIT_THREADGROUP_FORK_LOCK(sig) \
}
extern struct nsproxy init_nsproxy;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3509d00..a219c69 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -514,6 +514,7 @@ struct thread_group_cputimer {
spinlock_t lock;
};
+#include <linux/rwsem.h>
struct autogroup;
/*
@@ -633,6 +634,16 @@ struct signal_struct {
unsigned audit_tty;
struct tty_audit_buf *tty_audit_buf;
#endif
+#ifdef CONFIG_CGROUPS
+ /*
+ * The threadgroup_fork_lock prevents threads from forking with
+ * CLONE_THREAD while held for writing. Use this for fork-sensitive
+ * threadgroup-wide operations. It's taken for reading in fork.c in
+ * copy_process().
+ * Currently only needed write-side by cgroups.
+ */
+ struct rw_semaphore threadgroup_fork_lock;
+#endif
int oom_adj; /* OOM kill score adjustment (bit shift) */
int oom_score_adj; /* OOM kill score adjustment */
@@ -2307,6 +2318,31 @@ static inline void unlock_task_sighand(struct task_struct *tsk,
spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
}
+/* See the declaration of threadgroup_fork_lock in signal_struct. */
+#ifdef CONFIG_CGROUPS
+static inline void threadgroup_fork_read_lock(struct task_struct *tsk)
+{
+ down_read(&tsk->signal->threadgroup_fork_lock);
+}
+static inline void threadgroup_fork_read_unlock(struct task_struct *tsk)
+{
+ up_read(&tsk->signal->threadgroup_fork_lock);
+}
+static inline void threadgroup_fork_write_lock(struct task_struct *tsk)
+{
+ down_write(&tsk->signal->threadgroup_fork_lock);
+}
+static inline void threadgroup_fork_write_unlock(struct task_struct *tsk)
+{
+ up_write(&tsk->signal->threadgroup_fork_lock);
+}
+#else
+static inline void threadgroup_fork_read_lock(struct task_struct *tsk) {}
+static inline void threadgroup_fork_read_unlock(struct task_struct *tsk) {}
+static inline void threadgroup_fork_write_lock(struct task_struct *tsk) {}
+static inline void threadgroup_fork_write_unlock(struct task_struct *tsk) {}
+#endif
+
#ifndef __HAVE_THREAD_FUNCTIONS
#define task_thread_info(task) ((struct thread_info *)(task)->stack)
diff --git a/kernel/fork.c b/kernel/fork.c
index 41d2062..aef33ac 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -927,6 +927,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
tty_audit_fork(sig);
sched_autogroup_fork(sig);
+#ifdef CONFIG_CGROUPS
+ init_rwsem(&sig->threadgroup_fork_lock);
+#endif
+
sig->oom_adj = current->signal->oom_adj;
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -1109,6 +1113,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
monotonic_to_bootbased(&p->real_start_time);
p->io_context = NULL;
p->audit_context = NULL;
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_fork_read_lock(current);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
@@ -1307,6 +1313,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
cgroup_post_fork(p);
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_fork_read_unlock(current);
perf_event_fork(p);
return p;
@@ -1345,6 +1353,8 @@ bad_fork_cleanup_policy:
mpol_put(p->mempolicy);
bad_fork_cleanup_cgroup:
#endif
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_fork_read_unlock(current);
cgroup_exit(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
module_put(task_thread_info(p)->exec_domain->module);
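As a usage illustration (a sketch under assumptions, not part of the patch;
do_threadgroup_op is a hypothetical caller), the write side brackets any
operation that needs a stable set of threads, the same way the last patch in
the series wraps cgroup_attach_proc():

/*
 * Sketch only: exclude CLONE_THREAD forks around a threadgroup-wide
 * operation. copy_process() holds the same rwsem for reading from
 * cgroup_fork() through cgroup_post_fork(), so while the write side is
 * held no new thread in this group can be midway through that window.
 * Threads may still exit; rcu keeps the list walk itself safe.
 */
static int do_threadgroup_op(struct task_struct *leader)
{
	struct task_struct *t = leader;
	int nr = 0;

	threadgroup_fork_write_lock(leader);
	rcu_read_lock();
	do {
		nr++;	/* ... per-thread work would go here ... */
	} while_each_thread(leader, t);
	rcu_read_unlock();
	threadgroup_fork_write_unlock(leader);
	return nr;
}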
Add cgroup subsystem callbacks for per-thread attachment in atomic contexts
From: Ben Blum <[email protected]>
This patch adds can_attach_task, pre_attach, and attach_task as new callbacks
for the cgroups subsystem interface. Unlike can_attach and attach, these are for
per-thread operations, to be called potentially many times when attaching an
entire threadgroup.
Also, the old "bool threadgroup" interface is removed, as it is replaced by this.
All subsystems are modified for the new interface - of note is cpuset, which
requires from/to nodemasks for attach to be globally scoped (though per-cpuset
would work too) to persist from its pre_attach to attach_task and attach.
This is a pre-patch for cgroup-procs-writable.patch.
Signed-off-by: Ben Blum <[email protected]>
---
Documentation/cgroups/cgroups.txt | 30 ++++++++----
block/blk-cgroup.c | 18 ++-----
include/linux/cgroup.h | 10 ++--
kernel/cgroup.c | 17 +++++--
kernel/cgroup_freezer.c | 26 ++++------
kernel/cpuset.c | 96 ++++++++++++++++++-------------------
kernel/sched.c | 38 +--------------
mm/memcontrol.c | 18 ++-----
security/device_cgroup.c | 3 -
9 files changed, 114 insertions(+), 142 deletions(-)
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 2a5d137..4b0377c 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -575,7 +575,7 @@ rmdir() will fail with it. From this behavior, pre_destroy() can be
called multiple times against a cgroup.
int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct task_struct *task, bool threadgroup)
+ struct task_struct *task)
(cgroup_mutex held by caller)
Called prior to moving a task into a cgroup; if the subsystem
@@ -584,9 +584,14 @@ task is passed, then a successful result indicates that *any*
unspecified task can be moved into the cgroup. Note that this isn't
called on a fork. If this method returns 0 (success) then this should
remain valid while the caller holds cgroup_mutex and it is ensured that either
-attach() or cancel_attach() will be called in future. If threadgroup is
-true, then a successful result indicates that all threads in the given
-thread's threadgroup can be moved together.
+attach() or cancel_attach() will be called in future.
+
+int can_attach_task(struct cgroup *cgrp, struct task_struct *tsk);
+(cgroup_mutex held by caller)
+
+As can_attach, but for operations that must be run once per task to be
+attached (possibly many when using cgroup_attach_proc). Called after
+can_attach.
void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct task_struct *task, bool threadgroup)
@@ -598,15 +603,24 @@ function, so that the subsystem can implement a rollback. If not, not necessary.
This will be called only about subsystems whose can_attach() operation have
succeeded.
+void pre_attach(struct cgroup *cgrp);
+(cgroup_mutex held by caller)
+
+For any non-per-thread attachment work that needs to happen before
+attach_task. Needed by cpuset.
+
void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cgrp, struct task_struct *task,
- bool threadgroup)
+ struct cgroup *old_cgrp, struct task_struct *task)
(cgroup_mutex held by caller)
Called after the task has been attached to the cgroup, to allow any
post-attachment activity that requires memory allocations or blocking.
-If threadgroup is true, the subsystem should take care of all threads
-in the specified thread's threadgroup. Currently does not support any
+
+void attach_task(struct cgroup *cgrp, struct task_struct *tsk);
+(cgroup_mutex held by caller)
+
+As attach, but for operations that must be run once per task to be attached,
+like can_attach_task. Called before attach. Currently does not support any
subsystem that might need the old_cgrp for every thread in the group.
void fork(struct cgroup_subsy *ss, struct task_struct *task)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2bef570..23d03fb 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -30,10 +30,8 @@ EXPORT_SYMBOL_GPL(blkio_root_cgroup);
static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
struct cgroup *);
-static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
- struct task_struct *, bool);
-static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
- struct cgroup *, struct task_struct *, bool);
+static int blkiocg_can_attach_task(struct cgroup *, struct task_struct *);
+static void blkiocg_attach_task(struct cgroup *, struct task_struct *);
static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
@@ -46,8 +44,8 @@ static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
struct cgroup_subsys blkio_subsys = {
.name = "blkio",
.create = blkiocg_create,
- .can_attach = blkiocg_can_attach,
- .attach = blkiocg_attach,
+ .can_attach_task = blkiocg_can_attach_task,
+ .attach_task = blkiocg_attach_task,
.destroy = blkiocg_destroy,
.populate = blkiocg_populate,
#ifdef CONFIG_BLK_CGROUP
@@ -1485,9 +1483,7 @@ done:
* of the main cic data structures. For now we allow a task to change
* its cgroup only if it's the only owner of its ioc.
*/
-static int blkiocg_can_attach(struct cgroup_subsys *subsys,
- struct cgroup *cgroup, struct task_struct *tsk,
- bool threadgroup)
+static int blkiocg_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
struct io_context *ioc;
int ret = 0;
@@ -1502,9 +1498,7 @@ static int blkiocg_can_attach(struct cgroup_subsys *subsys,
return ret;
}
-static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
- struct cgroup *prev, struct task_struct *tsk,
- bool threadgroup)
+static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
struct io_context *ioc;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index f9f7e3a..919c32c 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -467,12 +467,14 @@ struct cgroup_subsys {
int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct task_struct *tsk, bool threadgroup);
+ struct task_struct *tsk);
+ int (*can_attach_task)(struct cgroup *cgrp, struct task_struct *tsk);
void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct task_struct *tsk, bool threadgroup);
+ struct task_struct *tsk);
+ void (*pre_attach)(struct cgroup *cgrp);
+ void (*attach_task)(struct cgroup *cgrp, struct task_struct *tsk);
void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cgrp, struct task_struct *tsk,
- bool threadgroup);
+ struct cgroup *old_cgrp, struct task_struct *tsk);
void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct cgroup *old_cgrp, struct task_struct *task);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index be1ebeb..1f4037f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1772,7 +1772,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
for_each_subsys(root, ss) {
if (ss->can_attach) {
- retval = ss->can_attach(ss, cgrp, tsk, false);
+ retval = ss->can_attach(ss, cgrp, tsk);
if (retval) {
/*
* Remember on which subsystem the can_attach()
@@ -1784,6 +1784,13 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
goto out;
}
}
+ if (ss->can_attach_task) {
+ retval = ss->can_attach_task(cgrp, tsk);
+ if (retval) {
+ failed_ss = ss;
+ goto out;
+ }
+ }
}
task_lock(tsk);
@@ -1818,8 +1825,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
write_unlock(&css_set_lock);
for_each_subsys(root, ss) {
+ if (ss->pre_attach)
+ ss->pre_attach(cgrp);
+ if (ss->attach_task)
+ ss->attach_task(cgrp, tsk);
if (ss->attach)
- ss->attach(ss, cgrp, oldcgrp, tsk, false);
+ ss->attach(ss, cgrp, oldcgrp, tsk);
}
set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
synchronize_rcu();
@@ -1842,7 +1853,7 @@ out:
*/
break;
if (ss->cancel_attach)
- ss->cancel_attach(ss, cgrp, tsk, false);
+ ss->cancel_attach(ss, cgrp, tsk);
}
}
return retval;
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e7bebb7..e691818 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -160,7 +160,7 @@ static void freezer_destroy(struct cgroup_subsys *ss,
*/
static int freezer_can_attach(struct cgroup_subsys *ss,
struct cgroup *new_cgroup,
- struct task_struct *task, bool threadgroup)
+ struct task_struct *task)
{
struct freezer *freezer;
@@ -172,26 +172,17 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
if (freezer->state != CGROUP_THAWED)
return -EBUSY;
+ return 0;
+}
+
+static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+{
rcu_read_lock();
- if (__cgroup_freezing_or_frozen(task)) {
+ if (__cgroup_freezing_or_frozen(tsk)) {
rcu_read_unlock();
return -EBUSY;
}
rcu_read_unlock();
-
- if (threadgroup) {
- struct task_struct *c;
-
- rcu_read_lock();
- list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
- if (__cgroup_freezing_or_frozen(c)) {
- rcu_read_unlock();
- return -EBUSY;
- }
- }
- rcu_read_unlock();
- }
-
return 0;
}
@@ -390,6 +381,9 @@ struct cgroup_subsys freezer_subsys = {
.populate = freezer_populate,
.subsys_id = freezer_subsys_id,
.can_attach = freezer_can_attach,
+ .can_attach_task = freezer_can_attach_task,
+ .pre_attach = NULL,
+ .attach_task = NULL,
.attach = NULL,
.fork = freezer_fork,
.exit = NULL,
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 236a3d3..c1e1e1d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1367,14 +1367,10 @@ static int fmeter_getrate(struct fmeter *fmp)
return val;
}
-/* Protected by cgroup_lock */
-static cpumask_var_t cpus_attach;
-
/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
- struct task_struct *tsk, bool threadgroup)
+ struct task_struct *tsk)
{
- int ret;
struct cpuset *cs = cgroup_cs(cont);
if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1391,29 +1387,42 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
if (tsk->flags & PF_THREAD_BOUND)
return -EINVAL;
- ret = security_task_setscheduler(tsk);
- if (ret)
- return ret;
- if (threadgroup) {
- struct task_struct *c;
-
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- ret = security_task_setscheduler(c);
- if (ret) {
- rcu_read_unlock();
- return ret;
- }
- }
- rcu_read_unlock();
- }
return 0;
}
-static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
- struct cpuset *cs)
+static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
+{
+ return security_task_setscheduler(task);
+}
+
+/*
+ * Protected by cgroup_lock. The nodemasks must be stored globally because
+ * dynamically allocating them is not allowed in pre_attach, and they must
+ * persist among pre_attach, attach_task, and attach.
+ */
+static cpumask_var_t cpus_attach;
+static nodemask_t cpuset_attach_nodemask_from;
+static nodemask_t cpuset_attach_nodemask_to;
+
+/* Set-up work for before attaching each task. */
+static void cpuset_pre_attach(struct cgroup *cont)
+{
+ struct cpuset *cs = cgroup_cs(cont);
+
+ if (cs == &top_cpuset)
+ cpumask_copy(cpus_attach, cpu_possible_mask);
+ else
+ guarantee_online_cpus(cs, cpus_attach);
+
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+}
+
+/* Per-thread attachment work. */
+static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
{
int err;
+ struct cpuset *cs = cgroup_cs(cont);
+
/*
* can_attach beforehand should guarantee that this doesn't fail.
* TODO: have a better way to handle failure here
@@ -1421,45 +1430,29 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
err = set_cpus_allowed_ptr(tsk, cpus_attach);
WARN_ON_ONCE(err);
- cpuset_change_task_nodemask(tsk, to);
+ cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
cpuset_update_task_spread_flag(cs, tsk);
-
}
static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
- struct cgroup *oldcont, struct task_struct *tsk,
- bool threadgroup)
+ struct cgroup *oldcont, struct task_struct *tsk)
{
struct mm_struct *mm;
struct cpuset *cs = cgroup_cs(cont);
struct cpuset *oldcs = cgroup_cs(oldcont);
- static nodemask_t to; /* protected by cgroup_mutex */
- if (cs == &top_cpuset) {
- cpumask_copy(cpus_attach, cpu_possible_mask);
- } else {
- guarantee_online_cpus(cs, cpus_attach);
- }
- guarantee_online_mems(cs, &to);
-
- /* do per-task migration stuff possibly for each in the threadgroup */
- cpuset_attach_task(tsk, &to, cs);
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- cpuset_attach_task(c, &to, cs);
- }
- rcu_read_unlock();
- }
-
- /* change mm; only needs to be done once even if threadgroup */
- to = cs->mems_allowed;
+ /*
+ * Change mm, possibly for multiple threads in a threadgroup. This is
+ * expensive and may sleep.
+ */
+ cpuset_attach_nodemask_from = oldcs->mems_allowed;
+ cpuset_attach_nodemask_to = cs->mems_allowed;
mm = get_task_mm(tsk);
if (mm) {
- mpol_rebind_mm(mm, &to);
+ mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
if (is_memory_migrate(cs))
- cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to);
+ cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
+ &cpuset_attach_nodemask_to);
mmput(mm);
}
}
@@ -1910,6 +1903,9 @@ struct cgroup_subsys cpuset_subsys = {
.create = cpuset_create,
.destroy = cpuset_destroy,
.can_attach = cpuset_can_attach,
+ .can_attach_task = cpuset_can_attach_task,
+ .pre_attach = cpuset_pre_attach,
+ .attach_task = cpuset_attach_task,
.attach = cpuset_attach,
.populate = cpuset_populate,
.post_clone = cpuset_post_clone,
diff --git a/kernel/sched.c b/kernel/sched.c
index f592ce6..28aa791 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9059,42 +9059,10 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
return 0;
}
-static int
-cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct task_struct *tsk, bool threadgroup)
-{
- int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
- if (retval)
- return retval;
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- retval = cpu_cgroup_can_attach_task(cgrp, c);
- if (retval) {
- rcu_read_unlock();
- return retval;
- }
- }
- rcu_read_unlock();
- }
- return 0;
-}
-
static void
-cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cont, struct task_struct *tsk,
- bool threadgroup)
+cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
sched_move_task(tsk);
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- sched_move_task(c);
- }
- rcu_read_unlock();
- }
}
static void
@@ -9182,8 +9150,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.name = "cpu",
.create = cpu_cgroup_create,
.destroy = cpu_cgroup_destroy,
- .can_attach = cpu_cgroup_can_attach,
- .attach = cpu_cgroup_attach,
+ .can_attach_task = cpu_cgroup_can_attach_task,
+ .attach_task = cpu_cgroup_attach_task,
.exit = cpu_cgroup_exit,
.populate = cpu_cgroup_populate,
.subsys_id = cpu_cgroup_subsys_id,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bd689f2..d5202d1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5035,8 +5035,7 @@ static void mem_cgroup_clear_mc(void)
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
int ret = 0;
struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
@@ -5075,8 +5074,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
mem_cgroup_clear_mc();
}
@@ -5194,8 +5192,7 @@ retry:
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *cont,
struct cgroup *old_cont,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
struct mm_struct *mm;
@@ -5213,22 +5210,19 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
#else /* !CONFIG_MMU */
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
return 0;
}
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
struct cgroup *cgroup,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
}
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *cont,
struct cgroup *old_cont,
- struct task_struct *p,
- bool threadgroup)
+ struct task_struct *p)
{
}
#endif
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 8d9c48f..cd1f779 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -62,8 +62,7 @@ static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
struct cgroup_subsys devices_subsys;
static int devcgroup_can_attach(struct cgroup_subsys *ss,
- struct cgroup *new_cgroup, struct task_struct *task,
- bool threadgroup)
+ struct cgroup *new_cgroup, struct task_struct *task)
{
if (current != task && !capable(CAP_SYS_ADMIN))
return -EPERM;
Makes procs file writable to move all threads by tgid at once
From: Ben Blum <[email protected]>
This patch adds functionality that enables users to move all threads in a
threadgroup at once to a cgroup by writing the tgid to the 'cgroup.procs'
file. The current implementation uses a per-threadgroup rwsem that is
taken for reading in the fork() path to prevent newly forking threads within
the threadgroup from "escaping" while the move is in progress.
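For reviewers, here is a minimal sketch of the locking protocol the rwsem
implements. This is illustration only, not part of the patch: the example_*
function names are made up, the series actually wraps the write side in the
threadgroup_fork_write_lock()/unlock() helpers used further below, and the
real read-side calls live in copy_process() in kernel/fork.c.

/* assumes <linux/cgroup.h>, <linux/sched.h>, <linux/rwsem.h> */

/* write side: held across a whole-threadgroup migration */
static int example_move_threadgroup(struct cgroup *cgrp,
				    struct task_struct *leader)
{
	int ret;

	/* caller also holds cgroup_mutex, as in attach_task_by_pid() */
	down_write(&leader->signal->threadgroup_fork_lock);
	/* no CLONE_THREAD fork can complete while this is held */
	ret = cgroup_attach_proc(cgrp, leader);
	up_write(&leader->signal->threadgroup_fork_lock);
	return ret;
}

/* read side: taken around the cgroup fork hooks during fork() */
static void example_fork_hooks(struct task_struct *child)
{
	down_read(&current->signal->threadgroup_fork_lock);
	cgroup_fork(child);
	/* ... rest of copy_process() ... */
	cgroup_post_fork(child);
	up_read(&current->signal->threadgroup_fork_lock);
}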
Signed-off-by: Ben Blum <[email protected]>
---
Documentation/cgroups/cgroups.txt | 9 +
kernel/cgroup.c | 439 +++++++++++++++++++++++++++++++++----
2 files changed, 401 insertions(+), 47 deletions(-)
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 4b0377c..166f6e3 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -236,7 +236,8 @@ containing the following files describing that cgroup:
- cgroup.procs: list of tgids in the cgroup. This list is not
guaranteed to be sorted or free of duplicate tgids, and userspace
should sort/uniquify the list if this property is required.
- This is a read-only file, for now.
+ Writing a thread group id into this file moves all threads in that
+ group into this cgroup.
- notify_on_release flag: run the release agent on exit?
- release_agent: the path to use for release notifications (this file
exists in the top cgroup only)
@@ -430,6 +431,12 @@ You can attach the current shell task by echoing 0:
# echo 0 > tasks
+You can use the cgroup.procs file instead of the tasks file to move all
+threads in a threadgroup at once. Echoing the pid of any task in a
+threadgroup to cgroup.procs causes all tasks in that threadgroup to be
+attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
+in the writing task's threadgroup.
+
Note: Since every task is always a member of exactly one cgroup in each
mounted hierarchy, to remove a task from its current cgroup you must
move it into a new cgroup (possibly the root cgroup) by writing to the
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1f4037f..52dfb33 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1748,6 +1748,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
}
EXPORT_SYMBOL_GPL(cgroup_path);
+/*
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ *
+ * 'guarantee' is set if the caller promises that a new css_set for the task
+ * will already exist. If not set, this function might sleep, and can fail with
+ * -ENOMEM. Otherwise, it can only fail with -ESRCH.
+ */
+static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+ struct task_struct *tsk, bool guarantee)
+{
+ struct css_set *oldcg;
+ struct css_set *newcg;
+
+ /*
+ * get old css_set. we need to take task_lock and refcount it, because
+ * an exiting task can change its css_set to init_css_set and drop its
+ * old one without taking cgroup_mutex.
+ */
+ task_lock(tsk);
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
+ task_unlock(tsk);
+
+ /* locate or allocate a new css_set for this task. */
+ if (guarantee) {
+ /* we know the css_set we want already exists. */
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(oldcg, cgrp, template);
+ BUG_ON(!newcg);
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+ } else {
+ might_sleep();
+ /* find_css_set will give us newcg already referenced. */
+ newcg = find_css_set(oldcg, cgrp);
+ if (!newcg) {
+ put_css_set(oldcg);
+ return -ENOMEM;
+ }
+ }
+ put_css_set(oldcg);
+
+ /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ task_unlock(tsk);
+ put_css_set(newcg);
+ return -ESRCH;
+ }
+ rcu_assign_pointer(tsk->cgroups, newcg);
+ task_unlock(tsk);
+
+ /* Update the css_set linked lists if we're using them */
+ write_lock(&css_set_lock);
+ if (!list_empty(&tsk->cg_list))
+ list_move(&tsk->cg_list, &newcg->tasks);
+ write_unlock(&css_set_lock);
+
+ /*
+ * We just gained a reference on oldcg by taking it from the task. As
+ * trading it for newcg is protected by cgroup_mutex, we're safe to drop
+ * it here; it will be freed under RCU.
+ */
+ put_css_set(oldcg);
+
+ set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+ return 0;
+}
+
/**
* cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
* @cgrp: the cgroup the task is attaching to
@@ -1758,11 +1828,9 @@ EXPORT_SYMBOL_GPL(cgroup_path);
*/
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
- int retval = 0;
+ int retval;
struct cgroup_subsys *ss, *failed_ss = NULL;
struct cgroup *oldcgrp;
- struct css_set *cg;
- struct css_set *newcg;
struct cgroupfs_root *root = cgrp->root;
/* Nothing to do if the task is already in that cgroup */
@@ -1793,36 +1861,9 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
}
}
- task_lock(tsk);
- cg = tsk->cgroups;
- get_css_set(cg);
- task_unlock(tsk);
- /*
- * Locate or allocate a new css_set for this task,
- * based on its final set of cgroups
- */
- newcg = find_css_set(cg, cgrp);
- put_css_set(cg);
- if (!newcg) {
- retval = -ENOMEM;
- goto out;
- }
-
- task_lock(tsk);
- if (tsk->flags & PF_EXITING) {
- task_unlock(tsk);
- put_css_set(newcg);
- retval = -ESRCH;
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
+ if (retval)
goto out;
- }
- rcu_assign_pointer(tsk->cgroups, newcg);
- task_unlock(tsk);
-
- /* Update the css_set linked lists if we're using them */
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list))
- list_move(&tsk->cg_list, &newcg->tasks);
- write_unlock(&css_set_lock);
for_each_subsys(root, ss) {
if (ss->pre_attach)
@@ -1832,9 +1873,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
if (ss->attach)
ss->attach(ss, cgrp, oldcgrp, tsk);
}
- set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+
synchronize_rcu();
- put_css_set(cg);
/*
* wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -1884,49 +1924,356 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
/*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * held. May take task_lock of task
+ * cgroup_attach_proc works in two stages, the first of which prefetches all
+ * new css_sets needed (to make sure we have enough memory before committing
+ * to the move) and stores them in a list of entries of the following type.
+ * TODO: possible optimization: use css_set->rcu_head for chaining instead
+ */
+struct cg_list_entry {
+ struct css_set *cg;
+ struct list_head links;
+};
+
+static bool css_set_check_fetched(struct cgroup *cgrp,
+ struct task_struct *tsk, struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(cg, cgrp, template);
+ if (newcg)
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+
+ /* doesn't exist at all? */
+ if (!newcg)
+ return false;
+ /* see if it's already in the list */
+ list_for_each_entry(cg_entry, newcg_list, links) {
+ if (cg_entry->cg == newcg) {
+ put_css_set(newcg);
+ return true;
+ }
+ }
+
+ /* not found */
+ put_css_set(newcg);
+ return false;
+}
+
+/*
+ * Find the new css_set and store it in the list in preparation for moving the
+ * given task to the given cgroup. Returns 0 or -ENOMEM.
*/
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+
+ /* ensure a new css_set will exist for this thread */
+ newcg = find_css_set(cg, cgrp);
+ if (!newcg)
+ return -ENOMEM;
+ /* add it to the list */
+ cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
+ if (!cg_entry) {
+ put_css_set(newcg);
+ return -ENOMEM;
+ }
+ cg_entry->cg = newcg;
+ list_add(&cg_entry->links, newcg_list);
+ return 0;
+}
+
+/**
+ * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * @cgrp: the cgroup to attach to
+ * @leader: the threadgroup leader task_struct of the group to be attached
+ *
+ * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
+ * take task_lock of each thread in leader's threadgroup individually in turn.
+ */
+int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+{
+ int retval, i, group_size;
+ struct cgroup_subsys *ss, *failed_ss = NULL;
+ bool cancel_failed_ss = false;
+ /* guaranteed to be initialized later, but the compiler needs this */
+ struct cgroup *oldcgrp = NULL;
+ struct css_set *oldcg;
+ struct cgroupfs_root *root = cgrp->root;
+ /* threadgroup list cursor and array */
+ struct task_struct *tsk;
+ struct task_struct **group;
+ /*
+ * we need to make sure we have css_sets for all the tasks we're
+ * going to move -before- we actually start moving them, so that in
+ * case we get an ENOMEM we can bail out before making any changes.
+ */
+ struct list_head newcg_list;
+ struct cg_list_entry *cg_entry, *temp_nobe;
+
+ /*
+ * step 0: in order to do expensive, possibly blocking operations for
+ * every thread, we cannot iterate the thread group list, since it needs
+ * rcu or tasklist locked. instead, build an array of all threads in the
+ * group - threadgroup_fork_lock prevents new threads from appearing,
+ * and if threads exit, this will just be an over-estimate.
+ */
+ group_size = get_nr_threads(leader);
+ group = kmalloc(group_size * sizeof(*group), GFP_KERNEL);
+ if (!group)
+ return -ENOMEM;
+
+ /* prevent changes to the threadgroup list while we take a snapshot. */
+ rcu_read_lock();
+ if (!thread_group_leader(leader)) {
+ /*
+ * a race with de_thread from another thread's exec() may strip
+ * us of our leadership, making while_each_thread unsafe to use
+ * on this task. if this happens, there is no choice but to
+ * throw this task away and try again (from cgroup_procs_write);
+ * this is "double-double-toil-and-trouble-check locking".
+ */
+ rcu_read_unlock();
+ retval = -EAGAIN;
+ goto out_free_group_list;
+ }
+ /* take a reference on each task in the group to go in the array. */
+ tsk = leader;
+ i = 0;
+ do {
+ /* as per above, nr_threads may decrease, but not increase. */
+ BUG_ON(i >= group_size);
+ get_task_struct(tsk);
+ group[i] = tsk;
+ i++;
+ } while_each_thread(leader, tsk);
+ /* remember the number of threads in the array for later. */
+ group_size = i;
+ rcu_read_unlock();
+
+ /*
+ * step 1: check that we can legitimately attach to the cgroup.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->can_attach) {
+ retval = ss->can_attach(ss, cgrp, leader);
+ if (retval) {
+ failed_ss = ss;
+ goto out_cancel_attach;
+ }
+ }
+ /* a callback to be run on every thread in the threadgroup. */
+ if (ss->can_attach_task) {
+ /* run on each task in the threadgroup. */
+ for (i = 0; i < group_size; i++) {
+ retval = ss->can_attach_task(cgrp, group[i]);
+ if (retval) {
+ failed_ss = ss;
+ cancel_failed_ss = true;
+ goto out_cancel_attach;
+ }
+ }
+ }
+ }
+
+ /*
+ * step 2: make sure css_sets exist for all threads to be migrated.
+ * we use find_css_set, which allocates a new one if necessary.
+ */
+ INIT_LIST_HEAD(&newcg_list);
+ for (i = 0; i < group_size; i++) {
+ tsk = group[i];
+ /* nothing to do if this task is already in the cgroup */
+ oldcgrp = task_cgroup_from_root(tsk, root);
+ if (cgrp == oldcgrp)
+ continue;
+ /* get old css_set pointer */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ /* ignore this task if it's going away */
+ task_unlock(tsk);
+ continue;
+ }
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
+ task_unlock(tsk);
+ /* see if the new one for us is already in the list? */
+ if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
+ /* was already there, nothing to do. */
+ put_css_set(oldcg);
+ } else {
+ /* we don't already have it. get new one. */
+ retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+ put_css_set(oldcg);
+ if (retval)
+ goto out_list_teardown;
+ }
+ }
+
+ /*
+ * step 3: now that we're guaranteed success wrt the css_sets, proceed
+ * to move all tasks to the new cgroup, calling ss->attach_task for each
+ * one along the way. there are no failure cases after here, so this is
+ * the commit point.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->pre_attach)
+ ss->pre_attach(cgrp);
+ }
+ for (i = 0; i < group_size; i++) {
+ tsk = group[i];
+ /* leave current thread as it is if it's already there */
+ oldcgrp = task_cgroup_from_root(tsk, root);
+ if (cgrp == oldcgrp)
+ continue;
+ /* attach each task to each subsystem */
+ for_each_subsys(root, ss) {
+ if (ss->attach_task)
+ ss->attach_task(cgrp, tsk);
+ }
+ /* if the thread is PF_EXITING, it can just get skipped. */
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
+ BUG_ON(retval != 0 && retval != -ESRCH);
+ }
+ /* nothing is sensitive to fork() after this point. */
+
+ /*
+ * step 4: do expensive, non-thread-specific subsystem callbacks.
+ * TODO: if ever a subsystem needs to know the oldcgrp for each task
+ * being moved, this call will need to be reworked to communicate that.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->attach)
+ ss->attach(ss, cgrp, oldcgrp, leader);
+ }
+
+ /*
+ * step 5: success! and cleanup
+ */
+ synchronize_rcu();
+ cgroup_wakeup_rmdir_waiter(cgrp);
+ retval = 0;
+out_list_teardown:
+ /* clean up the list of prefetched css_sets. */
+ list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
+ list_del(&cg_entry->links);
+ put_css_set(cg_entry->cg);
+ kfree(cg_entry);
+ }
+out_cancel_attach:
+ /* same deal as in cgroup_attach_task */
+ if (retval) {
+ for_each_subsys(root, ss) {
+ if (ss == failed_ss) {
+ if (cancel_failed_ss && ss->cancel_attach)
+ ss->cancel_attach(ss, cgrp, leader);
+ break;
+ }
+ if (ss->cancel_attach)
+ ss->cancel_attach(ss, cgrp, leader);
+ }
+ }
+ /* clean up the array of referenced threads in the group. */
+ for (i = 0; i < group_size; i++)
+ put_task_struct(group[i]);
+out_free_group_list:
+ kfree(group);
+ return retval;
+}
+
+/*
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will take
+ * cgroup_mutex; may take task_lock of task.
+ */
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
{
struct task_struct *tsk;
const struct cred *cred = current_cred(), *tcred;
int ret;
+ if (!cgroup_lock_live_group(cgrp))
+ return -ENODEV;
+
if (pid) {
rcu_read_lock();
tsk = find_task_by_vpid(pid);
- if (!tsk || tsk->flags & PF_EXITING) {
+ if (!tsk) {
rcu_read_unlock();
+ cgroup_unlock();
+ return -ESRCH;
+ }
+ if (threadgroup) {
+ /*
+ * RCU protects this access, since tsk was found in the
+ * tid map. a race with de_thread may cause group_leader
+ * to stop being the leader, but cgroup_attach_proc will
+ * detect it later.
+ */
+ tsk = tsk->group_leader;
+ } else if (tsk->flags & PF_EXITING) {
+ /* optimization for the single-task-only case */
+ rcu_read_unlock();
+ cgroup_unlock();
return -ESRCH;
}
+ /*
+ * even if we're attaching all tasks in the thread group, we
+ * only need to check permissions on one of them.
+ */
tcred = __task_cred(tsk);
if (cred->euid &&
cred->euid != tcred->uid &&
cred->euid != tcred->suid) {
rcu_read_unlock();
+ cgroup_unlock();
return -EACCES;
}
get_task_struct(tsk);
rcu_read_unlock();
} else {
- tsk = current;
+ if (threadgroup)
+ tsk = current->group_leader;
+ else
+ tsk = current;
get_task_struct(tsk);
}
- ret = cgroup_attach_task(cgrp, tsk);
+ if (threadgroup) {
+ threadgroup_fork_write_lock(tsk);
+ ret = cgroup_attach_proc(cgrp, tsk);
+ threadgroup_fork_write_unlock(tsk);
+ } else {
+ ret = cgroup_attach_task(cgrp, tsk);
+ }
put_task_struct(tsk);
+ cgroup_unlock();
return ret;
}
static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
{
+ return attach_task_by_pid(cgrp, pid, false);
+}
+
+static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+{
int ret;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
- ret = attach_task_by_pid(cgrp, pid);
- cgroup_unlock();
+ do {
+ /*
+ * attach_proc fails with -EAGAIN if threadgroup leadership
+ * changes in the middle of the operation, in which case we need
+ * to find the task_struct for the new leader and start over.
+ */
+ ret = attach_task_by_pid(cgrp, tgid, true);
+ } while (ret == -EAGAIN);
return ret;
}
@@ -3283,9 +3630,9 @@ static struct cftype files[] = {
{
.name = CGROUP_FILE_GENERIC_PREFIX "procs",
.open = cgroup_procs_open,
- /* .write_u64 = cgroup_procs_write, TODO */
+ .write_u64 = cgroup_procs_write,
.release = cgroup_pidlist_release,
- .mode = S_IRUGO,
+ .mode = S_IRUGO | S_IWUSR,
},
{
.name = "notify_on_release",
Convert cgroup_attach_proc to use flex_array.
From: Ben Blum <[email protected]>
The cgroup_attach_proc implementation requires a pre-allocated array to store
task pointers to atomically move a thread-group, but asking for a monolithic
array with kmalloc() may be unreliable for very large groups. Using flex_array
provides the same functionality with less risk of failure.
This is a post-patch for cgroup-procs-write.patch.
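For anyone unfamiliar with the flex_array API, the allocation pattern the
patch relies on boils down to the following sketch. The example_* names are
made up, error handling is abbreviated, and the flex_array_prealloc() range
mirrors the patch's own call (elements 0 through nr - 1).

/* assumes <linux/flex_array.h>, <linux/gfp.h>, <linux/sched.h>, <linux/bug.h> */

/* sketch: room for nr task pointers without one big kmalloc() */
static struct flex_array *example_alloc_snapshot(int nr)
{
	struct flex_array *fa;

	fa = flex_array_alloc(sizeof(struct task_struct *), nr, GFP_KERNEL);
	if (!fa)
		return NULL;
	/* pre-fault the element pages so later puts can use GFP_ATOMIC */
	if (flex_array_prealloc(fa, 0, nr - 1, GFP_KERNEL)) {
		flex_array_free(fa);
		return NULL;
	}
	return fa;
}

static struct task_struct *example_store_and_read(struct flex_array *fa,
						  int i,
						  struct task_struct *tsk)
{
	/* cannot fail after the prealloc above */
	BUG_ON(flex_array_put_ptr(fa, i, tsk, GFP_ATOMIC));
	return flex_array_get_ptr(fa, i);
}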
Signed-off-by: Ben Blum <[email protected]>
---
kernel/cgroup.c | 33 ++++++++++++++++++++++++---------
1 files changed, 24 insertions(+), 9 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 52dfb33..8236895 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,6 +57,7 @@
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/eventfd.h>
#include <linux/poll.h>
+#include <linux/flex_array.h> /* used in cgroup_attach_proc */
#include <asm/atomic.h>
@@ -2008,7 +2009,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
struct cgroupfs_root *root = cgrp->root;
/* threadgroup list cursor and array */
struct task_struct *tsk;
- struct task_struct **group;
+ struct flex_array *group;
/*
* we need to make sure we have css_sets for all the tasks we're
* going to move -before- we actually start moving them, so that in
@@ -2025,9 +2026,15 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
* and if threads exit, this will just be an over-estimate.
*/
group_size = get_nr_threads(leader);
- group = kmalloc(group_size * sizeof(*group), GFP_KERNEL);
+ /* flex_array supports very large thread-groups better than kmalloc. */
+ group = flex_array_alloc(sizeof(struct task_struct *), group_size,
+ GFP_KERNEL);
if (!group)
return -ENOMEM;
+ /* pre-allocate to guarantee space while iterating in rcu read-side. */
+ retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
+ if (retval)
+ goto out_free_group_list;
/* prevent changes to the threadgroup list while we take a snapshot. */
rcu_read_lock();
@@ -2050,7 +2057,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
/* as per above, nr_threads may decrease, but not increase. */
BUG_ON(i >= group_size);
get_task_struct(tsk);
- group[i] = tsk;
+ /*
+ * saying GFP_ATOMIC has no effect here because we did prealloc
+ * earlier, but it's good form to communicate our expectations.
+ */
+ retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
+ BUG_ON(retval != 0);
i++;
} while_each_thread(leader, tsk);
/* remember the number of threads in the array for later. */
@@ -2072,7 +2084,8 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
if (ss->can_attach_task) {
/* run on each task in the threadgroup. */
for (i = 0; i < group_size; i++) {
- retval = ss->can_attach_task(cgrp, group[i]);
+ tsk = flex_array_get_ptr(group, i);
+ retval = ss->can_attach_task(cgrp, tsk);
if (retval) {
failed_ss = ss;
cancel_failed_ss = true;
@@ -2088,7 +2101,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
*/
INIT_LIST_HEAD(&newcg_list);
for (i = 0; i < group_size; i++) {
- tsk = group[i];
+ tsk = flex_array_get_ptr(group, i);
/* nothing to do if this task is already in the cgroup */
oldcgrp = task_cgroup_from_root(tsk, root);
if (cgrp == oldcgrp)
@@ -2127,7 +2140,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
ss->pre_attach(cgrp);
}
for (i = 0; i < group_size; i++) {
- tsk = group[i];
+ tsk = flex_array_get_ptr(group, i);
/* leave current thread as it is if it's already there */
oldcgrp = task_cgroup_from_root(tsk, root);
if (cgrp == oldcgrp)
@@ -2180,10 +2193,12 @@ out_cancel_attach:
}
}
/* clean up the array of referenced threads in the group. */
- for (i = 0; i < group_size; i++)
- put_task_struct(group[i]);
+ for (i = 0; i < group_size; i++) {
+ tsk = flex_array_get_ptr(group, i);
+ put_task_struct(tsk);
+ }
out_free_group_list:
- kfree(group);
+ flex_array_free(group);
return retval;
}
On Wed, 6 Apr 2011 15:44:20 -0400
Ben Blum <[email protected]> wrote:
> Same as before; using flex_array in attach_proc (thanks Kame).
>
> -- Ben
>
> ---
> Documentation/cgroups/cgroups.txt | 39 ++-
> block/blk-cgroup.c | 18 -
> include/linux/cgroup.h | 10
> include/linux/init_task.h | 9
> include/linux/sched.h | 36 ++
> kernel/cgroup.c | 489 +++++++++++++++++++++++++++++++++-----
> kernel/cgroup_freezer.c | 26 --
> kernel/cpuset.c | 96 +++----
> kernel/fork.c | 10
> kernel/sched.c | 38 --
> mm/memcontrol.c | 18 -
> security/device_cgroup.c | 3
> 12 files changed, 594 insertions(+), 198 deletions(-)
So where are we up to with all this.
I'm surprised that none of the patches had anyone's Acked-by: or
Reviewed-by:. Were they really that mean to you, or have you not been
tracking these?
On Tue, Apr 12, 2011 at 04:25:16PM -0700, Andrew Morton wrote:
> On Wed, 6 Apr 2011 15:44:20 -0400
> Ben Blum <[email protected]> wrote:
>
> > Same as before; using flex_array in attach_proc (thanks Kame).
> >
> > -- Ben
> >
> > ---
> > Documentation/cgroups/cgroups.txt | 39 ++-
> > block/blk-cgroup.c | 18 -
> > include/linux/cgroup.h | 10
> > include/linux/init_task.h | 9
> > include/linux/sched.h | 36 ++
> > kernel/cgroup.c | 489 +++++++++++++++++++++++++++++++++-----
> > kernel/cgroup_freezer.c | 26 --
> > kernel/cpuset.c | 96 +++----
> > kernel/fork.c | 10
> > kernel/sched.c | 38 --
> > mm/memcontrol.c | 18 -
> > security/device_cgroup.c | 3
> > 12 files changed, 594 insertions(+), 198 deletions(-)
>
> So where are we up to with all this.
done and good to go, hopefully? :O
>
> I'm surprised that none of the patches had anyone's Acked-by: or
> Reviewed-by:. Were they really that mean to you, or have you not been
> tracking these?
>
>
Oh, eep. I didn't think to put them there myself; I guess I was assuming
they'd either be implicit or that my reviewers would have something more
to say.
Thanks!
-- Ben
Andrew Morton wrote:
> On Wed, 6 Apr 2011 15:44:20 -0400
> Ben Blum <[email protected]> wrote:
>
>> Same as before; using flex_array in attach_proc (thanks Kame).
>>
>> -- Ben
>>
>> ---
>> Documentation/cgroups/cgroups.txt | 39 ++-
>> block/blk-cgroup.c | 18 -
>> include/linux/cgroup.h | 10
>> include/linux/init_task.h | 9
>> include/linux/sched.h | 36 ++
>> kernel/cgroup.c | 489 +++++++++++++++++++++++++++++++++-----
>> kernel/cgroup_freezer.c | 26 --
>> kernel/cpuset.c | 96 +++----
>> kernel/fork.c | 10
>> kernel/sched.c | 38 --
>> mm/memcontrol.c | 18 -
>> security/device_cgroup.c | 3
>> 12 files changed, 594 insertions(+), 198 deletions(-)
>
> So where are we up to with all this.
>
> I'm surprised that none of the patches had anyone's Acked-by: or
> Reviewed-by:. Were they really that mean to you, or have you not been
> tracking these?
>
>
Paul reviewed the patchset and explicitly gave his Reviewed-by tag for all
three patches.
And I'm going to do some testing for it.