On Sat, Aug 22, 2009 at 03:09:52PM +0200, Oleg Nesterov wrote:
>
> OK, cgroups-add-ability-to-move-all-threads-in-a-process-to-a-new-cgroup-atomically.patch
> does
>
> threadgroup_sighand = threadgroup_fork_lock(leader);
>
> rcu_read_lock();
> list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group)
>
> and this is equally wrong afais. Hmm, and other similar
> list_for_each_entry_rcu's doesn't look right. This code can do something
> like
>
> rcu_read_lock();
> if (!tsk->sighand) // tsk is leader or not, doesn't matter
> fail;
> list_for_each_rcu(tsk->thread_group) {}
> rcu_read_unlock();
>
> though.
>
>
>
> And why do we need sighand->threadgroup_fork_lock ? I gueess, to protect
> against clone(CLONE_THREAD).
>
> But. threadgroup_fork_lock() has another problem. Even if the process
> P is singlethreaded, I can't see how ->threadgroup_fork_lock work.
>
> threadgroup_fork_lock() bumps P->sighand->count. If P exec, it will
> notice sighand->count != 1 and switch to another ->sighand.
>
> Oleg.
So how about this: Each time we take tasklist_lock to iterate over
thread_group (once in getting the sighand, once to move all the tasks),
check if we raced with exec. The two problems are 1) group_leader
changes - we'll need to find the new leader's task_struct anyway - means
we can't iterate over thread_group, and 2) sighand changes after we take
the old one, means we've taken a useless lock.
I put together draft revisions of the old patches that check for racing
with exec in both cases, and if so, returning EAGAIN. I have the wrapper
function cgroup_procs_write loop around the return value, but it could
also possibly give EAGAIN back to userspace. Hopefully the code is safe
and sane this time :)
-- bblum
---
Documentation/cgroups/cgroups.txt | 7
include/linux/cgroup.h | 14 -
include/linux/init_task.h | 9
include/linux/sched.h | 15 +
kernel/cgroup.c | 519 ++++++++++++++++++++++++++++++++++----
kernel/fork.c | 9
6 files changed, 524 insertions(+), 49 deletions(-)
Adds functionality to read/write lock CLONE_THREAD fork()ing per-threadgroup
From: Ben Blum <[email protected]>
This patch adds an rwsem that lives in a threadgroup's sighand_struct (next to
the sighand's atomic count, to piggyback on its cacheline), and two functions
in kernel/cgroup.c (for now) for easily+safely obtaining and releasing it. If
another part of the kernel later wants to use such a locking mechanism, the
CONFIG_CGROUPS ifdefs should be changed to a higher-up flag that CGROUPS and
the other system would both depend on, and the lock/unlock functions could be
moved to sched.c or so.
This is a pre-patch for cgroups-procs-write.patch.
Signed-off-by: Ben Blum <[email protected]>
---
include/linux/cgroup.h | 14 +++++--
include/linux/init_task.h | 9 ++++
include/linux/sched.h | 15 +++++++
kernel/cgroup.c | 93 ++++++++++++++++++++++++++++++++++++++++++++-
kernel/fork.c | 9 +++-
5 files changed, 131 insertions(+), 9 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 9be4c22..2eb54bb 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -30,10 +30,12 @@ extern int cgroup_init(void);
extern void cgroup_lock(void);
extern bool cgroup_lock_live_group(struct cgroup *cgrp);
extern void cgroup_unlock(void);
-extern void cgroup_fork(struct task_struct *p);
+extern void cgroup_fork(struct task_struct *p, unsigned long clone_flags);
extern void cgroup_fork_callbacks(struct task_struct *p);
-extern void cgroup_post_fork(struct task_struct *p);
+extern void cgroup_post_fork(struct task_struct *p, unsigned long clone_flags);
extern void cgroup_exit(struct task_struct *p, int run_callbacks);
+extern void cgroup_fork_failed(struct task_struct *p, int run_callbacks,
+ unsigned long clone_flags);
extern int cgroupstats_build(struct cgroupstats *stats,
struct dentry *dentry);
extern int cgroup_load_subsys(struct cgroup_subsys *ss);
@@ -580,10 +582,14 @@ unsigned short css_depth(struct cgroup_subsys_state *css);
static inline int cgroup_init_early(void) { return 0; }
static inline int cgroup_init(void) { return 0; }
-static inline void cgroup_fork(struct task_struct *p) {}
+static inline void cgroup_fork(struct task_struct *p,
+ unsigned long clone_flags) {}
static inline void cgroup_fork_callbacks(struct task_struct *p) {}
-static inline void cgroup_post_fork(struct task_struct *p) {}
+static inline void cgroup_post_fork(struct task_struct *p,
+ unsigned long clone_flags) {}
static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
+static inline void cgroup_fork_failed(struct task_struct *p, int callbacks,
+ unsigned long clone_flags) {}
static inline void cgroup_lock(void) {}
static inline void cgroup_unlock(void) {}
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 8ed0abf..aaa4b9c 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -41,7 +41,16 @@ extern struct nsproxy init_nsproxy;
INIT_IPC_NS(ipc_ns) \
}
+#ifdef CONFIG_CGROUPS
+# define INIT_THREADGROUP_FORK_LOCK(sighand) \
+ .threadgroup_fork_lock = \
+ __RWSEM_INITIALIZER(sighand.threadgroup_fork_lock),
+#else
+# define INIT_THREADGROUP_FORK_LOCK(sighand)
+#endif
+
#define INIT_SIGHAND(sighand) { \
+ INIT_THREADGROUP_FORK_LOCK(sighand) \
.count = ATOMIC_INIT(1), \
.action = { { { .sa_handler = NULL, } }, }, \
.siglock = __SPIN_LOCK_UNLOCKED(sighand.siglock), \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 23b26c7..10a22a5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -475,6 +475,21 @@ extern int get_dumpable(struct mm_struct *mm);
#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
struct sighand_struct {
+#ifdef CONFIG_CGROUPS
+ /*
+ * The threadgroup_fork_lock is used to prevent any threads in a
+ * threadgroup from forking with CLONE_THREAD while held for writing,
+ * used for threadgroup-wide operations that are fork-sensitive. It
+ * lives here next to sighand.count as a cacheline optimization.
+ *
+ * TODO: if anybody besides cgroups uses this lock, change the
+ * CONFIG_CGROUPS to a higher-up CONFIG_* that the other user and
+ * cgroups would both depend upon. Also, they'll want to move where
+ * the readlock happens - it currently lives in kernel/cgroup.c in
+ * cgroup_{fork,post_fork,fork_failed}().
+ */
+ struct rw_semaphore threadgroup_fork_lock;
+#endif
atomic_t count;
struct k_sigaction action[_NSIG];
spinlock_t siglock;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index cc2e1f6..99782a0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1623,6 +1623,71 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
}
/**
+ * threadgroup_fork_lock - block all CLONE_THREAD forks in the threadgroup
+ * @tsk: the task whose threadgroup should be locked
+ *
+ * Takes the threadgroup_lock_mutex in the threadgroup's sighand_struct, by
+ * means of searching the threadgroup list for a live thread in the group.
+ * Returns the sighand_struct that should be given to threadgroup_fork_unlock,
+ * or -ESRCH if all threads in the group are exiting and have cleared their
+ * sighand pointers, or -EAGAIN if tsk is not the threadgroup leader.
+ */
+struct sighand_struct *threadgroup_fork_lock(struct task_struct *tsk)
+{
+ struct sighand_struct *sighand;
+ struct task_struct *p;
+
+ /* tasklist lock protects sighand_struct's disappearance in exit(). */
+ read_lock(&tasklist_lock);
+
+ /* make sure the threadgroup's state is sane before we proceed */
+ if (unlikely(!thread_group_leader(tsk))) {
+ /* a race with de_thread() stripped us of our leadership */
+ read_unlock(&tasklist_lock);
+ return ERR_PTR(-EAGAIN);
+ }
+
+ /* now try to find a sighand */
+ if (likely(tsk->sighand)) {
+ sighand = tsk->sighand;
+ } else {
+ sighand = ERR_PTR(-ESRCH);
+ /*
+ * tsk is exiting; try to find another thread in the group
+ * whose sighand pointer is still alive.
+ */
+ list_for_each_entry_rcu(p, &tsk->thread_group, thread_group) {
+ if (p->sighand) {
+ sighand = tsk->sighand;
+ break;
+ }
+ }
+ }
+ /* prevent sighand from vanishing before we let go of tasklist_lock */
+ if (likely(sighand))
+ atomic_inc(&sighand->count);
+
+ /* done searching. */
+ read_unlock(&tasklist_lock);
+
+ if (likely(sighand))
+ down_write(&sighand->threadgroup_fork_lock);
+ return sighand;
+}
+
+/**
+ * threadgroup_fork_lock - let threadgroup resume CLONE_THREAD forks.
+ * @sighand: the threadgroup's sighand that threadgroup_fork_lock gave back
+ *
+ * Lets go of the threadgroup_fork_lock, and drops the sighand reference.
+ */
+void threadgroup_fork_unlock(struct sighand_struct *sighand)
+{
+ up_write(&sighand->threadgroup_fork_lock);
+ __cleanup_sighand(sighand);
+}
+
+/**
* cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
* @cgrp: the cgroup the task is attaching to
* @tsk: the task to be attached
@@ -3713,8 +3778,10 @@ static const struct file_operations proc_cgroupstats_operations = {
* At the point that cgroup_fork() is called, 'current' is the parent
* task, and the passed argument 'child' points to the child task.
*/
-void cgroup_fork(struct task_struct *child)
+void cgroup_fork(struct task_struct *child, unsigned long clone_flags)
{
+ if (clone_flags & CLONE_THREAD)
+ down_read(¤t->sighand->threadgroup_fork_lock);
task_lock(current);
child->cgroups = current->cgroups;
get_css_set(child->cgroups);
@@ -3756,7 +3823,7 @@ void cgroup_fork_callbacks(struct task_struct *child)
* with the first call to cgroup_iter_start() - to guarantee that the
* new task ends up on its list.
*/
-void cgroup_post_fork(struct task_struct *child)
+void cgroup_post_fork(struct task_struct *child, unsigned long clone_flags)
{
if (use_task_css_set_links) {
write_lock(&css_set_lock);
@@ -3766,6 +3833,8 @@ void cgroup_post_fork(struct task_struct *child)
task_unlock(child);
write_unlock(&css_set_lock);
}
+ if (clone_flags & CLONE_THREAD)
+ up_read(¤t->sighand->threadgroup_fork_lock);
}
/**
* cgroup_exit - detach cgroup from exiting task
@@ -3841,6 +3910,26 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
}
/**
+ * cgroup_fork_failed - undo operations for fork failure
+ * @tsk: pointer to task_struct of exiting process
+ * @run_callback: run exit callbacks?
+ *
+ * Description: Undo cgroup operations after cgroup_fork in fork failure.
+ *
+ * We release the read lock that was taken in cgroup_fork(), since it is
+ * supposed to be dropped in cgroup_post_fork in the success case. The other
+ * thing that wants to be done is detaching the failed child task from the
+ * cgroup, so we wrap cgroup_exit.
+ */
+void cgroup_fork_failed(struct task_struct *tsk, int run_callbacks,
+ unsigned long clone_flags)
+{
+ if (clone_flags & CLONE_THREAD)
+ up_read(¤t->sighand->threadgroup_fork_lock);
+ cgroup_exit(tsk, run_callbacks);
+}
+
+/**
* cgroup_clone - clone the cgroup the given subsystem is attached to
* @tsk: the task to be moved
* @subsys: the given subsystem
diff --git a/kernel/fork.c b/kernel/fork.c
index 404e6ca..daf5967 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -809,6 +809,9 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
return -ENOMEM;
atomic_set(&sig->count, 1);
memcpy(sig->action, current->sighand->action, sizeof(sig->action));
+#ifdef CONFIG_CGROUPS
+ init_rwsem(&sig->threadgroup_fork_lock);
+#endif
return 0;
}
@@ -1091,7 +1094,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
monotonic_to_bootbased(&p->real_start_time);
p->io_context = NULL;
p->audit_context = NULL;
- cgroup_fork(p);
+ cgroup_fork(p, clone_flags);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
if (IS_ERR(p->mempolicy)) {
@@ -1316,7 +1319,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
spin_unlock(¤t->sighand->siglock);
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
- cgroup_post_fork(p);
+ cgroup_post_fork(p, clone_flags);
perf_event_fork(p);
return p;
@@ -1350,7 +1353,7 @@ bad_fork_cleanup_policy:
mpol_put(p->mempolicy);
bad_fork_cleanup_cgroup:
#endif
- cgroup_exit(p, cgroup_callbacks_done);
+ cgroup_fork_failed(p, cgroup_callbacks_done, clone_flags);
delayacct_tsk_free(p);
module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
Makes procs file writable to move all threads by tgid at once
From: Ben Blum <[email protected]>
This patch adds functionality that enables users to move all threads in a
threadgroup at once to a cgroup by writing the tgid to the 'cgroup.procs'
file. This current implementation makes use of a per-threadgroup rwsem that's
taken for reading in the fork() path to prevent newly forking threads within
the threadgroup from "escaping" while the move is in progress.
Signed-off-by: Ben Blum <[email protected]>
---
Documentation/cgroups/cgroups.txt | 7 +
kernel/cgroup.c | 426 ++++++++++++++++++++++++++++++++++---
2 files changed, 393 insertions(+), 40 deletions(-)
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 7527bac..a5f1e6a 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -9,6 +9,7 @@ Portions Copyright (C) 2004 BULL SA.
Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
Modified by Paul Jackson <[email protected]>
Modified by Christoph Lameter <[email protected]>
+Modified by Ben Blum <[email protected]>
CONTENTS:
=========
@@ -415,6 +416,12 @@ You can attach the current shell task by echoing 0:
# echo 0 > tasks
+You can use the cgroup.procs file instead of the tasks file to move all
+threads in a threadgroup at once. Echoing the pid of any task in a
+threadgroup to cgroup.procs causes all tasks in that threadgroup to be
+be attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
+in the writing task's threadgroup.
+
2.3 Mounting hierarchies by name
--------------------------------
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 99782a0..f79d70b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1622,6 +1622,87 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
return 0;
}
+/*
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ *
+ * 'guarantee' is set if the caller promises that a new css_set for the task
+ * will already exist. If not set, this function might sleep, and can fail
+ * with -ENOMEM. Otherwise, it can only fail with -ESRCH.
+ */
+static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+ struct task_struct *tsk, int guarantee)
+{
+ struct css_set *oldcg;
+ struct css_set *newcg;
+
+ /*
+ * get old css_set. we need to take task_lock and refcount it, because
+ * an exiting task can change its css_set to init_css_set and drop its
+ * old one without taking cgroup_mutex.
+ */
+ task_lock(tsk);
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
+ task_unlock(tsk);
+
+ /*
+ * locate or allocate a new css_set for this task. 'guarantee' tells
+ * us whether or not we are sure that a new css_set already exists;
+ * in that case, we are not allowed to fail or sleep, as we won't need
+ * malloc.
+ */
+ if (guarantee) {
+ /*
+ * our caller promises us that the css_set we want already
+ * exists, so we use find_existing_css_set directly.
+ */
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(oldcg, cgrp, template);
+ BUG_ON(!newcg);
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+ } else {
+ might_sleep();
+ /* find_css_set will give us newcg already referenced. */
+ newcg = find_css_set(oldcg, cgrp);
+ if (!newcg) {
+ put_css_set(oldcg);
+ return -ENOMEM;
+ }
+ }
+ put_css_set(oldcg);
+
+ /*
+ * we cannot move a task that's declared itself as exiting, as once
+ * PF_EXITING is set, the tsk->cgroups pointer is no longer safe.
+ */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ task_unlock(tsk);
+ put_css_set(newcg);
+ return -ESRCH;
+ }
+ rcu_assign_pointer(tsk->cgroups, newcg);
+ task_unlock(tsk);
+
+ /* Update the css_set linked lists if we're using them */
+ write_lock(&css_set_lock);
+ if (!list_empty(&tsk->cg_list))
+ list_move(&tsk->cg_list, &newcg->tasks);
+ write_unlock(&css_set_lock);
+
+ /*
+ * We just gained a reference on oldcg by taking it from the task. As
+ * trading it for newcg is protected by cgroup_mutex, we're safe to
+ * drop it here; it will be freed under RCU.
+ */
+ put_css_set(oldcg);
+
+ set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+ return 0;
+}
+
/**
* threadgroup_fork_lock - block all CLONE_THREAD forks in the threadgroup
* @tsk: the task whose threadgroup should be locked
@@ -1697,11 +1778,9 @@ void threadgroup_fork_unlock(struct sighand_struct *sighand)
*/
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
- int retval = 0;
+ int retval;
struct cgroup_subsys *ss;
struct cgroup *oldcgrp;
- struct css_set *cg;
- struct css_set *newcg;
struct cgroupfs_root *root = cgrp->root;
/* Nothing to do if the task is already in that cgroup */
@@ -1717,75 +1796,326 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
}
}
- task_lock(tsk);
- cg = tsk->cgroups;
- get_css_set(cg);
- task_unlock(tsk);
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, 0);
+ if (retval)
+ return retval;
+
+ for_each_subsys(root, ss) {
+ if (ss->attach)
+ ss->attach(ss, cgrp, oldcgrp, tsk, false);
+ }
+
+ synchronize_rcu();
+
/*
- * Locate or allocate a new css_set for this task,
- * based on its final set of cgroups
+ * wake up rmdir() waiter. the rmdir should fail since the cgroup
+ * is no longer empty.
*/
+ cgroup_wakeup_rmdir_waiter(cgrp);
+ return 0;
+}
+
+/*
+ * cgroup_attach_proc works in two stages, the first of which prefetches all
+ * new css_sets needed (to make sure we have enough memory before committing
+ * to the move) and stores them in a list, of entries of the following type.
+ * TODO: possible optimization: use css_set->rcu_head for chaining instead
+ */
+struct cg_list_entry {
+ struct css_set *cg;
+ struct list_head links;
+};
+
+static bool css_set_check_fetched(struct cgroup *cgrp,
+ struct task_struct *tsk, struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(cg, cgrp, template);
+ if (newcg)
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+
+ /* doesn't exist at all? */
+ if (!newcg)
+ return false;
+ /* see if it's already in the list */
+ list_for_each_entry(cg_entry, newcg_list, links) {
+ if (cg_entry->cg == newcg) {
+ put_css_set(newcg);
+ return true;
+ }
+ }
+
+ /* not found */
+ put_css_set(newcg);
+ return false;
+}
+
+/*
+ * Find the new css_set and store it in the list in preparation for moving
+ * the given task to the given cgroup. Returns 0 on success, -ENOMEM if we
+ * run out of memory.
+ */
+static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+ /* ensure a new css_set will exist for this thread */
newcg = find_css_set(cg, cgrp);
- put_css_set(cg);
if (!newcg)
return -ENOMEM;
+ /* add new element to list */
+ cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
+ if (!cg_entry) {
+ put_css_set(newcg);
+ return -ENOMEM;
+ }
+ cg_entry->cg = newcg;
+ list_add(&cg_entry->links, newcg_list);
+ return 0;
+}
- task_lock(tsk);
- if (tsk->flags & PF_EXITING) {
+/**
+ * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * @cgrp: the cgroup to attach to
+ * @leader: the threadgroup leader task_struct of the group to be attached
+ *
+ * Call holding cgroup_mutex. Will take task_lock of each thread in leader's
+ * threadgroup individually in turn.
+ */
+int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+{
+ int retval;
+ struct cgroup_subsys *ss;
+ struct cgroup *oldcgrp;
+ struct css_set *oldcg;
+ struct cgroupfs_root *root = cgrp->root;
+ /* threadgroup list cursor */
+ struct task_struct *tsk;
+ /*
+ * we need to make sure we have css_sets for all the tasks we're
+ * going to move -before- we actually start moving them, so that in
+ * case we get an ENOMEM we can bail out before making any changes.
+ */
+ struct list_head newcg_list;
+ struct cg_list_entry *cg_entry;
+ /* needed for locking the threadgroup */
+ struct sighand_struct *threadgroup_sighand;
+
+ /*
+ * because of possible races with de_thread() we can't distinguish
+ * between the case where the user gives a non-leader tid and the case
+ * where it changes out from under us.
+ */
+ leader = leader->group_leader;
+
+ /*
+ * check that we can legitimately attach to the cgroup.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->can_attach) {
+ retval = ss->can_attach(ss, cgrp, leader, true);
+ if (retval)
+ return retval;
+ }
+ }
+
+ /*
+ * step 1: make sure css_sets exist for all threads to be migrated.
+ * we use find_css_set, which allocates a new one if necessary.
+ */
+ INIT_LIST_HEAD(&newcg_list);
+ oldcgrp = task_cgroup_from_root(leader, root);
+ if (cgrp != oldcgrp) {
+ /* get old css_set */
+ task_lock(leader);
+ if (leader->flags & PF_EXITING) {
+ task_unlock(leader);
+ goto prefetch_loop;
+ }
+ oldcg = leader->cgroups;
+ get_css_set(oldcg);
+ task_unlock(leader);
+ /* acquire new one */
+ retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+ put_css_set(oldcg);
+ if (retval)
+ goto list_teardown;
+ }
+prefetch_loop:
+ rcu_read_lock();
+ /*
+ * if we need to fetch a new css_set for this task, we must exit the
+ * rcu_read section because allocating it can sleep. afterwards, we'll
+ * need to restart iteration on the threadgroup list - the whole thing
+ * will be O(nm) in the number of threads and css_sets; as the typical
+ * case only has one css_set for all of them, usually O(n).
+ */
+ list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
+ /* nothing to do if this task is already in the cgroup */
+ oldcgrp = task_cgroup_from_root(tsk, root);
+ if (cgrp == oldcgrp)
+ continue;
+ /* get old css_set pointer */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ /* ignore this task if it's going away */
+ task_unlock(tsk);
+ continue;
+ }
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
task_unlock(tsk);
- put_css_set(newcg);
- return -ESRCH;
+ /* see if the new one for us is already in the list? */
+ if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
+ /* was already there, nothing to do. */
+ put_css_set(oldcg);
+ } else {
+ /* we don't already have it. get new one. */
+ rcu_read_unlock();
+ retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+ put_css_set(oldcg);
+ if (retval)
+ goto list_teardown;
+ /* begin iteration again. */
+ goto prefetch_loop;
+ }
}
- rcu_assign_pointer(tsk->cgroups, newcg);
- task_unlock(tsk);
+ rcu_read_unlock();
- /* Update the css_set linked lists if we're using them */
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list)) {
- list_del(&tsk->cg_list);
- list_add(&tsk->cg_list, &newcg->tasks);
+ /*
+ * step 2: now that we're guaranteed success wrt the css_sets, proceed
+ * to move all tasks to the new cgroup. Even if the threadgroup leader
+ * is PF_EXITING, we still proceed to move all of its sub-threads to
+ * the new cgroup; if everybody is PF_EXITING, we'll just end up doing
+ * nothing, which is ok.
+ */
+ oldcgrp = task_cgroup_from_root(leader, root);
+ /* if leader is already there, skip moving him */
+ if (cgrp != oldcgrp) {
+ retval = cgroup_task_migrate(cgrp, oldcgrp, leader, 1);
+ BUG_ON(retval != 0 && retval != -ESRCH);
}
- write_unlock(&css_set_lock);
+ /*
+ * now move all the rest of the threads - need to lock against
+ * possible races with fork(). (Remember, the sighand's lock needs
+ * to be taken outside of tasklist_lock.)
+ */
+ threadgroup_sighand = threadgroup_fork_lock(leader);
+ if (unlikely(IS_ERR(threadgroup_sighand))) {
+ /*
+ * this happens with either ESRCH or EAGAIN; either way, the
+ * calling function takes care of it.
+ */
+ retval = PTR_ERR(threadgroup_sighand);
+ goto list_teardown;
+ }
+ read_lock(&tasklist_lock);
+ /*
+ * Finally, before we can continue, make sure the threadgroup is sane.
+ * First, if de_thread() changed the leader, then no guarantees on the
+ * safety of iterating leader->thread_group. Second, regardless of
+ * leader, de_thread() can change the sighand since we grabbed a
+ * reference on it. Either case is a race with exec() and therefore
+ * not safe to proceed.
+ */
+ if (!thread_group_leader(leader) ||
+ (leader->sighand && leader->sighand != threadgroup_sighand)) {
+ retval = -EAGAIN;
+ read_unlock(&tasklist_lock);
+ threadgroup_fork_unlock(threadgroup_sighand);
+ goto list_teardown;
+ }
+
+ list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
+ /* leave current thread as it is if it's already there */
+ oldcgrp = task_cgroup_from_root(tsk, root);
+ if (cgrp == oldcgrp)
+ continue;
+ /* we don't care whether these threads are exiting */
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, 1);
+ BUG_ON(retval != 0 && retval != -ESRCH);
+ }
+
+ /*
+ * step 3: attach whole threadgroup to each subsystem
+ * TODO: if ever a subsystem needs to know the oldcgrp for each task
+ * being moved, this call will need to be reworked to communicate that
+ * information.
+ */
for_each_subsys(root, ss) {
if (ss->attach)
- ss->attach(ss, cgrp, oldcgrp, tsk, false);
+ ss->attach(ss, cgrp, oldcgrp, tsk, true);
}
- set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
- synchronize_rcu();
- put_css_set(cg);
+
+ /* holding these until here keeps us safe from exec() and fork(). */
+ read_unlock(&tasklist_lock);
+ threadgroup_fork_unlock(threadgroup_sighand);
/*
- * wake up rmdir() waiter. the rmdir should fail since the cgroup
- * is no longer empty.
+ * step 4: success! ...and cleanup
*/
+ synchronize_rcu();
cgroup_wakeup_rmdir_waiter(cgrp);
- return 0;
+ retval = 0;
+list_teardown:
+ /* no longer need the list of css_sets, so get rid of it */
+ while (!list_empty(&newcg_list)) {
+ /* pop from the list */
+ cg_entry = list_first_entry(&newcg_list, struct cg_list_entry,
+ links);
+ list_del(&cg_entry->links);
+ /* drop the refcount */
+ put_css_set(cg_entry->cg);
+ kfree(cg_entry);
+ }
+ /* done! */
+ return retval;
}
/*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * held. May take task_lock of task
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will take
+ * cgroup_mutex; may take task_lock of task.
*/
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid,
+ int attach(struct cgroup *,
+ struct task_struct *))
{
struct task_struct *tsk;
const struct cred *cred = current_cred(), *tcred;
int ret;
+ if (!cgroup_lock_live_group(cgrp))
+ return -ENODEV;
+
if (pid) {
rcu_read_lock();
tsk = find_task_by_vpid(pid);
if (!tsk || tsk->flags & PF_EXITING) {
rcu_read_unlock();
+ cgroup_unlock();
return -ESRCH;
}
-
+ /*
+ * even if we're attaching all tasks in the thread group, we
+ * only need to check permissions on the group leader, because
+ * even if another task has different permissions, the group
+ * leader will have sufficient access to change it.
+ */
tcred = __task_cred(tsk);
if (cred->euid &&
cred->euid != tcred->uid &&
cred->euid != tcred->suid) {
rcu_read_unlock();
+ cgroup_unlock();
return -EACCES;
}
get_task_struct(tsk);
@@ -1795,18 +2125,34 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
get_task_struct(tsk);
}
- ret = cgroup_attach_task(cgrp, tsk);
+ /*
+ * Note that the check for whether the task is its threadgroup leader
+ * is done in cgroup_attach_proc. This means that writing 0 to the
+ * procs file will only work if the writing task is the leader.
+ */
+ ret = attach(cgrp, tsk);
put_task_struct(tsk);
+ cgroup_unlock();
return ret;
}
static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
{
+ return attach_task_by_pid(cgrp, pid, cgroup_attach_task);
+}
+
+static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+{
int ret;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
- ret = attach_task_by_pid(cgrp, pid);
- cgroup_unlock();
+ do {
+ /*
+ * Nobody lower than us can handle the EAGAIN, since if a race
+ * with de_thread() changes the group leader, the task_struct
+ * matching the given tgid will have changed, and we'll need
+ * to find it again.
+ */
+ ret = attach_task_by_pid(cgrp, tgid, cgroup_attach_proc);
+ } while (ret == -EAGAIN);
return ret;
}
@@ -2966,9 +3312,9 @@ static struct cftype files[] = {
{
.name = CGROUP_FILE_GENERIC_PREFIX "procs",
.open = cgroup_procs_open,
- /* .write_u64 = cgroup_procs_write, TODO */
+ .write_u64 = cgroup_procs_write,
.release = cgroup_pidlist_release,
- .mode = S_IRUGO,
+ .mode = S_IRUGO | S_IWUSR,
},
{
.name = "notify_on_release",
On 01/03, Ben Blum wrote:
>
> Adds functionality to read/write lock CLONE_THREAD fork()ing per-threadgroup
I didn't actually read this series, but at first glance it still has
problems...
> +struct sighand_struct *threadgroup_fork_lock(struct task_struct *tsk)
static?
> +{
> + struct sighand_struct *sighand;
> + struct task_struct *p;
> +
> + /* tasklist lock protects sighand_struct's disappearance in exit(). */
> + read_lock(&tasklist_lock);
> +
> + /* make sure the threadgroup's state is sane before we proceed */
> + if (unlikely(!thread_group_leader(tsk))) {
> + /* a race with de_thread() stripped us of our leadership */
> + read_unlock(&tasklist_lock);
> + return ERR_PTR(-EAGAIN);
I don't understand how this can close the race with de_thread().
Suppose this tsk is the new leader, after de_thread() changed ->group_leader
and dropped tasklist_lock.
threadgroup_fork_lock() bumps sighand->count
de_thread() continues, notices oldsighand->count != 1 and switches
to the new ->sighand.
After that tsk can spawn other threads, but cgroup_fork() will use
newsighand->threadgroup_fork_lock while cgroup_attach_proc() relies
on oldsighand->threadgroup_fork_lock.
> + /* now try to find a sighand */
> + if (likely(tsk->sighand)) {
> + sighand = tsk->sighand;
> + } else {
> + sighand = ERR_PTR(-ESRCH);
> + /*
> + * tsk is exiting; try to find another thread in the group
> + * whose sighand pointer is still alive.
> + */
> + list_for_each_entry_rcu(p, &tsk->thread_group, thread_group) {
> + if (p->sighand) {
> + sighand = tsk->sighand;
can't understand this "else {}" code... We hold tasklist, if the group
leader is dead (->sighand == NULL), then the whole thread group is
dead.
Even if we had another thread with ->sighand != NULL, what is the point
of "if (unlikely(!thread_group_leader(tsk)))" check then?
Oleg.
On Tue, Jan 05, 2010 at 07:53:30PM +0100, Oleg Nesterov wrote:
> On 01/03, Ben Blum wrote:
> >
> > Adds functionality to read/write lock CLONE_THREAD fork()ing per-threadgroup
>
> I didn't actually read this series, but at first glance it still has
> problems...
>
> > +struct sighand_struct *threadgroup_fork_lock(struct task_struct *tsk)
>
> static?
sure, though in the end this is perhaps not the best place for the
function anyway. in fact, this function only does half of the job, so
a good amount of refactoring might be in order.
>
> > +{
> > + struct sighand_struct *sighand;
> > + struct task_struct *p;
> > +
> > + /* tasklist lock protects sighand_struct's disappearance in exit(). */
> > + read_lock(&tasklist_lock);
> > +
> > + /* make sure the threadgroup's state is sane before we proceed */
> > + if (unlikely(!thread_group_leader(tsk))) {
> > + /* a race with de_thread() stripped us of our leadership */
> > + read_unlock(&tasklist_lock);
> > + return ERR_PTR(-EAGAIN);
>
> I don't understand how this can close the race with de_thread().
>
> Suppose this tsk is the new leader, after de_thread() changed ->group_leader
> and dropped tasklist_lock.
>
> threadgroup_fork_lock() bumps sighand->count
>
> de_thread() continues, notices oldsighand->count != 1 and switches
> to the new ->sighand.
>
> After that tsk can spawn other threads, but cgroup_fork() will use
> newsighand->threadgroup_fork_lock while cgroup_attach_proc() relies
> on oldsighand->threadgroup_fork_lock.
the race with the sighand is handled in the next patch, in attach_proc,
not in this function. this check is just to make sure that the list is
safe to iterate over, since de_thread changing group leadership could
ruin that. so in the end, there are two places where EAGAIN can happen -
one here, and one later (in the second patch).
>
> > + /* now try to find a sighand */
> > + if (likely(tsk->sighand)) {
> > + sighand = tsk->sighand;
> > + } else {
> > + sighand = ERR_PTR(-ESRCH);
> > + /*
> > + * tsk is exiting; try to find another thread in the group
> > + * whose sighand pointer is still alive.
> > + */
> > + list_for_each_entry_rcu(p, &tsk->thread_group, thread_group) {
> > + if (p->sighand) {
> > + sighand = tsk->sighand;
>
> can't understand this "else {}" code... We hold tasklist, if the group
> leader is dead (->sighand == NULL), then the whole thread group is
> dead.
>
> Even if we had another thread with ->sighand != NULL, what is the point
> of "if (unlikely(!thread_group_leader(tsk)))" check then?
doesn't the group leader stay on the threadgroup list even when it dies?
sighand can be null if the group leader has exited, but other threads
are still running.
>
> Oleg.
>
hope that makes more sense. I'd like to have the code between these two
patches refactored, but first want to make sure it's correct.
-- bblum
On Mon, May 31, 2010 at 11:04 AM, Oleg Nesterov <[email protected]> wrote:
>
> And, forgot to mention, I do not understand the PF_EXITING check in
> attach_task_by_pid() (and some others).
>
> At first glance, it buys nothing. PF_EXITING can be set right after
> the check.
It can, but it's a benign race.
Moving a non-current thread into a cgroup takes task->alloc_lock and
checks for PF_EXITING before manipulating that thread's cgroup links.
The exit procedure sets PF_EXITING and then (somewhat later, but
guaranteed) moves current to the root cgroups while holding
alloc_lock. If a task hasn't set PF_EXITING by the time we check for
it, while holding alloc_lock, it can't enter the cgroup cleanup code
until we've finished moving it to its new cgroup. If it has set
PF_EXITING by that point, it's guaranteed to be moving itself to the
root cgroups and disconnecting from various cgroups structures in the
very near future, so it's fine to refuse to move it to a new cgroup.
Paul
On 06/01, Paul Menage wrote:
>
> On Mon, May 31, 2010 at 11:04 AM, Oleg Nesterov <[email protected]> wrote:
> >
> > And, forgot to mention, I do not understand the PF_EXITING check in
> > attach_task_by_pid() (and some others).
> >
> > At first glance, it buys nothing. PF_EXITING can be set right after
> > the check.
>
> It can, but it's a benign race.
Yes,
> Moving a non-current thread into a cgroup takes task->alloc_lock and
> checks for PF_EXITING before manipulating that thread's cgroup links.
> The exit procedure sets PF_EXITING and then (somewhat later, but
> guaranteed) moves current to the root cgroups while holding
> alloc_lock.
Yes sure, I understand this part. cgroup_attach_task() correctly checks
PF_EXITING under task_lock(), this protects from the case when this
task has already passed cgroup_exit() which takes this lock too.
But. This exactly means that the PF_EXITING check in attach_task_by_pid()
is not necessary from the correctness pov (while probably can be considered
as optimization), right?
And,
> so it's fine to refuse to move it to a new cgroup.
I am not sure. It doesn't hurt when we try to move a thread. But if
we want to move the whole thread group, we should proceed even if the
group leader has already exited and thus has PF_EXITING bit set.
That was my question.
Oleg.
On Wed, Jun 2, 2010 at 7:06 AM, Oleg Nesterov <[email protected]> wrote:
>
> Yes sure, I understand this part. cgroup_attach_task() correctly checks
> PF_EXITING under task_lock(), this protects from the case when this
> task has already passed cgroup_exit() which takes this lock too.
Right.
>
> But. This exactly means that the PF_EXITING check in attach_task_by_pid()
> is not necessary from the correctness pov (while probably can be considered
> as optimization), right?
For the value of correctness that was relevant when writing that code
originally, I think it's fine. But ...
> I am not sure. It doesn't hurt when we try to move a thread. But if
> we want to move the whole thread group, we should proceed even if the
> group leader has already exited and thus has PF_EXITING bit set.
Hmm, maybe. I could see this being argued both ways. Can the process
hang around indefinitely with the leader as a zombie and the other
threads still running?
It wouldn't be that hard to make it possible to avoid relying on
PF_EXITING as the check - instead we'd have an exiting_css_set object
that have the same pointer set as init_css_set, but would be
distinguishable from it as a task->cgroups pointer by virtue of being
a different object. Then cgroup_exit() can reassign tasks to point to
exiting_css_set rather than init_css_set, and the various checks that
are currently made for (task->flags & PF_EXITING) could check for
(task->cgroups == exiting_css_set) instead. This would allow task
movement further into the exit process.
Paul
On 06/02, Paul Menage wrote:
>
> On Wed, Jun 2, 2010 at 7:06 AM, Oleg Nesterov <[email protected]> wrote:
> >
> > I am not sure. It doesn't hurt when we try to move a thread. But if
> > we want to move the whole thread group, we should proceed even if the
> > group leader has already exited and thus has PF_EXITING bit set.
>
> Hmm, maybe. I could see this being argued both ways. Can the process
> hang around indefinitely with the leader as a zombie and the other
> threads still running?
Yes sure. The main thread can exit via sys_exit(), this doesn't affect
the thread group. Of course, I am not saying this is common practice,
perhaps no "important" app does this.
> It wouldn't be that hard to make it possible to avoid relying on
> PF_EXITING as the check - instead we'd have an exiting_css_set object
> that have the same pointer set as init_css_set, but would be
> distinguishable from it as a task->cgroups pointer by virtue of being
> a different object. Then cgroup_exit() can reassign tasks to point to
> exiting_css_set rather than init_css_set, and the various checks that
> are currently made for (task->flags & PF_EXITING) could check for
> (task->cgroups == exiting_css_set) instead. This would allow task
> movement further into the exit process.
It is too late for me to even try to understand the above ;)
But I still can't understand why we can't just remove it. Both
cgroup_attach_task() and cgroup_attach_proc() should handle the
possible races correctly anyway.
Oleg.
On Wed, Jun 2, 2010 at 1:20 PM, Oleg Nesterov <[email protected]> wrote:
>
> Yes sure. The main thread can exit via sys_exit(), this doesn't affect
> the thread group. Of course, I am not saying this is common practice,
> perhaps no "important" app does this.
This check has been in cgroups for quite a while and no-one's
complained so far...
> But I still can't understand why we can't just remove it. Both
> cgroup_attach_task() and cgroup_attach_proc() should handle the
> possible races correctly anyway.
The "it" that you're proposing to remove is in fact the code that
handles those races.
Anyway, I think this issue is orthogonal to the movability of entire
processes - with Ben's patch, an exited-but-not-reaped group leader is
immovable either via "tasks" or "cgroup.procs". Changing the race
condition handling to allow such movement would be a separate issue.
Paul
Paul
On 06/02, Paul Menage wrote:
>
> On Wed, Jun 2, 2010 at 1:20 PM, Oleg Nesterov <[email protected]> wrote:
> >
> > Yes sure. The main thread can exit via sys_exit(), this doesn't affect
> > the thread group. Of course, I am not saying this is common practice,
> > perhaps no "important" app does this.
>
> This check has been in cgroups for quite a while and no-one's
> complained so far...
Sure. because currently attach_task_by_pid() works with the single
thread. But Ben's patch reuses this code to call cgroup_attach_proc().
> > But I still can't understand why we can't just remove it. Both
> > cgroup_attach_task() and cgroup_attach_proc() should handle the
> > possible races correctly anyway.
>
> The "it" that you're proposing to remove is in fact the code that
> handles those races.
In that case I confused, and I thought we already agreed that
the PF_EXITING check in attach_task_by_pid() is not strictly needed
for correctness.
Once again, the task can call do_exit() and set PF_EXITING right
after the check.
> Anyway, I think this issue is orthogonal to the movability of entire
> processes - with Ben's patch, an exited-but-not-reaped group leader is
> immovable either via "tasks" or "cgroup.procs".
Hmm. As I said, I didn't really read this patch. But I thought that
cgroup_attach_proc() tries to handle this.
Anyway I agree, this is minor and I never pretended I understand
this code.
Hmm. The usage of ->thread_group in ->can_attach() methods doesn't
look safe to me... but currently bool threadgroup is always false.
Oleg.
On Wed, Jun 2, 2010 at 1:58 PM, Oleg Nesterov <[email protected]> wrote:
>> The "it" that you're proposing to remove is in fact the code that
>> handles those races.
>
> In that case I confused, and I thought we already agreed that
> the PF_EXITING check in attach_task_by_pid() is not strictly needed
> for correctness.
Not quite - something is required for correctness, and the PF_EXITING
check provides that correctness, with a very small window (between
setting PF_EXITING and calling cgroup_exit) where we might arguably
have been able to move the thread but decline to do so because it's
simpler not to do so and no-one cares. That's the optimization that I
meant - the data structures are slightly simpler since there's no way
to tell when a task has passed cgroup_exit(), and instead we just see
if they've passed PF_EXITING.
>
> Once again, the task can call do_exit() and set PF_EXITING right
> after the check.
Yes, the important part is that they haven't set it *before* the check
in attach_task_by_pid(). If they have set it before that, then they
could be anywhere in the exit path after PF_EXITING, and we decline to
move them since it's possible that they've already passed
cgroup_exit(). If the exiting task has not yet set PF_EXITING, then it
can't possibly get into the critical section in cgroup_exit() since
attach_task_by_pid() holds task->alloc_lock.
Paul
On 06/02, Paul Menage wrote:
>
> On Wed, Jun 2, 2010 at 1:58 PM, Oleg Nesterov <[email protected]> wrote:
> >> The "it" that you're proposing to remove is in fact the code that
> >> handles those races.
> >
> > In that case I confused, and I thought we already agreed that
> > the PF_EXITING check in attach_task_by_pid() is not strictly needed
> > for correctness.
>
> Not quite - something is required for correctness, and the PF_EXITING
> check provides that correctness, with a very small window (between
> setting PF_EXITING and calling cgroup_exit) where we might arguably
> have been able to move the thread but decline to do so because it's
> simpler not to do so and no-one cares. That's the optimization that I
> meant - the data structures are slightly simpler since there's no way
> to tell when a task has passed cgroup_exit(), and instead we just see
> if they've passed PF_EXITING.
>
> >
> > Once again, the task can call do_exit() and set PF_EXITING right
> > after the check.
>
> Yes, the important part is that they haven't set it *before* the check
> in attach_task_by_pid(). If they have set it before that, then they
> could be anywhere in the exit path after PF_EXITING, and we decline to
> move them since it's possible that they've already passed
> cgroup_exit(). If the exiting task has not yet set PF_EXITING, then it
> can't possibly get into the critical section in cgroup_exit() since
> attach_task_by_pid() holds task->alloc_lock.
It doesn't ? At least in Linus's tree.
cgroup_attach_task() does, and this time PF_EXITING is understandable.
Oleg.
On Wed, Jun 2, 2010 at 2:38 PM, Oleg Nesterov <[email protected]> wrote:
>
> cgroup_attach_task() does, and this time PF_EXITING is understandable.
>
Oh, OK, I was talking about the one in cgroup_attach_task(), which is
called by attach_task_by_pid(), and assumed that you were referring to
the same one. I'd forgotten about the pre-check in
attach_task_by_pid(). Yes, that one could probably be removed without
affecting any final outcomes.
Paul
On Wed, Jun 02, 2010 at 10:58:55PM +0200, Oleg Nesterov wrote:
> Hmm. The usage of ->thread_group in ->can_attach() methods doesn't
> look safe to me... but currently bool threadgroup is always false.
>
> Oleg.
>
>
I recall putting a rcu_read_lock() around that part and being assured
that made it safe. But I don't remember the details. Maybe taking
tasklist_lock is necessary?
-- Ben
On Wed, Jun 02, 2010 at 03:03:49PM -0700, Paul Menage wrote:
> On Wed, Jun 2, 2010 at 2:38 PM, Oleg Nesterov <[email protected]> wrote:
> >
> > cgroup_attach_task() does, and this time PF_EXITING is understandable.
> >
>
> Oh, OK, I was talking about the one in cgroup_attach_task(), which is
> called by attach_task_by_pid(), and assumed that you were referring to
> the same one. I'd forgotten about the pre-check in
> attach_task_by_pid(). Yes, that one could probably be removed without
> affecting any final outcomes.
>
> Paul
Yes, I agree. The check should be moved into cgroup_attach_task before
the ss->can_attach calls, so the "optimization" stays in the case where
it's wanted.
And the use of __task_cred after the exiting check seems safe too if the
check is gone - from cred.h: "The caller must make sure task doesn't go
away, either by holding a ref on task or by holding tasklist_lock to
prevent it from being unlinked." So that means it'd be safe to take a
reference, task sets exiting and does everything before unhashing, then
access the cred pointer; since we hold tasklist lock it's the same case.
-- Ben
On Mon, May 31, 2010 at 07:52:42PM +0200, Oleg Nesterov wrote:
> I only glanced into one function, cgroup_attach_proc(), and some things
> look "obviously wrong". Sorry, I can't really read these patches now,
> most probably I misunderstood the code...
>
> > +int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
> > +{
> > + int retval;
> > + struct cgroup_subsys *ss, *failed_ss = NULL;
> > + struct cgroup *oldcgrp;
> > + struct css_set *oldcg;
> > + struct cgroupfs_root *root = cgrp->root;
> > + /* threadgroup list cursor */
> > + struct task_struct *tsk;
> > + /*
> > + * we need to make sure we have css_sets for all the tasks we're
> > + * going to move -before- we actually start moving them, so that in
> > + * case we get an ENOMEM we can bail out before making any changes.
> > + */
> > + struct list_head newcg_list;
> > + struct cg_list_entry *cg_entry, *temp_nobe;
> > +
> > + /*
> > + * Note: Because of possible races with de_thread(), we can't
> > + * distinguish between the case where the user gives a non-leader tid
> > + * and the case where it changes out from under us. So both are allowed.
> > + */
>
> OK, the caller has a reference to the argument, leader,
>
> > + leader = leader->group_leader;
>
> But why it is safe to use leader->group_leader if we race with exec?
This line means "let's try to find who the leader is", since
attach_task_by_pid doesn't grab it for us. It's not "safe", and we still
check if it's really the leader later (just before the 'commit point').
Note that before this line 'leader' doesn't really mean the leader -
perhaps i should rename the variables :P
But maybe I also want to grab a reference on the new task? I can't
remember whether I need to or not. I'm not sure whether or not I need to
grab an rcu lock, but it doesn't seem necessary because of the commit
point check later on. Plus can_attach takes the rcu lock itself for
iterating if it needs it.
>
> > + list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
>
> Even if we didn't change "leader" above, this is not safe in theory.
> We already discussed this, list_for_each_rcu(head) is only safe when
> we know that "head" itself is valid.
>
> Suppose that this leader exits, then leader->thread_group.next exits
> too before we take rcu_read_lock().
Why is that a problem? I thought leader->thread_group is supposed to
stay sane as long as leader is the leader.
This looks like it needs a check to see if 'leader' is still really the
leader, but nothing more.
>
> > + oldcgrp = task_cgroup_from_root(leader, root);
> > + if (cgrp != oldcgrp) {
> > + retval = cgroup_task_migrate(cgrp, oldcgrp, leader, true);
> > + BUG_ON(retval != 0 && retval != -ESRCH);
> > + }
> > + /* Now iterate over each thread in the group. */
> > + list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
> > + BUG_ON(tsk->signal != leader->signal);
> > + /* leave current thread as it is if it's already there */
> > + oldcgrp = task_cgroup_from_root(tsk, root);
> > + if (cgrp == oldcgrp)
> > + continue;
> > + /* we don't care whether these threads are exiting */
> > + retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
> > + BUG_ON(retval != 0 && retval != -ESRCH);
> > + }
>
> This looks strange. Why do we move leader outside of the loop ?
> Of course, list_for_each_entry() can't work to move all sub-threads,
> but "do while_each_thread()" can.
do/while_each_thread oves over all threads in the system, rather than
just the threadgroup... this isn't supposed to be a fast operation, but
that seems like overkill.
>
> From 0/2:
> >
> > recentish changes to signal_struct's lifetime rules (which don't seem to
> > appear when I check out mmotm with git clone,
>
> already in Linus's tree.
>
> Oleg.
>
-- Ben
On 06/03, Ben Blum wrote:
>
> On Mon, May 31, 2010 at 07:52:42PM +0200, Oleg Nesterov wrote:
> >
> > OK, the caller has a reference to the argument, leader,
> >
> > > + leader = leader->group_leader;
> >
> > But why it is safe to use leader->group_leader if we race with exec?
>
> This line means "let's try to find who the leader is", since
> attach_task_by_pid doesn't grab it for us. It's not "safe", and we still
> check if it's really the leader later (just before the 'commit point').
It is not safe to even dereference this memory, it can point to nowhere.
I do not remember how this patch does "check if it's really the leader later",
but in any case this is too late: iirc at least can_attach(leader) was called.
> Note that before this line 'leader' doesn't really mean the leader -
Yes,
> perhaps i should rename the variables :P
>
> But maybe I also want to grab a reference on the new task?
Of course.
Not that I really understand why this task must be ->group_leader at
this point.
> > > + list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
> >
> > Even if we didn't change "leader" above, this is not safe in theory.
> > We already discussed this, list_for_each_rcu(head) is only safe when
> > we know that "head" itself is valid.
> >
> > Suppose that this leader exits, then leader->thread_group.next exits
> > too before we take rcu_read_lock().
>
> Why is that a problem? I thought leader->thread_group is supposed to
> stay sane as long as leader is the leader.
Unless we race with exec/exit. Yes, this race is very unlikely.
> This looks like it needs a check to see if 'leader' is still really the
> leader, but nothing more.
Or you can check pid_alive(). Again, you don't really need ->group_leader
to iterate over thread-group.
> > > + oldcgrp = task_cgroup_from_root(leader, root);
> > > + if (cgrp != oldcgrp) {
> > > + retval = cgroup_task_migrate(cgrp, oldcgrp, leader, true);
> > > + BUG_ON(retval != 0 && retval != -ESRCH);
> > > + }
> > > + /* Now iterate over each thread in the group. */
> > > + list_for_each_entry_rcu(tsk, &leader->thread_group, thread_group) {
> > > + BUG_ON(tsk->signal != leader->signal);
> > > + /* leave current thread as it is if it's already there */
> > > + oldcgrp = task_cgroup_from_root(tsk, root);
> > > + if (cgrp == oldcgrp)
> > > + continue;
> > > + /* we don't care whether these threads are exiting */
> > > + retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
> > > + BUG_ON(retval != 0 && retval != -ESRCH);
> > > + }
> >
> > This looks strange. Why do we move leader outside of the loop ?
> > Of course, list_for_each_entry() can't work to move all sub-threads,
> > but "do while_each_thread()" can.
>
> do/while_each_thread oves over all threads in the system, rather than
> just the threadgroup... this isn't supposed to be a fast operation, but
> that seems like overkill.
What are you talking about? ;)
do/while_each_thread iterates over threadgroup.
Oleg.
On 06/03, Ben Blum wrote:
>
> On Wed, Jun 02, 2010 at 10:58:55PM +0200, Oleg Nesterov wrote:
> > Hmm. The usage of ->thread_group in ->can_attach() methods doesn't
> > look safe to me... but currently bool threadgroup is always false.
>
> I recall putting a rcu_read_lock() around that part and being assured
> that made it safe. But I don't remember the details. Maybe taking
> tasklist_lock is necessary?
rcu_read_lock() is not enough, see another email I sent.
Once again.
rcu_read_lock()
list_for_each_rcu(tsk->thread_group)
assumes that at least tsk->thread_group->next can't point to nowhere,
this is not true. This memory can go away _before_ we take rcu lock.
Oleg.