LinuxLists.cc - [RFC][PATCH] cgroups: Run subsystem fork callback from cgroup_post_fork()

2012-02-24 04:23:40

Subject: [RFC][PATCH] cgroups: Run subsystem fork callback from cgroup_post_fork()

When a user freezes a cgroup, the freezer sets the subsystem state
to CGROUP_FREEZING and then iterates over the tasks in the cgroup links.

But there is a possible race here, although unlikely, if a task
forks and the parent is preempted between write_unlock(tasklist_lock)
and cgroup_post_fork(). If we freeze the cgroup while the parent
is sleeping and the parent wakes up thereafter, its child will
be missing from the set of tasks to freeze because:

- The child was not yet linked to its css_set->tasks, as is done
from cgroup_post_fork(). cgroup_iter_start() has thus missed it.

- The cgroup freezer's fork callback can handle that child but
cgroup_fork_callbacks() has been called already.

One way to fix this is to call the fork callbacks after we link
the task to the css set. The cgroup freezer is the only user of
this callback anyway.

Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Li Zefan <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Mandeep Singh Baines <[email protected]>
---

Not sure this is the right solution, especially as I still need
a cancellable fork callback for my task counter and for this I
need the fork callbacks to be called before the task is added
on the tasklist. But anyway at least that reports this race.

include/linux/cgroup.h | 4 ++--
kernel/cgroup.c | 44 ++++++++++++++++----------------------------
kernel/exit.c | 2 +-
kernel/fork.c | 8 --------
4 files changed, 19 insertions(+), 39 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 501adb1..1d3f3ce 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -34,7 +34,7 @@ extern void cgroup_unlock(void);
extern void cgroup_fork(struct task_struct *p);
extern void cgroup_fork_callbacks(struct task_struct *p);
extern void cgroup_post_fork(struct task_struct *p);
-extern void cgroup_exit(struct task_struct *p, int run_callbacks);
+extern void cgroup_exit(struct task_struct *p);
extern int cgroupstats_build(struct cgroupstats *stats,
struct dentry *dentry);
extern int cgroup_load_subsys(struct cgroup_subsys *ss);
@@ -611,7 +611,7 @@ static inline int cgroup_init(void) { return 0; }
static inline void cgroup_fork(struct task_struct *p) {}
static inline void cgroup_fork_callbacks(struct task_struct *p) {}
static inline void cgroup_post_fork(struct task_struct *p) {}
-static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
+static inline void cgroup_exit(struct task_struct *p) {}

static inline void cgroup_lock(void) {}
static inline void cgroup_unlock(void) {}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c6877fe..bdf874b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4496,31 +4496,6 @@ void cgroup_fork(struct task_struct *child)
}

/**
- * cgroup_fork_callbacks - run fork callbacks
- * @child: the new task
- *
- * Called on a new task very soon before adding it to the
- * tasklist. No need to take any locks since no-one can
- * be operating on this task.
- */
-void cgroup_fork_callbacks(struct task_struct *child)
-{
- if (need_forkexit_callback) {
- int i;
- /*
- * forkexit callbacks are only supported for builtin
- * subsystems, and the builtin section of the subsys array is
- * immutable, so we don't need to lock the subsys array here.
- */
- for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss->fork)
- ss->fork(child);
- }
- }
-}
-
-/**
* cgroup_post_fork - called on a new task after adding it to the task list
* @child: the task in question
*
@@ -4559,11 +4534,24 @@ void cgroup_post_fork(struct task_struct *child)
}
write_unlock(&css_set_lock);
}
+
+ if (need_forkexit_callback) {
+ int i;
+ /*
+ * forkexit callbacks are only supported for builtin
+ * subsystems, and the builtin section of the subsys array is
+ * immutable, so we don't need to lock the subsys array here.
+ */
+ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (ss->fork)
+ ss->fork(child);
+ }
+ }
}
/**
* cgroup_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
- * @run_callback: run exit callbacks?
*
* Description: Detach cgroup from @tsk and release it.
*
@@ -4595,7 +4583,7 @@ void cgroup_post_fork(struct task_struct *child)
* which wards off any cgroup_attach_task() attempts, or task is a failed
* fork, never visible to cgroup_attach_task.
*/
-void cgroup_exit(struct task_struct *tsk, int run_callbacks)
+void cgroup_exit(struct task_struct *tsk)
{
struct css_set *cg;
int i;
@@ -4617,7 +4605,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
cg = tsk->cgroups;
tsk->cgroups = &init_css_set;

- if (run_callbacks && need_forkexit_callback) {
+ if (need_forkexit_callback) {
/*
* modular subsystems can't use callbacks, so no need to lock
* the subsys array
diff --git a/kernel/exit.c b/kernel/exit.c
index 294b170..d975233 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -990,7 +990,7 @@ void do_exit(long code)
*/
perf_event_exit_task(tsk);

- cgroup_exit(tsk, 1);
+ cgroup_exit(tsk);

if (group_dead)
disassociate_ctty(1);
diff --git a/kernel/fork.c b/kernel/fork.c
index 051f090..d016fe9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1053,7 +1053,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
{
int retval;
struct task_struct *p;
- int cgroup_callbacks_done = 0;

if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -1305,12 +1304,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->group_leader = p;
INIT_LIST_HEAD(&p->thread_group);

- /* Now that the task is set up, run cgroup callbacks if
- * necessary. We need to run them before the task is visible
- * on the tasklist. */
- cgroup_fork_callbacks(p);
- cgroup_callbacks_done = 1;
-
/* Need tasklist lock for parent etc handling! */
write_lock_irq(&tasklist_lock);

@@ -1413,7 +1406,6 @@ bad_fork_cleanup_cgroup:
#endif
if (clone_flags & CLONE_THREAD)
threadgroup_change_end(current);
- cgroup_exit(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
--
1.7.5.4

2012-02-24 04:32:59

by Frederic Weisbecker

[permalink] [raw]

Subject: Re: [RFC][PATCH] cgroups: Run subsystem fork callback from cgroup_post_fork()

On Fri, Feb 24, 2012 at 05:23:30AM +0100, Frederic Weisbecker wrote:
> When a user freezes a cgroup, the freezer sets the subsystem state
> to CGROUP_FREEZING and then iterates over the tasks in the cgroup links.
>
> But there is a possible race here, although unlikely, if a task
> forks and the parent is preempted between write_unlock(tasklist_lock)
> and cgroup_post_fork(). If we freeze the cgroup while the parent
> is sleeping and the parent wakes up thereafter, its child will
> be missing from the set of tasks to freeze because:
>
> - The child was not yet linked to its css_set->tasks, as is done
> from cgroup_post_fork(). cgroup_iter_start() has thus missed it.
>
> - The cgroup freezer's fork callback can handle that child but
> cgroup_fork_callbacks() has been called already.
>
> One way to fix this is to call the fork callbacks after we link
> the task to the css set. The cgroup freezer is the only user of
> this callback anyway.
>
> Signed-off-by: Frederic Weisbecker <[email protected]>
> Cc: Li Zefan <[email protected]>
> Cc: Tejun Heo <[email protected]>
> Cc: Oleg Nesterov <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: Mandeep Singh Baines <[email protected]>
> ---
>
> Not sure this is the right solution, especially as I still need
> a cancellable fork callback for my task counter and for this I
> need the fork callbacks to be called before the task is added
> on the tasklist. But anyway at least that reports this race.
>
> include/linux/cgroup.h | 4 ++--
> kernel/cgroup.c | 44 ++++++++++++++++----------------------------
> kernel/exit.c | 2 +-
> kernel/fork.c | 8 --------
> 4 files changed, 19 insertions(+), 39 deletions(-)
>
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 501adb1..1d3f3ce 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -34,7 +34,7 @@ extern void cgroup_unlock(void);
> extern void cgroup_fork(struct task_struct *p);
> extern void cgroup_fork_callbacks(struct task_struct *p);
> extern void cgroup_post_fork(struct task_struct *p);
> -extern void cgroup_exit(struct task_struct *p, int run_callbacks);
> +extern void cgroup_exit(struct task_struct *p);
> extern int cgroupstats_build(struct cgroupstats *stats,
> struct dentry *dentry);
> extern int cgroup_load_subsys(struct cgroup_subsys *ss);
> @@ -611,7 +611,7 @@ static inline int cgroup_init(void) { return 0; }
> static inline void cgroup_fork(struct task_struct *p) {}
> static inline void cgroup_fork_callbacks(struct task_struct *p) {}
> static inline void cgroup_post_fork(struct task_struct *p) {}
> -static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
> +static inline void cgroup_exit(struct task_struct *p) {}
>
> static inline void cgroup_lock(void) {}
> static inline void cgroup_unlock(void) {}
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index c6877fe..bdf874b 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -4496,31 +4496,6 @@ void cgroup_fork(struct task_struct *child)
> }
>
> /**
> - * cgroup_fork_callbacks - run fork callbacks
> - * @child: the new task
> - *
> - * Called on a new task very soon before adding it to the
> - * tasklist. No need to take any locks since no-one can
> - * be operating on this task.
> - */
> -void cgroup_fork_callbacks(struct task_struct *child)
> -{
> - if (need_forkexit_callback) {
> - int i;
> - /*
> - * forkexit callbacks are only supported for builtin
> - * subsystems, and the builtin section of the subsys array is
> - * immutable, so we don't need to lock the subsys array here.
> - */
> - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
> - struct cgroup_subsys *ss = subsys[i];
> - if (ss->fork)
> - ss->fork(child);
> - }
> - }
> -}
> -
> -/**
> * cgroup_post_fork - called on a new task after adding it to the task list
> * @child: the task in question
> *
> @@ -4559,11 +4534,24 @@ void cgroup_post_fork(struct task_struct *child)
> }
> write_unlock(&css_set_lock);
> }
> +
> + if (need_forkexit_callback) {
> + int i;
> + /*
> + * forkexit callbacks are only supported for builtin
> + * subsystems, and the builtin section of the subsys array is
> + * immutable, so we don't need to lock the subsys array here.
> + */
> + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
> + struct cgroup_subsys *ss = subsys[i];
> + if (ss->fork)
> + ss->fork(child);
> + }
> + }
> }
> /**
> * cgroup_exit - detach cgroup from exiting task
> * @tsk: pointer to task_struct of exiting process
> - * @run_callback: run exit callbacks?
> *
> * Description: Detach cgroup from @tsk and release it.
> *
> @@ -4595,7 +4583,7 @@ void cgroup_post_fork(struct task_struct *child)
> * which wards off any cgroup_attach_task() attempts, or task is a failed
> * fork, never visible to cgroup_attach_task.
> */
> -void cgroup_exit(struct task_struct *tsk, int run_callbacks)
> +void cgroup_exit(struct task_struct *tsk)
> {
> struct css_set *cg;
> int i;
> @@ -4617,7 +4605,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
> cg = tsk->cgroups;
> tsk->cgroups = &init_css_set;
>
> - if (run_callbacks && need_forkexit_callback) {
> + if (need_forkexit_callback) {
> /*
> * modular subsystems can't use callbacks, so no need to lock
> * the subsys array
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 294b170..d975233 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -990,7 +990,7 @@ void do_exit(long code)
> */
> perf_event_exit_task(tsk);
>
> - cgroup_exit(tsk, 1);
> + cgroup_exit(tsk);
>
> if (group_dead)
> disassociate_ctty(1);
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 051f090..d016fe9 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1053,7 +1053,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
> {
> int retval;
> struct task_struct *p;
> - int cgroup_callbacks_done = 0;
>
> if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
> return ERR_PTR(-EINVAL);
> @@ -1305,12 +1304,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
> p->group_leader = p;
> INIT_LIST_HEAD(&p->thread_group);
>
> - /* Now that the task is set up, run cgroup callbacks if
> - * necessary. We need to run them before the task is visible
> - * on the tasklist. */
> - cgroup_fork_callbacks(p);
> - cgroup_callbacks_done = 1;
> -
> /* Need tasklist lock for parent etc handling! */
> write_lock_irq(&tasklist_lock);
>
> @@ -1413,7 +1406,6 @@ bad_fork_cleanup_cgroup:
> #endif
> if (clone_flags & CLONE_THREAD)
> threadgroup_change_end(current);
> - cgroup_exit(p, cgroup_callbacks_done);

Ah, we still need the cgroup_exit() to put the css_set.
Will fix.

> delayacct_tsk_free(p);
> module_put(task_thread_info(p)->exec_domain->module);
> bad_fork_cleanup_count:
> --
> 1.7.5.4
>

2012-02-27 17:03:00

by Frederic Weisbecker

[permalink] [raw]

Subject: [RFC][PATCH v2] cgroups: Run subsystem fork callback from cgroup_post_fork()

2012-02-29 15:55:16

by Mandeep Singh Baines

[permalink] [raw]

Subject: Re: [RFC][PATCH v2] cgroups: Run subsystem fork callback from cgroup_post_fork()

Frederic Weisbecker ([email protected]) wrote:
> When a user freezes a cgroup, the freezer sets the subsystem state
> to CGROUP_FREEZING and then iterates over the tasks in the cgroup links.
>
> But there is a possible race here, although unlikely, if a task
> forks and the parent is preempted between write_unlock(tasklist_lock)
> and cgroup_post_fork(). If we freeze the cgroup while the parent

So what if you moved cgroup_post_fork() a few lines up to be
inside the tasklist_lock?

I agree with you on the race and believe your solution is correct.

> is sleeping and the parent wakes up thereafter, its child will
> be missing from the set of tasks to freeze because:
>
> - The child was not yet linked to its css_set->tasks, as is done
> from cgroup_post_fork(). cgroup_iter_start() has thus missed it.
>
> - The cgroup freezer's fork callback can handle that child but
> cgroup_fork_callbacks() has been called already.
>
> One way to fix this is to call the fork callbacks after we link
> the task to the css set. The cgroup freezer is the only user of
> this callback anyway.
>
> v2: Keep the call to cgroup_exit to put the css_set on fork error.
>
> Signed-off-by: Frederic Weisbecker <[email protected]>
> Cc: Li Zefan <[email protected]>
> Cc: Tejun Heo <[email protected]>
> Cc: Oleg Nesterov <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: Mandeep Singh Baines <[email protected]>
> ---
>
> Not sure this is the right solution, especially as I still need
> a cancellable fork callback for my task counter and for this I
> need the fork callbacks to be called before the task is added
> on the tasklist. But anyway at least that reports this race.
>

I'm new to the task counter stuff. Would you mind providing a
reference?

Regards,
Mandeep

> kernel/cgroup.c | 39 ++++++++++++++-------------------------
> kernel/fork.c | 9 +--------
> 2 files changed, 15 insertions(+), 33 deletions(-)
>
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index c6877fe..de21e52 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -4496,31 +4496,6 @@ void cgroup_fork(struct task_struct *child)
> }
>
> /**
> - * cgroup_fork_callbacks - run fork callbacks
> - * @child: the new task
> - *
> - * Called on a new task very soon before adding it to the
> - * tasklist. No need to take any locks since no-one can
> - * be operating on this task.
> - */
> -void cgroup_fork_callbacks(struct task_struct *child)
> -{
> - if (need_forkexit_callback) {
> - int i;
> - /*
> - * forkexit callbacks are only supported for builtin
> - * subsystems, and the builtin section of the subsys array is
> - * immutable, so we don't need to lock the subsys array here.
> - */
> - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
> - struct cgroup_subsys *ss = subsys[i];
> - if (ss->fork)
> - ss->fork(child);
> - }
> - }
> -}
> -
> -/**
> * cgroup_post_fork - called on a new task after adding it to the task list
> * @child: the task in question
> *
> @@ -4559,6 +4534,20 @@ void cgroup_post_fork(struct task_struct *child)
> }
> write_unlock(&css_set_lock);
> }
> +
> + if (need_forkexit_callback) {
> + int i;
> + /*
> + * forkexit callbacks are only supported for builtin
> + * subsystems, and the builtin section of the subsys array is
> + * immutable, so we don't need to lock the subsys array here.
> + */
> + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
> + struct cgroup_subsys *ss = subsys[i];
> + if (ss->fork)
> + ss->fork(child);
> + }
> + }
> }
> /**
> * cgroup_exit - detach cgroup from exiting task
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 051f090..551cfe0 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1053,7 +1053,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
> {
> int retval;
> struct task_struct *p;
> - int cgroup_callbacks_done = 0;
>
> if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
> return ERR_PTR(-EINVAL);
> @@ -1305,12 +1304,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
> p->group_leader = p;
> INIT_LIST_HEAD(&p->thread_group);
>
> - /* Now that the task is set up, run cgroup callbacks if
> - * necessary. We need to run them before the task is visible
> - * on the tasklist. */
> - cgroup_fork_callbacks(p);
> - cgroup_callbacks_done = 1;
> -
> /* Need tasklist lock for parent etc handling! */
> write_lock_irq(&tasklist_lock);
>
> @@ -1413,7 +1406,7 @@ bad_fork_cleanup_cgroup:
> #endif
> if (clone_flags & CLONE_THREAD)
> threadgroup_change_end(current);
> - cgroup_exit(p, cgroup_callbacks_done);
> + cgroup_exit(p, 0);
> delayacct_tsk_free(p);
> module_put(task_thread_info(p)->exec_domain->module);
> bad_fork_cleanup_count:
> --
> 1.7.5.4
>

2012-02-29 16:22:16

by Frederic Weisbecker

[permalink] [raw]

Subject: Re: [RFC][PATCH v2] cgroups: Run subsystem fork callback from cgroup_post_fork()

On Wed, Feb 29, 2012 at 07:55:00AM -0800, Mandeep Singh Baines wrote:
> Frederic Weisbecker ([email protected]) wrote:
> > When a user freezes a cgroup, the freezer sets the subsystem state
> > to CGROUP_FREEZING and then iterates over the tasks in the cgroup links.
> >
> > But there is a possible race here, although unlikely, if a task
> > forks and the parent is preempted between write_unlock(tasklist_lock)
> > and cgroup_post_fork(). If we freeze the cgroup while the parent
>
> So what if you moved cgroup_post_fork() a few lines up to be
> inside the tasklist_lock?

It won't work. Consider this scenario:

CPU 0                                   CPU 1

                                        cgroup_fork_callbacks()
                                        write_lock(tasklist_lock)
try_to_freeze_cgroup() {                add child to task list etc...
        cgroup_iter_start()
        freeze tasks
        cgroup_iter_end()
}                                       cgroup_post_fork()
                                        write_unlock(tasklist_lock)

If this is not the first time we call cgroup_iter_start(), we won't go
through the whole tasklist, we simply iterate through the css set task links.

Plus we try to avoid anything under tasklist_lock when possible.

>
> I agree with you on the race and believe your solution is correct.
>
> > is sleeping and the parent wakes up thereafter, its child will
> > be missing from the set of tasks to freeze because:
> >
> > - The child was not yet linked to its css_set->tasks, as is done
> > from cgroup_post_fork(). cgroup_iter_start() has thus missed it.
> >
> > - The cgroup freezer's fork callback can handle that child but
> > cgroup_fork_callbacks() has been called already.
> >
> > One way to fix this is to call the fork callbacks after we link
> > the task to the css set. The cgroup freezer is the only user of
> > this callback anyway.
> >
> > v2: Keep the call to cgroup_exit to put the css_set on fork error.
> >
> > Signed-off-by: Frederic Weisbecker <[email protected]>
> > Cc: Li Zefan <[email protected]>
> > Cc: Tejun Heo <[email protected]>
> > Cc: Oleg Nesterov <[email protected]>
> > Cc: Andrew Morton <[email protected]>
> > Cc: Mandeep Singh Baines <[email protected]>
> > ---
> >
> > Not sure this is the right solution, especially as I still need
> > a cancellable fork callback for my task counter and for this I
> > need the fork callbacks to be called before the task is added
> > on the tasklist. But anyway at least that reports this race.
> >
>
> I'm new to the task counter stuff. Would you mind providing a
> reference.

Sure, have a look at this:

https://lkml.org/lkml/2012/1/31/489

Especially this patch:
https://lkml.org/lkml/2012/1/31/495

And this one that implements a fork callback:
https://lkml.org/lkml/2012/1/31/497

The fork callback may return an error to cancel the fork. But doing
this at cgroup_post_fork() time is too late.

2012-03-01 03:17:53

by Li Zefan

[permalink] [raw]

Subject: Re: [RFC][PATCH v2] cgroups: Run subsystem fork callback from cgroup_post_fork()

于 2012年03月01日 00:21, Frederic Weisbecker 写道:
> On Wed, Feb 29, 2012 at 07:55:00AM -0800, Mandeep Singh Baines wrote:
>> Frederic Weisbecker ([email protected]) wrote:
>>> When a user freezes a cgroup, the freezer sets the subsystem state
>>> to CGROUP_FREEZING and then iterates over the tasks in the cgroup links.
>>>
>>> But there is a possible race here, although unlikely, if a task
>>> forks and the parent is preempted between write_unlock(tasklist_lock)
>>> and cgroup_post_fork(). If we freeze the cgroup while the parent
>>
>> So what if you moved cgroup_post_fork() a few lines up to be
>> inside the tasklist_lock?
>
> It won't work. Consider this scenario:
>
> CPU 0                                   CPU 1
>
>                                         cgroup_fork_callbacks()
>                                         write_lock(tasklist_lock)
> try_to_freeze_cgroup() {                add child to task list etc...
>         cgroup_iter_start()
>         freeze tasks
>         cgroup_iter_end()
> }                                       cgroup_post_fork()
>                                         write_unlock(tasklist_lock)
>
> If this is not the first time we call cgroup_iter_start(), we won't go
> through the whole tasklist, we simply iterate through the css set task links.
>
> Plus we try to avoid anything under tasklist_lock when possible.
>

Your patch won't close the race I'm afraid.

                                        // state will be set to FREEZING
                                        echo FROZEN > /cgroup/sub/freezer.state
write_lock(tasklist_lock)
add child to task list ...
write_unlock(tasklist_lock)
                                        // state will be updated to FROZEN
                                        cat /cgroup/sub/freezer.state
cgroup_post_fork()
->freezer_fork()

freezer_fork() will freeze the task only if the cgroup is in FREEZING
state, and will BUG if the state is FROZEN.

We can fix freezer_fork(), but it seems that would require holding
cgroup_mutex in that function, which we don't like at all. Not to mention
your task_counter stuff...

At this moment I don't see a solution without tasklist_lock involved,
any better idea?

(I just realized the patch below introduces a tasklist_lock <-> freezer->lock
ABBA deadlock, so it's bad to screw up with tasklist lock)

diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index fc0646b..74527ac 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -278,6 +278,12 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
struct task_struct *task;
unsigned int num_cant_freeze_now = 0;

+ /*
+ * With this lock held and the check in freezer_fork(), a
+ * half-forked task has no chance to escape from freezing.
+ */
+ read_lock(&tasklist_lock);
+
cgroup_iter_start(cgroup, &it);
while ((task = cgroup_iter_next(cgroup, &it))) {
if (!freeze_task(task))
@@ -289,6 +295,8 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
}
cgroup_iter_end(cgroup, &it);

+ read_unlock(&tasklist_lock);
+
return num_cant_freeze_now ? -EBUSY : 0;
}

diff --git a/kernel/fork.c b/kernel/fork.c
index e2cd3e2..2450720 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1328,15 +1328,15 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->group_leader = p;
INIT_LIST_HEAD(&p->thread_group);

+ /* Need tasklist lock for parent etc handling! */
+ write_lock_irq(&tasklist_lock);
+
/* Now that the task is set up, run cgroup callbacks if
* necessary. We need to run them before the task is visible
* on the tasklist. */
cgroup_fork_callbacks(p);
cgroup_callbacks_done = 1;

- /* Need tasklist lock for parent etc handling! */
- write_lock_irq(&tasklist_lock);
-
/* CLONE_PARENT re-uses the old parent */
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
p->real_parent = current->real_parent;
@@ -1393,9 +1393,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,

total_forks++;
spin_unlock(&current->sighand->siglock);
+ cgroup_post_fork(p);
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
- cgroup_post_fork(p);
if (clone_flags & CLONE_THREAD)
threadgroup_change_end(current);
perf_event_fork(p);