with CONFIG_PROVE_RCU=y, a warning can be triggered:
# mount -t cgroup -o debug xxx /mnt
# cat /proc/$$/cgroup
...
kernel/cgroup.c:1649 invoked rcu_dereference_check() without protection!
...
This is a false-positive, because cgroup_path() can be called
with either rcu_read_lock() held or cgroup_mutex held.
Signed-off-by: Li Zefan <[email protected]>
---
kernel/cgroup.c | 12 +++++++++---
1 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e2769e1..4ca928d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1646,7 +1646,9 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
{
char *start;
- struct dentry *dentry = rcu_dereference(cgrp->dentry);
+ struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
+ rcu_read_lock_held() ||
+ cgroup_lock_is_held());
if (!dentry || cgrp == dummytop) {
/*
@@ -1662,13 +1664,17 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
*--start = '\0';
for (;;) {
int len = dentry->d_name.len;
+
if ((start -= len) < buf)
return -ENAMETOOLONG;
- memcpy(start, cgrp->dentry->d_name.name, len);
+ memcpy(start, dentry->d_name.name, len);
cgrp = cgrp->parent;
if (!cgrp)
break;
- dentry = rcu_dereference(cgrp->dentry);
+
+ dentry = rcu_dereference_check(cgrp->dentry,
+ rcu_read_lock_held() ||
+ cgroup_lock_is_held());
if (!cgrp->parent)
continue;
if (--start < buf)
--
1.6.3
With CONFIG_PROVE_RCU=y, a warning can be triggered:
# mount -t cgroup -o memory xxx /mnt
# mkdir /mnt/0
...
kernel/cgroup.c:4442 invoked rcu_dereference_check() without protection!
...
This is a false-positive. It's safe to directly access parent_css->id.
Signed-off-by: Li Zefan <[email protected]>
---
kernel/cgroup.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4ca928d..3a53c77 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4561,13 +4561,13 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
{
int subsys_id, i, depth = 0;
struct cgroup_subsys_state *parent_css, *child_css;
- struct css_id *child_id, *parent_id = NULL;
+ struct css_id *child_id, *parent_id;
subsys_id = ss->subsys_id;
parent_css = parent->subsys[subsys_id];
child_css = child->subsys[subsys_id];
- depth = css_depth(parent_css) + 1;
parent_id = parent_css->id;
+ depth = parent_id->depth;
child_id = get_new_cssid(ss, depth);
if (IS_ERR(child_id))
--
1.6.3
With CONFIG_PROVE_RCU=y, a warning can be triggered:
$ cat /proc/sched_debug
...
kernel/cgroup.c:1649 invoked rcu_dereference_check() without protection!
...
Both cgroup_path() and task_group() should be called with either
rcu_read_lock or cgroup_mutex held.
Signed-off-by: Li Zefan <[email protected]>
---
kernel/sched_debug.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 9cf1baf..87a330a 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -114,7 +114,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
{
char path[64];
+ rcu_read_lock();
cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
+ rcu_read_unlock();
SEQ_printf(m, " %s", path);
}
#endif
--
1.6.3
with CONFIG_PROVE_RCU, a warning can be triggered when we
resume from suspend:
...
include/linux/cgroup.h:533 invoked rcu_dereference_check() without protection!
...
task_freezer() calls task_subsys_state(), which needs to be
protected by rcu_read_lock or cgroup_mutex.
Signed-off-by: Li Zefan <[email protected]>
---
kernel/cgroup_freezer.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 5038f4c..ac76983 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -53,6 +53,7 @@ int cgroup_freezing_or_frozen(struct task_struct *task)
struct freezer *freezer;
enum freezer_state state;
+ rcu_read_lock();
task_lock(task);
freezer = task_freezer(task);
if (!freezer->css.cgroup->parent)
@@ -60,6 +61,7 @@ int cgroup_freezing_or_frozen(struct task_struct *task)
else
state = freezer->state;
task_unlock(task);
+ rcu_read_unlock();
return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
}
--
1.6.3
with CONFIG_PROVE_RCU=y, a warning can be triggered:
# mount -t cgroup -o blkio xxx /mnt
# mkdir /mnt/subgroup
...
kernel/cgroup.c:4442 invoked rcu_dereference_check() without protection!
...
To fix this, we avoid caling css_depth() here, which is a bit simpler
than the original code.
Signed-off-by: Li Zefan <[email protected]>
---
block/blk-cgroup.c | 8 ++++----
1 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 5fe03de..2cc682b 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -286,16 +286,16 @@ done:
static struct cgroup_subsys_state *
blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
- struct blkio_cgroup *blkcg, *parent_blkcg;
+ struct blkio_cgroup *blkcg;
+ struct cgroup *parent = cgroup->parent;
- if (!cgroup->parent) {
+ if (!parent) {
blkcg = &blkio_root_cgroup;
goto done;
}
/* Currently we do not support hierarchy deeper than two level (0,1) */
- parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
- if (css_depth(&parent_blkcg->css) > 0)
+ if (parent != cgroup->top_cgroup)
return ERR_PTR(-EINVAL);
blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
--
1.6.3
On Thu, 2010-04-22 at 17:30 +0800, Li Zefan wrote:
> With CONFIG_PROVE_RCU=y, a warning can be triggered:
>
> $ cat /proc/sched_debug
>
> ...
> kernel/cgroup.c:1649 invoked rcu_dereference_check() without protection!
> ...
>
> Both cgroup_path() and task_group() should be called with either
> rcu_read_lock or cgroup_mutex held.
Well, that's not strictly true, but yes in this case it appears to be a
genuine race, since only tasklist_lock is held and that doesn't protect
us from the task changing groups (and thus the current group from going
away on us).
You can also pin a cgroup by holding whatever locks are held in the
->attach method. But the RCU annotation doesn't know (nor reasonably can
know about that).
> Signed-off-by: Li Zefan <[email protected]>
> ---
> kernel/sched_debug.c | 2 ++
> 1 files changed, 2 insertions(+), 0 deletions(-)
>
> diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
> index 9cf1baf..87a330a 100644
> --- a/kernel/sched_debug.c
> +++ b/kernel/sched_debug.c
> @@ -114,7 +114,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
> {
> char path[64];
>
> + rcu_read_lock();
> cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
> + rcu_read_unlock();
> SEQ_printf(m, " %s", path);
> }
> #endif
On Thu, 2010-04-22 at 17:31 +0800, Li Zefan wrote:
> with CONFIG_PROVE_RCU, a warning can be triggered when we
> resume from suspend:
>
> ...
> include/linux/cgroup.h:533 invoked rcu_dereference_check() without protection!
> ...
>
> task_freezer() calls task_subsys_state(), which needs to be
> protected by rcu_read_lock or cgroup_mutex.
>
> Signed-off-by: Li Zefan <[email protected]>
> ---
> kernel/cgroup_freezer.c | 2 ++
> 1 files changed, 2 insertions(+), 0 deletions(-)
>
> diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
> index 5038f4c..ac76983 100644
> --- a/kernel/cgroup_freezer.c
> +++ b/kernel/cgroup_freezer.c
> @@ -53,6 +53,7 @@ int cgroup_freezing_or_frozen(struct task_struct *task)
> struct freezer *freezer;
> enum freezer_state state;
>
> + rcu_read_lock();
> task_lock(task);
> freezer = task_freezer(task);
> if (!freezer->css.cgroup->parent)
> @@ -60,6 +61,7 @@ int cgroup_freezing_or_frozen(struct task_struct *task)
> else
> state = freezer->state;
> task_unlock(task);
> + rcu_read_unlock();
>
> return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
> }
Hmm cgroup_attach_task() does hold task_lock() over setting
tsk->cgroups, so doesn't that also pin the task to the cgroup and thus
the cgroup itself?
On Thu, Apr 22, 2010 at 05:32:28PM +0800, Li Zefan wrote:
> with CONFIG_PROVE_RCU=y, a warning can be triggered:
>
> # mount -t cgroup -o blkio xxx /mnt
> # mkdir /mnt/subgroup
>
> ...
> kernel/cgroup.c:4442 invoked rcu_dereference_check() without protection!
> ...
>
IIUC, so blkiocg_create() is being called with cgroup_mutex held and not
with rcu read lock held. Hence rcu_dereference() in css_depth() gives
warning.
So one easy solution is to don't use css_depth() at all. In this case
simple check like cgroup->top_cgroup should suffice. Makese sense to
me.
Acked-by: Vivek Goyal <[email protected]>
Thanks
Vivek
> To fix this, we avoid caling css_depth() here, which is a bit simpler
> than the original code.
>
> Signed-off-by: Li Zefan <[email protected]>
> ---
> block/blk-cgroup.c | 8 ++++----
> 1 files changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index 5fe03de..2cc682b 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -286,16 +286,16 @@ done:
> static struct cgroup_subsys_state *
> blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
> {
> - struct blkio_cgroup *blkcg, *parent_blkcg;
> + struct blkio_cgroup *blkcg;
> + struct cgroup *parent = cgroup->parent;
>
> - if (!cgroup->parent) {
> + if (!parent) {
> blkcg = &blkio_root_cgroup;
> goto done;
> }
>
> /* Currently we do not support hierarchy deeper than two level (0,1) */
> - parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
> - if (css_depth(&parent_blkcg->css) > 0)
> + if (parent != cgroup->top_cgroup)
> return ERR_PTR(-EINVAL);
>
> blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
> --
> 1.6.3
On Thu, Apr 22, 2010 at 05:30:40PM +0800, Li Zefan wrote:
> With CONFIG_PROVE_RCU=y, a warning can be triggered:
>
> $ cat /proc/sched_debug
>
> ...
> kernel/cgroup.c:1649 invoked rcu_dereference_check() without protection!
> ...
>
> Both cgroup_path() and task_group() should be called with either
> rcu_read_lock or cgroup_mutex held.
Queued for 2.6.34, thank you!
Thanx, Paul
> Signed-off-by: Li Zefan <[email protected]>
> ---
> kernel/sched_debug.c | 2 ++
> 1 files changed, 2 insertions(+), 0 deletions(-)
>
> diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
> index 9cf1baf..87a330a 100644
> --- a/kernel/sched_debug.c
> +++ b/kernel/sched_debug.c
> @@ -114,7 +114,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
> {
> char path[64];
>
> + rcu_read_lock();
> cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
> + rcu_read_unlock();
> SEQ_printf(m, " %s", path);
> }
> #endif
> --
> 1.6.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
On Thu, Apr 22, 2010 at 05:30:00PM +0800, Li Zefan wrote:
> With CONFIG_PROVE_RCU=y, a warning can be triggered:
>
> # mount -t cgroup -o memory xxx /mnt
> # mkdir /mnt/0
>
> ...
> kernel/cgroup.c:4442 invoked rcu_dereference_check() without protection!
> ...
>
> This is a false-positive. It's safe to directly access parent_css->id.
Also queued for 2.6.34, thank you!
Thanx, Paul
> Signed-off-by: Li Zefan <[email protected]>
> ---
> kernel/cgroup.c | 4 ++--
> 1 files changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 4ca928d..3a53c77 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -4561,13 +4561,13 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
> {
> int subsys_id, i, depth = 0;
> struct cgroup_subsys_state *parent_css, *child_css;
> - struct css_id *child_id, *parent_id = NULL;
> + struct css_id *child_id, *parent_id;
>
> subsys_id = ss->subsys_id;
> parent_css = parent->subsys[subsys_id];
> child_css = child->subsys[subsys_id];
> - depth = css_depth(parent_css) + 1;
> parent_id = parent_css->id;
> + depth = parent_id->depth;
>
> child_id = get_new_cssid(ss, depth);
> if (IS_ERR(child_id))
> --
> 1.6.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
On Thu, Apr 22, 2010 at 05:29:24PM +0800, Li Zefan wrote:
> with CONFIG_PROVE_RCU=y, a warning can be triggered:
>
> # mount -t cgroup -o debug xxx /mnt
> # cat /proc/$$/cgroup
>
> ...
> kernel/cgroup.c:1649 invoked rcu_dereference_check() without protection!
> ...
>
> This is a false-positive, because cgroup_path() can be called
> with either rcu_read_lock() held or cgroup_mutex held.
Queued for 2.6.34, thank you Li!
Thanx, Paul
> Signed-off-by: Li Zefan <[email protected]>
> ---
> kernel/cgroup.c | 12 +++++++++---
> 1 files changed, 9 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index e2769e1..4ca928d 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -1646,7 +1646,9 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
> int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
> {
> char *start;
> - struct dentry *dentry = rcu_dereference(cgrp->dentry);
> + struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
> + rcu_read_lock_held() ||
> + cgroup_lock_is_held());
>
> if (!dentry || cgrp == dummytop) {
> /*
> @@ -1662,13 +1664,17 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
> *--start = '\0';
> for (;;) {
> int len = dentry->d_name.len;
> +
> if ((start -= len) < buf)
> return -ENAMETOOLONG;
> - memcpy(start, cgrp->dentry->d_name.name, len);
> + memcpy(start, dentry->d_name.name, len);
> cgrp = cgrp->parent;
> if (!cgrp)
> break;
> - dentry = rcu_dereference(cgrp->dentry);
> +
> + dentry = rcu_dereference_check(cgrp->dentry,
> + rcu_read_lock_held() ||
> + cgroup_lock_is_held());
> if (!cgrp->parent)
> continue;
> if (--start < buf)
> --
> 1.6.3
>
On Thu, Apr 22, 2010 at 05:32:28PM +0800, Li Zefan wrote:
> with CONFIG_PROVE_RCU=y, a warning can be triggered:
>
> # mount -t cgroup -o blkio xxx /mnt
> # mkdir /mnt/subgroup
>
> ...
> kernel/cgroup.c:4442 invoked rcu_dereference_check() without protection!
> ...
>
> To fix this, we avoid caling css_depth() here, which is a bit simpler
> than the original code.
Queued for 2.6.34, thank you!
Thanx, Paul
> Signed-off-by: Li Zefan <[email protected]>
> ---
> block/blk-cgroup.c | 8 ++++----
> 1 files changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index 5fe03de..2cc682b 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -286,16 +286,16 @@ done:
> static struct cgroup_subsys_state *
> blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
> {
> - struct blkio_cgroup *blkcg, *parent_blkcg;
> + struct blkio_cgroup *blkcg;
> + struct cgroup *parent = cgroup->parent;
>
> - if (!cgroup->parent) {
> + if (!parent) {
> blkcg = &blkio_root_cgroup;
> goto done;
> }
>
> /* Currently we do not support hierarchy deeper than two level (0,1) */
> - parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
> - if (css_depth(&parent_blkcg->css) > 0)
> + if (parent != cgroup->top_cgroup)
> return ERR_PTR(-EINVAL);
>
> blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
> --
> 1.6.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
On Thu, 2010-04-22 at 12:59 -0700, Paul E. McKenney wrote:
> On Thu, Apr 22, 2010 at 02:27:55PM +0200, Peter Zijlstra wrote:
> > On Thu, 2010-04-22 at 17:31 +0800, Li Zefan wrote:
> > > with CONFIG_PROVE_RCU, a warning can be triggered when we
> > > resume from suspend:
> > >
> > > ...
> > > include/linux/cgroup.h:533 invoked rcu_dereference_check() without protection!
> > > ...
> > >
> > > task_freezer() calls task_subsys_state(), which needs to be
> > > protected by rcu_read_lock or cgroup_mutex.
> > >
> > > Signed-off-by: Li Zefan <[email protected]>
> > > ---
> > > kernel/cgroup_freezer.c | 2 ++
> > > 1 files changed, 2 insertions(+), 0 deletions(-)
> > >
> > > diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
> > > index 5038f4c..ac76983 100644
> > > --- a/kernel/cgroup_freezer.c
> > > +++ b/kernel/cgroup_freezer.c
> > > @@ -53,6 +53,7 @@ int cgroup_freezing_or_frozen(struct task_struct *task)
> > > struct freezer *freezer;
> > > enum freezer_state state;
> > >
> > > + rcu_read_lock();
> > > task_lock(task);
> > > freezer = task_freezer(task);
> > > if (!freezer->css.cgroup->parent)
> > > @@ -60,6 +61,7 @@ int cgroup_freezing_or_frozen(struct task_struct *task)
> > > else
> > > state = freezer->state;
> > > task_unlock(task);
> > > + rcu_read_unlock();
> > >
> > > return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
> > > }
> >
> > Hmm cgroup_attach_task() does hold task_lock() over setting
> > tsk->cgroups, so doesn't that also pin the task to the cgroup and thus
> > the cgroup itself?
>
> So you are advocating for the rcu_dereference check including the
> task lock, correct?
I think that might be correct yes, although I would prefer confirmation
from someone who actually knows kernel/cgroup.c ;-)
On Thu, Apr 22, 2010 at 02:27:55PM +0200, Peter Zijlstra wrote:
> On Thu, 2010-04-22 at 17:31 +0800, Li Zefan wrote:
> > with CONFIG_PROVE_RCU, a warning can be triggered when we
> > resume from suspend:
> >
> > ...
> > include/linux/cgroup.h:533 invoked rcu_dereference_check() without protection!
> > ...
> >
> > task_freezer() calls task_subsys_state(), which needs to be
> > protected by rcu_read_lock or cgroup_mutex.
> >
> > Signed-off-by: Li Zefan <[email protected]>
> > ---
> > kernel/cgroup_freezer.c | 2 ++
> > 1 files changed, 2 insertions(+), 0 deletions(-)
> >
> > diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
> > index 5038f4c..ac76983 100644
> > --- a/kernel/cgroup_freezer.c
> > +++ b/kernel/cgroup_freezer.c
> > @@ -53,6 +53,7 @@ int cgroup_freezing_or_frozen(struct task_struct *task)
> > struct freezer *freezer;
> > enum freezer_state state;
> >
> > + rcu_read_lock();
> > task_lock(task);
> > freezer = task_freezer(task);
> > if (!freezer->css.cgroup->parent)
> > @@ -60,6 +61,7 @@ int cgroup_freezing_or_frozen(struct task_struct *task)
> > else
> > state = freezer->state;
> > task_unlock(task);
> > + rcu_read_unlock();
> >
> > return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
> > }
>
> Hmm cgroup_attach_task() does hold task_lock() over setting
> tsk->cgroups, so doesn't that also pin the task to the cgroup and thus
> the cgroup itself?
So you are advocating for the rcu_dereference check including the
task lock, correct?
Thanx, Paul
On Thu, Apr 22, 2010 at 12:20:04PM +0200, Peter Zijlstra wrote:
<snip>
> You can also pin a cgroup by holding whatever locks are held in the
> ->attach method. But the RCU annotation doesn't know (nor reasonably can
> know about that).
For my future reference, what's the right way to "fix" these
situations with the RCU annotations? (I think your response re: Li's
freezer patch was correct and illustrates the problem I'm referring to.)
Cheers,
-Matt Helsley
On Thu, Apr 22, 2010 at 02:12:12PM -0700, Matt Helsley wrote:
> On Thu, Apr 22, 2010 at 12:20:04PM +0200, Peter Zijlstra wrote:
>
> <snip>
>
> > You can also pin a cgroup by holding whatever locks are held in the
> > ->attach method. But the RCU annotation doesn't know (nor reasonably can
> > know about that).
>
> For my future reference, what's the right way to "fix" these
> situations with the RCU annotations? (I think your response re: Li's
> freezer patch was correct and illustrates the problem I'm referring to.)
There are several variants of rcu_dereference() for this purpose:
o rcu_dereference() for code that is always called within an
RCU read-side critical section -- rcu_read_lock().
o rcu_dereference_bh() for code that is always called within
an RCU-bh read-side critical section -- rcu_read_lock_bh().
o rcu_dereference_sched() for code that is always called within
an RCU-sched read-side critical section -- either
rcu_read_lock_sched() or any other primitive that disables
preemption.
o srcu_dereference() for code that is always called within an
SRCU read-side critical section -- srcu_read_lock().
o rcu_dereference_check() for code that might be called either
in a read-side critical section or on the update side. This
interface takes a second argument, which is normally a logical
conjunction of rcu_read_lock_held(), rcu_read_lock_sched_held(),
rcu_read_lock_bh_held(), and srcu_read_lock_held for the read
side, and lockdep_is_held() for the update side.
o rcu_dereference_raw(p) is rcu_dereference_check(p, 1). Expect
to be asked hard questions if you include this in your patch.
One legitimate use is for initialization and cleanup code
sequences where the current CPU is the only one with access
to the RCU-protected data structure in question.
o rcu_dereference_protected() for code that is only called from
the update side. This interface takes a second argument,
which would normally only have a logical conjunction of
lockdep_is_held() invocations. Don't even think about putting
rcu_read_lock_held() and friends here!!!
o rcu_access_pointer() for code that only tests the value of
the RCU-protected pointer, for example, against NULL.
There you have it!
Thanx, Paul
On Fri, 2010-04-23 at 09:05 +0800, Li Zefan wrote:
>
>
> You are right in that taking task_lock() is sufficient (I forgot
> this lock rule), but it's not true that whatever locks are held
> in the ->attach method can pin a task's cgroup.
Ah, can you be more specific about the ->attach() case?
The way I read it, cgroup_attach_task():
for_each_subsys(root, ss) {
if (ss->attach)
ss->attach(ss, cgrp, oldcgrp, tsk, false);
}
set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
synchronize_rcu();
put_css_set(cg);
So if you hold a lock that any of those ->attach() methods will use, it
will in fact delay the put_css_set().
Ah, indeed I see your point, it doesn't indeed pin the task to the
cgroup, but does avoid the cgroup from being freed.
Hrmm,.. so anything wanting to really pin a task to its cgroup will have
to use task_lock()? I'll have to see if that works for
sched_setscheduler().
>>>> with CONFIG_PROVE_RCU, a warning can be triggered when we
>>>> resume from suspend:
>>>>
>>>> ...
>>>> include/linux/cgroup.h:533 invoked rcu_dereference_check() without protection!
>>>> ...
>>>>
>>>> task_freezer() calls task_subsys_state(), which needs to be
>>>> protected by rcu_read_lock or cgroup_mutex.
>>>>
>>>> Signed-off-by: Li Zefan <[email protected]>
>>>> ---
>>>> kernel/cgroup_freezer.c | 2 ++
>>>> 1 files changed, 2 insertions(+), 0 deletions(-)
>>>>
>>>> diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
>>>> index 5038f4c..ac76983 100644
>>>> --- a/kernel/cgroup_freezer.c
>>>> +++ b/kernel/cgroup_freezer.c
>>>> @@ -53,6 +53,7 @@ int cgroup_freezing_or_frozen(struct task_struct *task)
>>>> struct freezer *freezer;
>>>> enum freezer_state state;
>>>>
>>>> + rcu_read_lock();
>>>> task_lock(task);
>>>> freezer = task_freezer(task);
>>>> if (!freezer->css.cgroup->parent)
>>>> @@ -60,6 +61,7 @@ int cgroup_freezing_or_frozen(struct task_struct *task)
>>>> else
>>>> state = freezer->state;
>>>> task_unlock(task);
>>>> + rcu_read_unlock();
>>>>
>>>> return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
>>>> }
>>> Hmm cgroup_attach_task() does hold task_lock() over setting
>>> tsk->cgroups, so doesn't that also pin the task to the cgroup and thus
>>> the cgroup itself?
>> So you are advocating for the rcu_dereference check including the
>> task lock, correct?
>
> I think that might be correct yes, although I would prefer confirmation
> from someone who actually knows kernel/cgroup.c ;-)
>
You are right in that taking task_lock() is sufficient (I forgot
this lock rule), but it's not true that whatever locks are held
in the ->attach method can pin a task's cgroup.
So the right fix is including task_lock in rcu_deref check in
task_subsys_state(). I'll send a new fix.