2008-12-09 11:03:20

by Kamezawa Hiroyuki

Subject: [RFC][PATCH 0/6] cgroup id and mix fixes (2008/12/09)


A misc patch set for memcg/cgroup including bug fix.

This set is against mm-of-the-moment snapshot 2008-12-08-16-36

[1/6] Documentation updates (thanks to Randy)
[2/6] fixing pre-destroy() (new implementation, Paul, please comment)
[3/6] cgroup id (removed codes around release_handler)
[4/6] mem_cgroup reclaim with scanning by ID (no big change.)
[5/6] fix inactive_ratio bug under hierarchy. (new one; very important, I think)
[6/6] fix oom-kill handler.

[2/6] and [3/6] adds codes to kernel/cgroup.c
[4/6] is for code sanity, removing cgroup_lock() from scanning.
[5/6] and [6/6] are bug fixes for the use_hierarchy=1 case.
If my fixes cannot go ahead, we'll have to find an alternative, anyway.

Thanks,
-Kame


2008-12-09 11:05:27

by Kamezawa Hiroyuki

Subject: [RFC][PATCH 1/6] memcg: Documentation for internal implementation


From: KAMEZAWA Hiroyuki <[email protected]>

Update/Fix document about implementation details of memcg.

Changelog:
- applied Randy Dunlap's comments.

Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
Documentation/controllers/memcg_test.txt | 240 ++++++++++++++++++++++++++-----
mm/memcontrol.c | 4
2 files changed, 207 insertions(+), 37 deletions(-)

Index: mmotm-2.6.28-Dec03/Documentation/controllers/memcg_test.txt
===================================================================
--- mmotm-2.6.28-Dec03.orig/Documentation/controllers/memcg_test.txt
+++ mmotm-2.6.28-Dec03/Documentation/controllers/memcg_test.txt
@@ -1,59 +1,76 @@
Memory Resource Controller(Memcg) Implementation Memo.
-Last Updated: 2009/12/03
+Last Updated: 2008/12/04
+Base Kernel Version: based on 2.6.28-rc7-mm.

Because VM is getting complex (one of reasons is memcg...), memcg's behavior
-is complex. This is a document for memcg's internal behavior and some test
-patterns tend to be racy.
+is complex. This is a document for memcg's internal behavior.
+Please note that implementation details can be changed.

-1. charges
+(*) Topics on API should be in Documentation/controllers/memory.txt
+
+0. How to record usage ?
+ 2 objects are used.
+
+ page_cgroup ....an object per page.
+ Allocated at boot or memory hotplug. Freed at memory hot removal.
+
+ swap_cgroup ... an entry per swp_entry.
+ Allocated at swapon(). Freed at swapoff().
+
+ The page_cgroup has USED bit and double count against a page_cgroup never
+ occurs. swap_cgroup is used only when a charged page is swapped-out.
+
+1. Charge

a page/swp_entry may be charged (usage += PAGE_SIZE) at

- mem_cgroup_newpage_newpage()
- called at new page fault and COW.
+ mem_cgroup_newpage_charge()
+ Called at new page fault and Copy-On-Write.

mem_cgroup_try_charge_swapin()
- called at do_swap_page() and swapoff.
- followed by charge-commit-cancel protocol.
- (With swap accounting) at commit, charges recorded in swap is removed.
+ Called at do_swap_page() (page fault on swap entry) and swapoff.
+ Followed by charge-commit-cancel protocol. (With swap accounting)
+ At commit, a charge recorded in swap_cgroup is removed.

mem_cgroup_cache_charge()
- called at add_to_page_cache()
+ Called at add_to_page_cache()

- mem_cgroup_cache_charge_swapin)()
- called by shmem's swapin processing.
+ mem_cgroup_cache_charge_swapin()
+ Called at shmem's swapin.

mem_cgroup_prepare_migration()
- called before migration. "extra" charge is done
- followed by charge-commit-cancel protocol.
+ Called before migration. "extra" charge is done and followed by
+ charge-commit-cancel protocol.
At commit, charge against oldpage or newpage will be committed.

-2. uncharge
+2. Uncharge
a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by

mem_cgroup_uncharge_page()
- called when an anonymous page is unmapped. If the page is SwapCache
- uncharge is delayed until mem_cgroup_uncharge_swapcache().
+ Called when an anonymous page is fully unmapped. I.e., mapcount goes
+ to 0. If the page is SwapCache, uncharge is delayed until
+ mem_cgroup_uncharge_swapcache().

mem_cgroup_uncharge_cache_page()
- called when a page-cache is deleted from radix-tree. If the page is
- SwapCache, uncharge is delayed until mem_cgroup_uncharge_swapcache()
+ Called when a page-cache is deleted from radix-tree. If the page is
+ SwapCache, uncharge is delayed until mem_cgroup_uncharge_swapcache().

mem_cgroup_uncharge_swapcache()
- called when SwapCache is removed from radix-tree. the charge itself
+ Called when SwapCache is removed from radix-tree. The charge itself
is moved to swap_cgroup. (If mem+swap controller is disabled, no
- charge to swap.)
+ charge to swap occurs.)

mem_cgroup_uncharge_swap()
- called when swp_entry's refcnt goes down to be 0. charge against swap
+ Called when swp_entry's refcnt goes down to 0. A charge against swap
disappears.

mem_cgroup_end_migration(old, new)
- at success of migration -> old is uncharged (if necessary), charge
- to new is committed. at failure, charge to old is committed.
+ At success of migration old is uncharged (if necessary), a charge
+ to new page is committed. At failure, charge to old page is committed.

3. charge-commit-cancel
- In some case, we can't know this "charge" is valid or not at charge.
+ In some case, we can't know this "charge" is valid or not at charging
+ (because of races).
To handle such case, there are charge-commit-cancel functions.
mem_cgroup_try_charge_XXX
mem_cgroup_commit_charge_XXX
@@ -68,24 +85,164 @@ patterns tend to be racy.

At cancel(), simply usage -= PAGE_SIZE.

-4. Typical Tests.
+Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
+
+4. Anonymous
+ Anonymous page is newly allocated at
+ - page fault into MAP_ANONYMOUS mapping.
+ - Copy-On-Write.
+ It is charged right after it's allocated before doing any page table
+ related operations. Of course, it's uncharged when another page is used
+ for the fault address.
+
+ At freeing anonymous page (by exit() or munmap()), zap_pte() is called
+ and pages for ptes are freed one by one.(see mm/memory.c). Uncharges
+ are done at page_remove_rmap() when page_mapcount() goes down to 0.
+
+ Another page freeing is by page-reclaim (vmscan.c) and anonymous
+ pages are swapped out. In this case, the page is marked as
+ PageSwapCache(). uncharge() routine doesn't uncharge the page marked
+ as SwapCache(). It's delayed until __delete_from_swap_cache().
+
+ 4.1 Swap-in.
+ At swap-in, the page is taken from swap-cache. There are 2 cases.
+
+ (a) If the SwapCache is newly allocated and read, it has no charges.
+ (b) If the SwapCache has been mapped by processes, it has been
+ charged already.
+
+ In case (a), we charge it. In case (b), we don't charge it.
+ (But racy state between (a) and (b) exists. We do check it.)
+ At charging, a charge recorded in swap_cgroup is moved to page_cgroup.
+
+ 4.2 Swap-out.
+ At swap-out, typical state transition is below.
+
+ (a) add to swap cache. (marked as SwapCache)
+ swp_entry's refcnt += 1.
+ (b) fully unmapped.
+ swp_entry's refcnt += # of ptes.
+ (c) write back to swap.
+ (d) delete from swap cache. (remove from SwapCache)
+ swp_entry's refcnt -= 1.
+
+
+ At (b), the page is marked as SwapCache and not uncharged.
+ At (d), the page is removed from SwapCache and a charge in page_cgroup
+ is moved to swap_cgroup.
+
+ Finally, at task exit,
+ (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0.
+ Here, a charge in swap_cgroup disappears.
+
+5. Page Cache
+ Page Cache is charged at
+ - add_to_page_cache_locked().
+
+ uncharged at
+ - __remove_from_page_cache().
+
+ The logic is very clear. (About migration, see below)
+ Note: __remove_from_page_cache() is called by remove_from_page_cache()
+ and __remove_mapping().
+
+6. Shmem(tmpfs) Page Cache
+ Memcg's charge/uncharge have special handlers of shmem. The best way
+ to understand shmem's page state transition is to read mm/shmem.c.
+ But brief explanation of the behavior of memcg around shmem will be
+ helpful to understand the logic.
+
+ Shmem's page (just leaf page, not direct/indirect block) can be on
+ - radix-tree of shmem's inode.
+ - SwapCache.
+ - Both on radix-tree and SwapCache. This happens at swap-in
+ and swap-out,
+
+ It's charged when...
+ - A new page is added to shmem's radix-tree.
+ - A swp page is read. (move a charge from swap_cgroup to page_cgroup)
+ It's uncharged when
+ - A page is removed from radix-tree and not SwapCache.
+ - When SwapCache is removed, a charge is moved to swap_cgroup.
+ - When swp_entry's refcnt goes down to 0, a charge in swap_cgroup
+ disappears.

- Tests for racy cases.
+7. Page Migration
+ One of the most complicated functions is page-migration-handler.
+ Memcg has 2 routines. Assume that we are migrating a page's contents
+ from OLDPAGE to NEWPAGE.
+
+ Usual migration logic is..
+ (a) remove the page from LRU.
+ (b) allocate NEWPAGE (migration target)
+ (c) lock by lock_page().
+ (d) unmap all mappings.
+ (e-1) If necessary, replace entry in radix-tree.
+ (e-2) move contents of a page.
+ (f) map all mappings again.
+ (g) pushback the page to LRU.
+ (-) OLDPAGE will be freed.
+
+ Before (g), memcg should complete all necessary charge/uncharge to
+ NEWPAGE/OLDPAGE.
+
+ The point is....
+ - If OLDPAGE is anonymous, all charges will be dropped at (d) because
+ try_to_unmap() drops all mapcount and the page will not be
+ SwapCache.
+
+ - If OLDPAGE is SwapCache, charges will be kept at (g) because
+ __delete_from_swap_cache() isn't called at (e-1)
+
+ - If OLDPAGE is page-cache, charges will be kept at (g) because
+ remove_from_swap_cache() isn't called at (e-1)
+
+ memcg provides following hooks.
+
+ - mem_cgroup_prepare_migration(OLDPAGE)
+ Called after (b) to account a charge (usage += PAGE_SIZE) against
+ memcg which OLDPAGE belongs to.
+
+ - mem_cgroup_end_migration(OLDPAGE, NEWPAGE)
+ Called after (f) before (g).
+ If OLDPAGE is used, commit OLDPAGE again. If OLDPAGE is already
+ charged, a charge by prepare_migration() is automatically canceled.
+ If NEWPAGE is used, commit NEWPAGE and uncharge OLDPAGE.
+
+ But zap_pte() (by exit or munmap) can be called during migration, so
+ we have to check if OLDPAGE/NEWPAGE is a valid page after commit().
+
+8. LRU
+ Each memcg has its own private LRU. Now, its handling is under global
+ VM's control (i.e., it's handled under the global zone->lru_lock).
+ Almost all routines around memcg's LRU are called by the global LRU's
+ list management functions under zone->lru_lock().
+
+ A special function is mem_cgroup_isolate_pages(). This scans
+ memcg's private LRU and calls __isolate_lru_page() to extract a page
+ from LRU.
+ (By __isolate_lru_page(), the page is removed from both of global and
+ private LRU.)

- 4.1 small limit to memcg.
+
+9. Typical Tests.
+
+ Tests for racy cases.
+
+ 9.1 Small limit to memcg.
When you do test to do racy case, it's good test to set memcg's limit
to be very small rather than GB. Many races found in the test under
xKB or xxMB limits.
(Memory behavior under GB and Memory behavior under MB shows very
different situation.)

- 4.2 shmem
+ 9.2 Shmem
Historically, memcg's shmem handling was poor and we saw some amount
of troubles here. This is because shmem is page-cache but can be
SwapCache. Test with shmem/tmpfs is always good test.

- 4.3 migration
- For NUMA, migration is an another special. To do easy test, cpuset
+ 9.3 Migration
+ For NUMA, migration is another special case. To do an easy test, cpuset
is useful. Following is a sample script to do migration.

mount -t cgroup -o cpuset none /opt/cpuset
@@ -118,20 +275,20 @@ patterns tend to be racy.
G2_TASK=`cat ${G2}/tasks`
move_task "${G1_TASK}" ${G2} &
--
- 4.4 memory hotplug.
+ 9.4 Memory hotplug.
memory hotplug test is one of good test.
to offline memory, do following.
# echo offline > /sys/devices/system/memory/memoryXXX/state
(XXX is the place of memory)
This is an easy way to test page migration, too.

- 4.5 mkdir/rmdir
+ 9.5 mkdir/rmdir
When using hierarchy, mkdir/rmdir test should be done.
- tests like following.
+ Use tests like the following.

- #echo 1 >/opt/cgroup/01/memory/use_hierarchy
- #mkdir /opt/cgroup/01/child_a
- #mkdir /opt/cgroup/01/child_b
+ echo 1 >/opt/cgroup/01/memory/use_hierarchy
+ mkdir /opt/cgroup/01/child_a
+ mkdir /opt/cgroup/01/child_b

set limit to 01.
add limit to 01/child_b
@@ -143,3 +300,12 @@ patterns tend to be racy.
/opt/cgroup/01/child_c

running new jobs in new group is also good.
+
+ 9.6 Mount with other subsystems.
+ Mounting with other subsystems is a good test because there is a
+ race and lock dependency with other cgroup subsystems.
+
+ example)
+ # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices
+
+ and do task move, mkdir, rmdir etc...under this.
Index: mmotm-2.6.28-Dec03/mm/memcontrol.c
===================================================================
--- mmotm-2.6.28-Dec03.orig/mm/memcontrol.c
+++ mmotm-2.6.28-Dec03/mm/memcontrol.c
@@ -6,6 +6,10 @@
* Copyright 2007 OpenVZ SWsoft Inc
* Author: Pavel Emelianov <[email protected]>
*
+ * Documentation is available at
+ * Documentation/controllers/memory.txt
+ * Documentation/controllers/memcg_test.txt
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or

2008-12-09 11:07:54

by Kamezawa Hiroyuki

Subject: [RFC][PATCH 2/6] memcg: fix pre_destroy handler

A better name for the new flag is welcome.

==
Because the pre_destroy() handler was moved outside of cgroup_lock() to
avoid deadlock, cgroup's rmdir() now does the following sequence.

cgroup_lock()
check children and tasks.
(A)
cgroup_unlock()
(B)
pre_destroy() for subsys;-----(1)
(C)
cgroup_lock();
(D)
Second check: check for -EBUSY again because we released the lock.
(E)
mark cgroup as removed.
(F)
unlink from lists.
cgroup_unlock();
dput()
=> when dentry's refcnt goes down to 0
destroy() handlers for subsys

memcg marks itself as "obsolete" when pre_destroy() is called at (1).
But rmdir() can fail after pre_destroy(), so marking it "obsolete" there is a bug.
I'd like to fix the sanity of pre_destroy() in the cgroup layer.

Considering the above sequence, new tasks can be added between (B) and (C),
and a swap-in record can be charged back to a cgroup after pre_destroy(),
at (C), (D) and (E).
(This means the cgroup's refcnt can come not from tasks but from other
persistent objects.)

This patch adds a "cgroup_is_being_removed()" check. (A better name is welcome.)
After this,

- the cgroup is marked as CGRP_PRE_REMOVAL at (A).
- If the second check fails, the CGRP_PRE_REMOVAL flag is cleared.
- memcg's own obsolete flag is removed.
- While CGRP_PRE_REMOVAL is set, task attach will fail with -EBUSY.
(task attach via clone() will not hit this case.)

By this, we can trust pre_destroy()'s result.


Note: if CGRP_REMOVED can be set and cleared, it should be used instead of
CGRP_PRE_REMOVAL.
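
To illustrate the sequence above, here is a minimal userspace sketch (plain C,
with a pthread mutex standing in for cgroup_mutex). All fake_* names are
hypothetical; it only models the ordering of the checks and the reversible
"pre removal" mark, not the real cgroup code:

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_cgroup {
        pthread_mutex_t lock;   /* stands in for cgroup_mutex */
        int task_count;         /* stands in for cgrp->count / tasks */
        bool pre_removal;       /* stands in for CGRP_PRE_REMOVAL */
};

/* (1) pre_destroy(): subsystems drain their charges here; may sleep. */
static void fake_pre_destroy(struct fake_cgroup *cg)
{
        (void)cg;               /* nothing to drain in this model */
}

/* Task attach fails while an rmdir() is in progress. */
static int fake_attach_task(struct fake_cgroup *cg)
{
        int ret = 0;

        pthread_mutex_lock(&cg->lock);
        if (cg->pre_removal)
                ret = -EBUSY;   /* between (B) and (E): rmdir in progress */
        else
                cg->task_count++;
        pthread_mutex_unlock(&cg->lock);
        return ret;
}

static int fake_rmdir(struct fake_cgroup *cg)
{
        pthread_mutex_lock(&cg->lock);
        if (cg->task_count) {                   /* first check, (A) */
                pthread_mutex_unlock(&cg->lock);
                return -EBUSY;
        }
        cg->pre_removal = true;                 /* mark before dropping the lock */
        pthread_mutex_unlock(&cg->lock);        /* (B) */

        fake_pre_destroy(cg);                   /* (1), called without the lock */

        pthread_mutex_lock(&cg->lock);          /* (C)/(D) */
        if (cg->task_count) {                   /* second check, (E) */
                cg->pre_removal = false;        /* rmdir failed: undo the mark */
                pthread_mutex_unlock(&cg->lock);
                return -EBUSY;
        }
        /* (F): mark CGRP_REMOVED, unlink from lists, ... */
        pthread_mutex_unlock(&cg->lock);
        return 0;
}

int main(void)
{
        struct fake_cgroup cg = { PTHREAD_MUTEX_INITIALIZER, 0, false };

        printf("attach -> %d\n", fake_attach_task(&cg));
        printf("rmdir  -> %d\n", fake_rmdir(&cg));  /* -EBUSY: a task is attached */
        return 0;
}

The point is exactly the clearing of the mark in the failure path: because
rmdir() can still fail at the second check, the "being removed" mark must be
reversible, which memcg's one-way "obsolete" flag was not.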


Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>

---
include/linux/cgroup.h | 5 +++++
kernel/cgroup.c | 18 +++++++++++++++++-
mm/memcontrol.c | 36 ++++++++++++++++++++++++++----------
3 files changed, 48 insertions(+), 11 deletions(-)

Index: mmotm-2.6.28-Dec08/mm/memcontrol.c
===================================================================
--- mmotm-2.6.28-Dec08.orig/mm/memcontrol.c
+++ mmotm-2.6.28-Dec08/mm/memcontrol.c
@@ -166,7 +166,6 @@ struct mem_cgroup {
*/
bool use_hierarchy;
unsigned long last_oom_jiffies;
- int obsolete;
atomic_t refcnt;

unsigned int swappiness;
@@ -211,6 +210,24 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);

+static bool memcg_is_obsolete(struct mem_cgroup *mem)
+{
+ struct cgroup *cg = mem->css.cgroup;
+ /*
+ * "Being Removed" means pre_destroy() handler is called.
+ * After "pre_destroy" handler is called, memcg should not
+ * have any additional charges.
+ * This means there are small races for mis-accounting. But this
+ * mis-accounting should happen only under a swap-in operation.
+ * (Attaching a new task will fail if the cgroup is under rmdir().)
+ */
+
+ if (!cg || cgroup_is_removed(cg) || cgroup_is_being_removed(cg))
+ return true;
+ return false;
+}
+
+
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
struct page_cgroup *pc,
bool charge)
@@ -597,7 +614,7 @@ mem_cgroup_get_first_node(struct mem_cgr
struct cgroup *cgroup;
struct mem_cgroup *ret;
bool obsolete = (root_mem->last_scanned_child &&
- root_mem->last_scanned_child->obsolete);
+ memcg_is_obsolete(root_mem->last_scanned_child));

/*
* Scan all children under the mem_cgroup mem
@@ -1070,7 +1087,7 @@ int mem_cgroup_try_charge_swapin(struct
ent.val = page_private(page);

mem = lookup_swap_cgroup(ent);
- if (!mem || mem->obsolete)
+ if (!mem || memcg_is_obsolete(mem))
goto charge_cur_mm;
*ptr = mem;
return __mem_cgroup_try_charge(NULL, mask, ptr, true);
@@ -1104,7 +1121,7 @@ int mem_cgroup_cache_charge_swapin(struc
ent.val = page_private(page);
if (do_swap_account) {
mem = lookup_swap_cgroup(ent);
- if (mem && mem->obsolete)
+ if (mem && memcg_is_obsolete(mem))
mem = NULL;
if (mem)
mm = NULL;
@@ -2050,9 +2067,6 @@ static struct mem_cgroup *mem_cgroup_all
* the number of reference from swap_cgroup and free mem_cgroup when
* it goes down to 0.
*
- * When mem_cgroup is destroyed, mem->obsolete will be set to 0 and
- * entry which points to this memcg will be ignore at swapin.
- *
* Removal of cgroup itself succeeds regardless of refs from swap.
*/

@@ -2081,7 +2095,7 @@ static void mem_cgroup_get(struct mem_cg
static void mem_cgroup_put(struct mem_cgroup *mem)
{
if (atomic_dec_and_test(&mem->refcnt)) {
- if (!mem->obsolete)
+ if (!memcg_is_obsolete(mem))
return;
mem_cgroup_free(mem);
}
@@ -2148,14 +2162,16 @@ static void mem_cgroup_pre_destroy(struc
struct cgroup *cont)
{
struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
- mem->obsolete = 1;
mem_cgroup_force_empty(mem, false);
}

static void mem_cgroup_destroy(struct cgroup_subsys *ss,
struct cgroup *cont)
{
- mem_cgroup_free(mem_cgroup_from_cont(cont));
+ struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+ mem_cgroup_free(mem);
+ /* forget */
+ mem->css.cgroup = NULL;
}

static int mem_cgroup_populate(struct cgroup_subsys *ss,
Index: mmotm-2.6.28-Dec08/include/linux/cgroup.h
===================================================================
--- mmotm-2.6.28-Dec08.orig/include/linux/cgroup.h
+++ mmotm-2.6.28-Dec08/include/linux/cgroup.h
@@ -98,6 +98,8 @@ enum {
CGRP_RELEASABLE,
/* Control Group requires release notifications to userspace */
CGRP_NOTIFY_ON_RELEASE,
+ /* Control Group is preparing for death */
+ CGRP_PRE_REMOVAL,
};

struct cgroup {
@@ -303,8 +305,11 @@ int cgroup_add_files(struct cgroup *cgrp
const struct cftype cft[],
int count);

+
int cgroup_is_removed(const struct cgroup *cgrp);

+int cgroup_is_being_removed(const struct cgroup *cgrp);
+
int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);

int cgroup_task_count(const struct cgroup *cgrp);
Index: mmotm-2.6.28-Dec08/kernel/cgroup.c
===================================================================
--- mmotm-2.6.28-Dec08.orig/kernel/cgroup.c
+++ mmotm-2.6.28-Dec08/kernel/cgroup.c
@@ -123,6 +123,11 @@ inline int cgroup_is_removed(const struc
return test_bit(CGRP_REMOVED, &cgrp->flags);
}

+inline int cgroup_is_being_removed(const struct cgroup *cgrp)
+{
+ return test_bit(CGRP_PRE_REMOVAL, &cgrp->flags);
+}
+
/* bits in struct cgroupfs_root flags field */
enum {
ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
@@ -1217,6 +1222,13 @@ int cgroup_attach_task(struct cgroup *cg
if (cgrp == oldcgrp)
return 0;

+ /*
+ * This cgroup is under rmdir() operation. Never fails here when this
+ * is called from clone().
+ */
+ if (cgroup_is_being_removed(cgrp))
+ return -EBUSY;
+
for_each_subsys(root, ss) {
if (ss->can_attach) {
retval = ss->can_attach(ss, cgrp, tsk);
@@ -2469,12 +2481,14 @@ static int cgroup_rmdir(struct inode *un
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
- mutex_unlock(&cgroup_mutex);

/*
* Call pre_destroy handlers of subsys. Notify subsystems
* that rmdir() request comes.
*/
+ set_bit(CGRP_PRE_REMOVAL, &cgrp->flags);
+ mutex_unlock(&cgroup_mutex);
+
cgroup_call_pre_destroy(cgrp);

mutex_lock(&cgroup_mutex);
@@ -2483,12 +2497,14 @@ static int cgroup_rmdir(struct inode *un
if (atomic_read(&cgrp->count)
|| !list_empty(&cgrp->children)
|| cgroup_has_css_refs(cgrp)) {
+ clear_bit(CGRP_PRE_REMOVAL, &cgrp->flags);
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}

spin_lock(&release_list_lock);
set_bit(CGRP_REMOVED, &cgrp->flags);
+ clear_bit(CGRP_PRE_REMOVAL, &cgrp->flags);
if (!list_empty(&cgrp->release_list))
list_del(&cgrp->release_list);
spin_unlock(&release_list_lock);

2008-12-09 11:09:18

by Kamezawa Hiroyuki

Subject: [RFC][PATCH 3/6] cgroup id


From: KAMEZAWA Hiroyuki <[email protected]>

Patch for Cgroup ID and hierarchy code.

This patch tries to assign an ID to each cgroup: it attaches a unique ID to
each cgroup and provides the following functions.

- cgroup_lookup(id)
 returns the struct cgroup for id.
- cgroup_get_next(id, rootid, depth, foundid)
 returns the next cgroup under "root" by scanning a bitmap (not by tree-walk)
- cgroup_id_put()/cgroup_id_tryget()
 used when a subsystem wants to prevent reuse of an ID.

There are several reasons to develop this.

- While trying to implement hierarchy in the memory cgroup, we have to
 implement "walk under hierarchy" code.
 Now it consists of cgroup_lock and tree up-down code. Because the
 memory cgroup has to do hierarchy walks in other places for
 intelligent processing, we'll reuse the "walk" code.
 But taking "cgroup_lock" while walking the tree can cause deadlocks.
 An easier way is helpful.

- SwapCgroup uses an array of pointers to record the owner of each swap entry.
 By ID, we can reduce this to "short" or "int". This means an ID is
 useful for reducing the space consumed by pointers, if the access cost
 is not a problem.
 (I hear bio-cgroup will use the same kind of...)

Example) OOM-Killer under hierarchy.
    do {
        rcu_read_lock();
        next = cgroup_get_next(id, rootid, depth, &foundid);
        /* check sanity of next here */
        css_tryget();
        rcu_read_unlock();
        if (!next)
            break;
        cgroup_scan_tasks(select_bad_process?);
        /* record score here...*/
    } while (1);


Characteristics:
- Each cgroup gets a new ID when created.
- A cgroup ID contains an "ID", a "Depth in tree" and a hierarchy code.
- The hierarchy code is an array of ancestor IDs (see the sketch below).
- ID 0 is the UNUSED ID.
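
A minimal userspace sketch of that hierarchy code (hypothetical fake_* names,
not this patch's API): each node stores its depth and the IDs of itself and
all its ancestors in stack[], so an ancestor test is a single array lookup,
which is what cgroup_is_ancestor() in this patch relies on.

#include <stdio.h>

#define MAX_DEPTH 10

struct fake_cgroup_id {
        unsigned int id;
        unsigned int depth;                /* hierarchy root has depth 0 */
        unsigned int stack[MAX_DEPTH];     /* ancestor IDs; stack[depth] == id */
};

/* non-zero if "root" is an ancestor of (or equal to) "cg" */
static int fake_is_ancestor(const struct fake_cgroup_id *cg,
                            const struct fake_cgroup_id *root)
{
        return cg->stack[root->depth] == root->id;
}

int main(void)
{
        struct fake_cgroup_id root       = { 1, 0, { 1 } };
        struct fake_cgroup_id child      = { 5, 1, { 1, 5 } };
        struct fake_cgroup_id grandchild = { 9, 2, { 1, 5, 9 } };
        struct fake_cgroup_id other      = { 7, 1, { 1, 7 } };

        printf("%d\n", fake_is_ancestor(&grandchild, &child)); /* 1 */
        printf("%d\n", fake_is_ancestor(&grandchild, &other)); /* 0 */
        return 0;
}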

Design Choices:
- At this moment, swap_cgroup and bio_cgroup will be the users of this
 ID. And memcg will use some routines which allow
 scanning-without-cgroup-lock.

- Now, SwapCgroup has its own refcnt to the memory cgroup because the
 lifetime of a swap_entry can be longer than the cgroup's.
 To replace the pointer in SwapCgroup with an ID, the ID should have a refcnt.

- scan-by-ID vs. scan-by-tree-walk.
 As /proc's pid scan does, scan-by-ID is robust when scanning is done
 by the following kind of routine:
 scan -> rest a while (release a lock) -> continue from where it was interrupted
 memcg's hierarchical reclaim does this.

Consideration:
- I'd like to use "short" for cgroup_id to save space...
- MAX_DEPTH is small ? (making this depend on boot option is easy.)
TODO:
- Documentation.

Changelog:(v3) -> (v4)
- updated comments.
- renamed hierarchy_code[] to stack[]
- merged prepare_id routines.

Changelog (v2) -> (v3)
- removed cgroup_id_getref().
- added cgroup_id_tryget().

Changelog (v1) -> (v2):
- Design change: show only ID(integer) to outside of cgroup.c
- moved cgroup ID definition from include/ to kernel/cgroup.c
- struct cgroup_id is freed by RCU.
- changed interface from pointer to "int"
- kill_sb() is handled.
- ID 0 as unused ID.

Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
include/linux/cgroup.h | 33 +++++
include/linux/idr.h | 1
kernel/cgroup.c | 320 ++++++++++++++++++++++++++++++++++++++++++++++++-
lib/idr.c | 46 +++++++
4 files changed, 396 insertions(+), 4 deletions(-)

Index: mmotm-2.6.28-Dec08/include/linux/cgroup.h
===================================================================
--- mmotm-2.6.28-Dec08.orig/include/linux/cgroup.h
+++ mmotm-2.6.28-Dec08/include/linux/cgroup.h
@@ -22,6 +22,7 @@ struct cgroupfs_root;
struct cgroup_subsys;
struct inode;
struct cgroup;
+struct cgroup_id;

extern int cgroup_init_early(void);
extern int cgroup_init(void);
@@ -61,6 +62,8 @@ struct cgroup_subsys_state {
unsigned long flags;
};

+#define MAX_CGROUP_DEPTH (10)
+
/* bits in struct cgroup_subsys_state flags field */
enum {
CSS_ROOT, /* This CSS is the root of the subsystem */
@@ -147,6 +150,8 @@ struct cgroup {
int pids_use_count;
/* Length of the current tasks_pids array */
int pids_length;
+ /* Cgroup ID */
+ struct cgroup_id *id;
};

/* A css_set is a structure holding pointers to a set of
@@ -398,6 +403,34 @@ void cgroup_iter_end(struct cgroup *cgrp
int cgroup_scan_tasks(struct cgroup_scanner *scan);
int cgroup_attach_task(struct cgroup *, struct task_struct *);

+/*
+ * Cgroup ID is a system-wide ID for all cgroup structs. It can be used for
+ * look-up and scanning. A cgroup ID is assigned at cgroup allocation and removed
+ * when the refcnt to the ID goes down to 0. The refcnt is incremented when a
+ * subsys wants to avoid reuse of the ID for persistent objects. Usually, the
+ * refcnt to an ID will be 0 when the cgroup is removed.
+ * These look-up and scan functions should be called under rcu_read_lock();
+ * cgroup_lock() is not necessary.
+ */
+
+/* Find the cgroup to which the "ID" is attached. */
+struct cgroup *cgroup_lookup(int id);
+/*
+ * Get the next cgroup under the tree. Returns a cgroup which has an ID equal
+ * to or greater than the given "id".
+ */
+struct cgroup *cgroup_get_next(int id, int rootid, int depth, int *foundid);
+
+/* get id and depth of cgroup */
+int cgroup_id(struct cgroup *cgroup);
+int cgroup_depth(struct cgroup *cgroup);
+/* returns non-zero if root is ancestor of cg */
+int cgroup_is_ancestor(struct cgroup *cg, struct cgroup *root);
+
+/* For refcnt/delayed freeing of IDs */
+int cgroup_id_tryget(int id);
+void cgroup_id_put(int id);
+
#else /* !CONFIG_CGROUPS */

static inline int cgroup_init_early(void) { return 0; }
Index: mmotm-2.6.28-Dec08/kernel/cgroup.c
===================================================================
--- mmotm-2.6.28-Dec08.orig/kernel/cgroup.c
+++ mmotm-2.6.28-Dec08/kernel/cgroup.c
@@ -46,7 +46,7 @@
#include <linux/cgroupstats.h>
#include <linux/hash.h>
#include <linux/namei.h>
-
+#include <linux/idr.h>
#include <asm/atomic.h>

static DEFINE_MUTEX(cgroup_mutex);
@@ -556,6 +556,301 @@ void cgroup_unlock(void)
}

/*
+ * CGROUP ID
+ */
+struct cgroup_id {
+ /*
+ * The cgroup to which this ID points. If cgroup is removed,
+ * this will point to NULL.
+ */
+ struct cgroup *cgroup;
+ /*
+ * ID of this cgroup.
+ */
+ unsigned int id;
+ /*
+ * Depth in hierarchy which this ID belongs to.
+ */
+ unsigned int depth;
+ /*
+ * Refcnt is managed for persistent objects.
+ */
+ atomic_t refcnt;
+ /*
+ * ID is freed by RCU. (and lookup routine is RCU safe.)
+ */
+ struct rcu_head rcu_head;
+ /*
+ * Hierarchy of Cgroup ID belongs to.
+ */
+ unsigned int stack[MAX_CGROUP_DEPTH];
+};
+
+void free_cgroupid_cb(struct rcu_head *head)
+{
+ struct cgroup_id *id;
+
+ id = container_of(head, struct cgroup_id, rcu_head);
+ kfree(id);
+}
+
+void free_cgroupid(struct cgroup_id *id)
+{
+ call_rcu(&id->rcu_head, free_cgroupid_cb);
+}
+
+/*
+ * Cgroup ID and lookup functions.
+ * cgid->cgroup pointer is safe under rcu_read_lock() because dput() of the
+ * cgroup dentry, which finally frees the cgroup pointer, uses synchronize_rcu().
+ *
+ * TODO: defining private ID per hierarchy is maybe better. But it's difficult
+ * now because we cannot guarantee that all IDs are freed at kill_sb().
+ */
+static DEFINE_IDR(cgroup_idr);
+DEFINE_SPINLOCK(cgroup_idr_lock);
+
+/*
+ * To get ID other than 0, this should be called when !cgroup_is_removed().
+ */
+int cgroup_id(struct cgroup *cgrp)
+{
+ struct cgroup_id *cgid = rcu_dereference(cgrp->id);
+
+ if (cgid)
+ return cgid->id;
+ return 0;
+}
+
+int cgroup_depth(struct cgroup *cgrp)
+{
+ struct cgroup_id *cgid = rcu_dereference(cgrp->id);
+
+ if (cgid)
+ return cgid->depth;
+ return 0;
+}
+
+
+int cgroup_is_ancestor(struct cgroup *cgrp, struct cgroup *root)
+{
+ struct cgroup_id *id = cgrp->id;
+ struct cgroup_id *ans = root->id;
+
+ if (!id || !ans)
+ return 0;
+ return (id->stack[ans->depth] == ans->id);
+}
+
+
+static int __get_and_prepare_newid(struct cgroup_id **ret)
+{
+ struct cgroup_id *newid;
+ int myid, error;
+
+ newid = kzalloc(sizeof(*newid), GFP_KERNEL);
+ if (!newid)
+ return -ENOMEM;
+ /* get id */
+ if (unlikely(!idr_pre_get(&cgroup_idr, GFP_KERNEL))) {
+ error = -ENOMEM;
+ goto err_out;
+ }
+ spin_lock_irq(&cgroup_idr_lock);
+ /* Don't use 0 */
+ error = idr_get_new_above(&cgroup_idr, newid, 1, &myid);
+ spin_unlock_irq(&cgroup_idr_lock);
+
+ /* Returns error only when there are no free spaces for new ID.*/
+ if (error)
+ goto err_out;
+
+ newid->id = myid;
+ atomic_set(&newid->refcnt, 1);
+ *ret = newid;
+ return 0;
+err_out:
+ kfree(newid);
+ return error;
+
+}
+
+
+static int cgrouproot_setup_idr(struct cgroupfs_root *root)
+{
+ struct cgroup_id *newid;
+ int err = -ENOMEM;
+
+ err = __get_and_prepare_newid(&newid);
+ if (err)
+ return err;
+
+ newid->depth = 0;
+ newid->stack[0] = newid->id;
+ newid->cgroup = &root->top_cgroup;
+ root->top_cgroup.id = newid;
+ return 0;
+}
+
+static int cgroup_prepare_id(struct cgroup *parent, struct cgroup_id **id)
+{
+ struct cgroup_id *newid;
+ int error;
+
+ /* check depth */
+ if (parent->id->depth + 1 >= MAX_CGROUP_DEPTH)
+ return -ENOSPC;
+
+ error = __get_and_prepare_newid(&newid);
+ if (error)
+ return error;
+ *id = newid;
+ return 0;
+}
+
+
+static void cgroup_id_attach(struct cgroup_id *cgid,
+ struct cgroup *cg, struct cgroup *parent)
+{
+ struct cgroup_id *parent_id = parent->id; /* parent is alive */
+ int i;
+
+ cgid->depth = parent_id->depth + 1;
+ /* Inherit hierarchy code from parent */
+ for (i = 0; i < cgid->depth; i++)
+ cgid->stack[i] = parent_id->stack[i];
+ cgid->stack[cgid->depth] = cgid->id;
+ rcu_assign_pointer(cgid->cgroup, cg);
+ rcu_assign_pointer(cg->id, cgid);
+
+ return;
+}
+
+void cgroup_id_put(int id)
+{
+ struct cgroup_id *cgid;
+ unsigned long flags;
+
+ rcu_read_lock();
+ cgid = idr_find(&cgroup_idr, id);
+ BUG_ON(!cgid);
+ if (atomic_dec_and_test(&cgid->refcnt)) {
+ spin_lock_irqsave(&cgroup_idr_lock, flags);
+ idr_remove(&cgroup_idr, cgid->id);
+ spin_unlock_irqrestore(&cgroup_idr_lock, flags);
+ free_cgroupid(cgid);
+ }
+ rcu_read_unlock();
+}
+
+static void cgroup_id_detach(struct cgroup *cg)
+{
+ struct cgroup_id *id = rcu_dereference(cg->id);
+
+ rcu_assign_pointer(id->cgroup, NULL);
+ cgroup_id_put(id->id);
+ rcu_assign_pointer(cg->id, NULL);
+}
+/**
+ * cgroup_id_tryget() -- try to get refcnt of ID.
+ * @id: the ID to be kept for a while.
+ *
+ * Increment the refcnt of an ID and prevent its reuse. Useful for a subsys
+ * which remembers a cgroup by ID rather than by pointer to struct cgroup
+ * (or subsys). Returns a non-zero value on success.
+ */
+int cgroup_id_tryget(int id)
+{
+ struct cgroup_id *cgid;
+ int ret = 0;
+
+ rcu_read_lock();
+ cgid = idr_find(&cgroup_idr, id);
+ if (cgid)
+ ret = atomic_inc_not_zero(&cgid->refcnt);
+ rcu_read_unlock();
+ return ret;
+}
+
+/**
+ * cgroup_lookup - lookup cgroup by id
+ * @id: the id of cgroup to be looked up
+ *
+ * Returns pointer to cgroup if there is valid cgroup with id, NULL if not.
+ * Should be called under rcu_read_lock() or cgroup_lock.
+ * If subsys is not used, returns NULL.
+ */
+
+struct cgroup *cgroup_lookup(int id)
+{
+ struct cgroup *cgrp = NULL;
+ struct cgroup_id *cgid = NULL;
+
+ rcu_read_lock();
+ cgid = idr_find(&cgroup_idr, id);
+
+ if (unlikely(!cgid))
+ goto out;
+
+ cgrp = rcu_dereference(cgid->cgroup);
+ if (unlikely(!cgrp || cgroup_is_removed(cgrp)))
+ cgrp = NULL;
+out:
+ rcu_read_unlock();
+ return cgrp;
+}
+
+/**
+ * cgroup_get_next - lookup next cgroup under specified hierarchy.
+ * @id: current position of iteration.
+ * @rootid: search tree under this.
+ * @depth: depth of root id.
+ * @foundid: position of found object.
+ *
+ * Search for the next cgroup under the specified hierarchy. If "id" is 0,
+ * the scan starts from the root cgroup. Calling under rcu_read_lock() or
+ * cgroup_lock() is necessary (to safely access the found cgroup).
+ * If the subsys is not used, returns NULL. If used, it's guaranteed that there
+ * is a used cgroup ID (the root).
+ */
+struct cgroup *
+cgroup_get_next(int id, int rootid, int depth, int *foundid)
+{
+ struct cgroup *ret = NULL;
+ struct cgroup_id *tmp;
+ int tmpid;
+ unsigned long flags;
+
+ rcu_read_lock();
+ tmpid = id;
+ while (1) {
+ /* scan next entry from bitmap(tree) */
+ spin_lock_irqsave(&cgroup_idr_lock, flags);
+ tmp = idr_get_next(&cgroup_idr, &tmpid);
+ spin_unlock_irqrestore(&cgroup_idr_lock, flags);
+
+ if (!tmp) {
+ ret = NULL;
+ break;
+ }
+
+ if (tmp->stack[depth] == rootid) {
+ ret = rcu_dereference(tmp->cgroup);
+ /* Sanity check and check hierarchy */
+ if (ret && !cgroup_is_removed(ret))
+ break;
+ }
+ tmpid = tmpid + 1;
+ }
+
+ rcu_read_unlock();
+ *foundid = tmpid;
+ return ret;
+}
+
+/*
* A couple of forward declarations required, due to cyclic reference loop:
* cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
* cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
@@ -1024,6 +1319,13 @@ static int cgroup_get_sb(struct file_sys
mutex_unlock(&inode->i_mutex);
goto drop_new_super;
}
+ /* Setup Cgroup ID for this fs */
+ ret = cgrouproot_setup_idr(root);
+ if (ret) {
+ mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&inode->i_mutex);
+ goto drop_new_super;
+ }

ret = rebind_subsystems(root, root->subsys_bits);
if (ret == -EBUSY) {
@@ -1110,9 +1412,10 @@ static void cgroup_kill_sb(struct super_

list_del(&root->root_list);
root_count--;
-
+ if (root->top_cgroup.id)
+ cgroup_id_detach(&root->top_cgroup);
mutex_unlock(&cgroup_mutex);
-
+ synchronize_rcu();
kfree(root);
kill_litter_super(sb);
}
@@ -2354,11 +2657,18 @@ static long cgroup_create(struct cgroup
int err = 0;
struct cgroup_subsys *ss;
struct super_block *sb = root->sb;
+ struct cgroup_id *cgid = NULL;

cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
if (!cgrp)
return -ENOMEM;

+ err = cgroup_prepare_id(parent, &cgid);
+ if (err) {
+ kfree(cgrp);
+ return err;
+ }
+
/* Grab a reference on the superblock so the hierarchy doesn't
* get deleted on unmount if there are child cgroups. This
* can be done outside cgroup_mutex, since the sb can't
@@ -2398,7 +2708,7 @@ static long cgroup_create(struct cgroup

err = cgroup_populate_dir(cgrp);
/* If err < 0, we have a half-filled directory - oh well ;) */
-
+ cgroup_id_attach(cgid, cgrp, parent);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);

@@ -2502,6 +2812,8 @@ static int cgroup_rmdir(struct inode *un
return -EBUSY;
}

+ cgroup_id_detach(cgrp);
+
spin_lock(&release_list_lock);
set_bit(CGRP_REMOVED, &cgrp->flags);
clear_bit(CGRP_PRE_REMOVAL, &cgrp->flags);
Index: mmotm-2.6.28-Dec08/include/linux/idr.h
===================================================================
--- mmotm-2.6.28-Dec08.orig/include/linux/idr.h
+++ mmotm-2.6.28-Dec08/include/linux/idr.h
@@ -106,6 +106,7 @@ int idr_get_new(struct idr *idp, void *p
int idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id);
int idr_for_each(struct idr *idp,
int (*fn)(int id, void *p, void *data), void *data);
+void *idr_get_next(struct idr *idp, int *nextid);
void *idr_replace(struct idr *idp, void *ptr, int id);
void idr_remove(struct idr *idp, int id);
void idr_remove_all(struct idr *idp);
Index: mmotm-2.6.28-Dec08/lib/idr.c
===================================================================
--- mmotm-2.6.28-Dec08.orig/lib/idr.c
+++ mmotm-2.6.28-Dec08/lib/idr.c
@@ -573,6 +573,52 @@ int idr_for_each(struct idr *idp,
EXPORT_SYMBOL(idr_for_each);

/**
+ * idr_get_next - lookup the next object for an id >= the given id.
+ * @idp: idr handle
+ * @nextidp: pointer to the lookup key; updated to the id that was found
+ *
+ * Returns a pointer to the registered object whose id is the smallest one
+ * greater than or equal to *nextidp, or NULL if there is none.
+ */
+
+void *idr_get_next(struct idr *idp, int *nextidp)
+{
+ struct idr_layer *p, *pa[MAX_LEVEL];
+ struct idr_layer **paa = &pa[0];
+ int id = *nextidp;
+ int n, max;
+
+ /* find first ent */
+ n = idp->layers * IDR_BITS;
+ max = 1 << n;
+ p = rcu_dereference(idp->top);
+ if (!p)
+ return NULL;
+
+ while (id < max) {
+ while (n > 0 && p) {
+ n -= IDR_BITS;
+ *paa++ = p;
+ p = rcu_dereference(p->ary[(id >> n) & IDR_MASK]);
+ }
+
+ if (p) {
+ *nextidp = id;
+ return p;
+ }
+
+ id += 1 << n;
+ while (n < fls(id)) {
+ n += IDR_BITS;
+ p = *--paa;
+ }
+ }
+ return NULL;
+}
+
+/**
* idr_replace - replace pointer for given id
* @idp: idr handle
* @ptr: pointer you want associated with the id

2008-12-09 11:10:25

by Kamezawa Hiroyuki

Subject: [RFC][PATCH 4/6] Flat hierarchical reclaim by ID


From: KAMEZAWA Hiroyuki <[email protected]>

Implement hierarchy reclaim by cgroup_id.

What changes:
- Page reclaim is no longer done by a tree-walk algorithm.
- mem_cgroup->last_scanned_child is changed to be an ID, not a pointer.
- No cgroup_lock; done under RCU.
- Scanning order is just defined by ID order.
(Scan by round-robin logic; see the sketch below.)
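
Here is a minimal userspace sketch of that round-robin selection (hypothetical
fake_* names, no RCU or locking; the subtree is modelled as a sorted array of
cgroup IDs, so cgroup_get_next() becomes "first ID >= nextid"):

#include <stdio.h>

static int subtree_ids[] = { 4, 7, 9, 12 };     /* IDs under the root, sorted */
#define NR_IDS ((int)(sizeof(subtree_ids) / sizeof(subtree_ids[0])))

struct fake_root {
        int last_scanned_child;         /* last ID we reclaimed from */
        unsigned long scan_age;         /* bumped on every full round */
};

/* model of cgroup_get_next(): smallest ID >= nextid, or 0 if none is left */
static int fake_get_next(int nextid)
{
        int i;

        for (i = 0; i < NR_IDS; i++)
                if (subtree_ids[i] >= nextid)
                        return subtree_ids[i];
        return 0;
}

static int fake_select_victim(struct fake_root *root)
{
        int next;

        while (1) {
                next = fake_get_next(root->last_scanned_child + 1);
                if (next) {
                        root->last_scanned_child = next;
                        return next;
                }
                /* wrapped around: start a new round */
                root->scan_age++;
                root->last_scanned_child = 0;
        }
}

int main(void)
{
        struct fake_root root = { 0, 0 };
        int i;

        for (i = 0; i < 6; i++)
                printf("victim=%d scan_age=%lu\n",
                       fake_select_victim(&root), root.scan_age);
        return 0;
}

In the real patch, the reclaim loop stops once the subtree has been walked
about twice (it compares scan_age against the value seen when reclaim started)
or when usage drops back under the limit.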

Changelog: v3 -> v4
- adjusted to changes in base kernel.
- is_ancestor() is moved to another patch.

Changelog: v2 -> v3
- fixed use_hierarchy==0 case

Changelog: v1 -> v2
- make use of css_tryget();
- count # of loops rather than remembering position.

Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>


mm/memcontrol.c | 274 +++++++++++++++++++++++++++-----------------------------
1 file changed, 134 insertions(+), 140 deletions(-)

---
Index: mmotm-2.6.28-Dec08/mm/memcontrol.c
===================================================================
--- mmotm-2.6.28-Dec08.orig/mm/memcontrol.c
+++ mmotm-2.6.28-Dec08/mm/memcontrol.c
@@ -158,9 +158,10 @@ struct mem_cgroup {

/*
* While reclaiming in a hiearchy, we cache the last child we
- * reclaimed from. Protected by cgroup_lock()
+ * reclaimed from.
*/
- struct mem_cgroup *last_scanned_child;
+ int last_scanned_child;
+ unsigned long scan_age;
/*
* Should the accounting and control be hierarchical, per subtree?
*/
@@ -170,7 +171,6 @@ struct mem_cgroup {

unsigned int swappiness;

-
unsigned int inactive_ratio;

/*
@@ -550,102 +550,70 @@ unsigned long mem_cgroup_isolate_pages(u
return nr_taken;
}

-#define mem_cgroup_from_res_counter(counter, member) \
- container_of(counter, struct mem_cgroup, member)
-
-/*
- * This routine finds the DFS walk successor. This routine should be
- * called with cgroup_mutex held
- */
-static struct mem_cgroup *
-mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
+static unsigned int get_swappiness(struct mem_cgroup *memcg)
{
- struct cgroup *cgroup, *curr_cgroup, *root_cgroup;
-
- curr_cgroup = curr->css.cgroup;
- root_cgroup = root_mem->css.cgroup;
-
- if (!list_empty(&curr_cgroup->children)) {
- /*
- * Walk down to children
- */
- mem_cgroup_put(curr);
- cgroup = list_entry(curr_cgroup->children.next,
- struct cgroup, sibling);
- curr = mem_cgroup_from_cont(cgroup);
- mem_cgroup_get(curr);
- goto done;
- }
-
-visit_parent:
- if (curr_cgroup == root_cgroup) {
- mem_cgroup_put(curr);
- curr = root_mem;
- mem_cgroup_get(curr);
- goto done;
- }
+ struct cgroup *cgrp = memcg->css.cgroup;
+ unsigned int swappiness;

- /*
- * Goto next sibling
- */
- if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
- mem_cgroup_put(curr);
- cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
- sibling);
- curr = mem_cgroup_from_cont(cgroup);
- mem_cgroup_get(curr);
- goto done;
- }
+ /* root ? */
+ if (cgrp->parent == NULL)
+ return vm_swappiness;

- /*
- * Go up to next parent and next parent's sibling if need be
- */
- curr_cgroup = curr_cgroup->parent;
- goto visit_parent;
+ spin_lock(&memcg->reclaim_param_lock);
+ swappiness = memcg->swappiness;
+ spin_unlock(&memcg->reclaim_param_lock);

-done:
- root_mem->last_scanned_child = curr;
- return curr;
+ return swappiness;
}

-/*
- * Visit the first child (need not be the first child as per the ordering
- * of the cgroup list, since we track last_scanned_child) of @mem and use
- * that to reclaim free pages from.
- */
+#define mem_cgroup_from_res_counter(counter, member) \
+ container_of(counter, struct mem_cgroup, member)
+
static struct mem_cgroup *
-mem_cgroup_get_first_node(struct mem_cgroup *root_mem)
+mem_cgroup_select_victim(struct mem_cgroup *root_mem)
{
- struct cgroup *cgroup;
- struct mem_cgroup *ret;
- bool obsolete = memcg_is_obsolete(root_mem->last_scanned_child);
+ struct cgroup *cgroup, *root_cgroup;
+ int nextid, rootid, depth, found;
+ struct mem_cgroup *ret = NULL;

+ root_cgroup = root_mem->css.cgroup;
+ rootid = cgroup_id(root_cgroup);
+ depth = cgroup_depth(root_cgroup);

- /*
- * Scan all children under the mem_cgroup mem
- */
- cgroup_lock();
- if (list_empty(&root_mem->css.cgroup->children)) {
+ /* If no hierarchy, "root_mem" is always victim */
+ if (!root_mem->use_hierarchy) {
+ spin_lock(&root_mem->reclaim_param_lock);
+ root_mem->scan_age++;
+ spin_unlock(&root_mem->reclaim_param_lock);
+ css_get(&root_mem->css);
ret = root_mem;
- goto done;
}
+ while (!ret) {
+ rcu_read_lock();
+ /* ID:0 is unused */
+ nextid = root_mem->last_scanned_child + 1;
+ cgroup = cgroup_get_next(nextid, rootid, depth, &found);

- if (!root_mem->last_scanned_child || obsolete) {
-
- if (obsolete)
- mem_cgroup_put(root_mem->last_scanned_child);
-
- cgroup = list_first_entry(&root_mem->css.cgroup->children,
- struct cgroup, sibling);
- ret = mem_cgroup_from_cont(cgroup);
- mem_cgroup_get(ret);
- } else
- ret = mem_cgroup_get_next_node(root_mem->last_scanned_child,
- root_mem);
+ spin_lock(&root_mem->reclaim_param_lock);
+ if (cgroup)
+ root_mem->last_scanned_child = found;
+ else {
+ root_mem->scan_age++;
+ root_mem->last_scanned_child = 0;
+ }
+ spin_unlock(&root_mem->reclaim_param_lock);

-done:
- root_mem->last_scanned_child = ret;
- cgroup_unlock();
+ if (cgroup) {
+ ret = mem_cgroup_from_cont(cgroup);
+ css_get(&ret->css);
+ /* avoid blocking rmdir() */
+ if (memcg_is_obsolete(ret)) {
+ css_put(&ret->css);
+ ret = NULL;
+ }
+ }
+ rcu_read_unlock();
+ }
return ret;
}

@@ -661,22 +629,6 @@ static bool mem_cgroup_check_under_limit
return false;
}

-static unsigned int get_swappiness(struct mem_cgroup *memcg)
-{
- struct cgroup *cgrp = memcg->css.cgroup;
- unsigned int swappiness;
-
- /* root ? */
- if (cgrp->parent == NULL)
- return vm_swappiness;
-
- spin_lock(&memcg->reclaim_param_lock);
- swappiness = memcg->swappiness;
- spin_unlock(&memcg->reclaim_param_lock);
-
- return swappiness;
-}
-
/*
* Dance down the hierarchy if needed to reclaim memory. We remember the
* last child we reclaimed from, so that we don't end up penalizing
@@ -687,41 +639,26 @@ static unsigned int get_swappiness(struc
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
gfp_t gfp_mask, bool noswap)
{
- struct mem_cgroup *next_mem;
+ struct mem_cgroup *victim;
+ unsigned long start_age;
int ret = 0;
+ int total = 0;

- /*
- * Reclaim unconditionally and don't check for return value.
- * We need to reclaim in the current group and down the tree.
- * One might think about checking for children before reclaiming,
- * but there might be left over accounting, even after children
- * have left.
- */
- ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap,
- get_swappiness(root_mem));
- if (mem_cgroup_check_under_limit(root_mem))
- return 0;
- if (!root_mem->use_hierarchy)
- return ret;
+ start_age = root_mem->scan_age;

- next_mem = mem_cgroup_get_first_node(root_mem);
+ /* Allows 2 visits to "root_mem" */
+ while (time_after(start_age + 2UL, root_mem->scan_age)) {

- while (next_mem != root_mem) {
- if (memcg_is_obsolete(next_mem)) {
- mem_cgroup_put(next_mem);
- cgroup_lock();
- next_mem = mem_cgroup_get_first_node(root_mem);
- cgroup_unlock();
- continue;
- }
- ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap,
- get_swappiness(next_mem));
+ victim = mem_cgroup_select_victim(root_mem);
+ ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap,
+ get_swappiness(victim));
+ css_put(&victim->css);
if (mem_cgroup_check_under_limit(root_mem))
- return 0;
- cgroup_lock();
- next_mem = mem_cgroup_get_next_node(next_mem, root_mem);
- cgroup_unlock();
+ return 1;
+ total += ret;
}
+ ret = total;
+
return ret;
}

@@ -2149,7 +2086,8 @@ mem_cgroup_create(struct cgroup_subsys *
res_counter_init(&mem->memsw, NULL);
}
mem_cgroup_set_inactive_ratio(mem);
- mem->last_scanned_child = NULL;
+ mem->last_scanned_child = 0;
+ mem->scan_age = 0;
spin_lock_init(&mem->reclaim_param_lock);

if (parent)

2008-12-09 11:11:30

by Kamezawa Hiroyuki

Subject: [RFC][PATCH 5/6] fix inactive_ratio under hierarchy


From: KAMEZAWA Hiroyuki <[email protected]>

After the LRU updates for memcg, the following test easily hits OOM,
and memory-reclaim speed was very bad.

mkdir /opt/cgroup/xxx
echo 1 > /opt/cgroup/xxx/memory.use_hierarchy
mkdir /opt/cgroup/xxx/01
mkdir /opt/cgroup/xxx/02
echo 40M > /opt/cgroup/xxx/memory.limit_in_bytes

Run task under group 01 or 02.

This is because the calculation of inactive_ratio doesn't handle hierarchy.
In the above, 01's and 02's inactive_ratio = 65535 and the inactive list will
be empty.

This patch tries to set 01's and 02's inactive_ratio to an appropriate value
under hierarchy. inactive_ratio is adjusted to the minimum limit found
upwards in the hierarchy.


ex)In following tree,
/opt/cgroup/01 limit=1G
/opt/cgroup/01/A limit=500M
/opt/cgroup/01/A/B limit=unlimited
/opt/cgroup/01/A/C limit=50M
/opt/cgroup/01/Z limit=700M


/opt/cgroup/01's inactive_ratio is calculated by limit of 1G.
/opt/cgroup/01/A's inactive_ratio is calculated by limit of 500M
/opt/cgroup/01/A/B's inactive_ratio is calculated by limit of 500M.
/opt/cgroup/01/A/C's inactive_ratio is calculated by limit of 50M.
/opt/cgroup/01/Z's inactive_ratio is calculated by limit of 700M.
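
Here is a minimal userspace sketch of that calculation (hypothetical
fake_*/isqrt names; it mirrors the patch's int_sqrt(10 * gb) formula and the
idea that the ratio is derived from the smallest limit found while walking up
the hierarchy):

#include <stdio.h>

/* crude integer square root, standing in for the kernel's int_sqrt() */
static unsigned int isqrt(unsigned long long x)
{
        unsigned long long r = 0;

        while ((r + 1) * (r + 1) <= x)
                r++;
        return (unsigned int)r;
}

static unsigned int inactive_ratio(unsigned long long limit_bytes)
{
        unsigned long long gb = limit_bytes >> 30;

        return gb ? isqrt(10 * gb) : 1;
}

struct fake_memcg {
        unsigned long long limit;       /* bytes; ~0ULL means "unlimited" */
        struct fake_memcg *parent;      /* NULL at the hierarchy root */
};

/* minimum limit on the path from memcg up to the hierarchy root */
static unsigned long long min_limit_upwards(struct fake_memcg *memcg)
{
        unsigned long long min = memcg->limit;

        for (memcg = memcg->parent; memcg; memcg = memcg->parent)
                if (memcg->limit < min)
                        min = memcg->limit;
        return min;
}

int main(void)
{
        struct fake_memcg root = { 1ULL << 30, NULL };      /* 01:     1G        */
        struct fake_memcg a    = { 500ULL << 20, &root };   /* 01/A:   500M      */
        struct fake_memcg b    = { ~0ULL, &a };             /* 01/A/B: unlimited */

        /* B inherits A's 500M limit, so its ratio comes out as 1 */
        printf("01/A/B inactive_ratio = %u\n",
               inactive_ratio(min_limit_upwards(&b)));
        return 0;
}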


Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>

mm/memcontrol.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 64 insertions(+), 7 deletions(-)

---
Index: mmotm-2.6.28-Dec08/mm/memcontrol.c
===================================================================
--- mmotm-2.6.28-Dec08.orig/mm/memcontrol.c
+++ mmotm-2.6.28-Dec08/mm/memcontrol.c
@@ -1382,20 +1382,73 @@ int mem_cgroup_shrink_usage(struct mm_st
* page_alloc.c::setup_per_zone_inactive_ratio().
* it describe more detail.
*/
-static void mem_cgroup_set_inactive_ratio(struct mem_cgroup *memcg)
+static int __mem_cgroup_inactive_ratio(unsigned long long gb)
{
- unsigned int gb, ratio;
+ unsigned int ratio;

- gb = res_counter_read_u64(&memcg->res, RES_LIMIT) >> 30;
+ gb = gb >> 30;
if (gb)
ratio = int_sqrt(10 * gb);
else
ratio = 1;

- memcg->inactive_ratio = ratio;
+ return ratio;
+}
+
+
+static void mem_cgroup_update_inactive_ratio(struct mem_cgroup *memcg)
+{
+ struct cgroup *cur;
+ struct mem_cgroup *root_memcg, *tmp;
+ unsigned long long min_limit, limit;
+ int depth, nextid, rootid, found, ratio;
+
+ if (!memcg->use_hierarchy) {
+ limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+ memcg->inactive_ratio = __mem_cgroup_inactive_ratio(limit);
+ return;
+ }

+ cur = memcg->css.cgroup;
+ min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+
+ /* go up to root cgroup and find min limit.*/
+ while (cur->parent != NULL) {
+ tmp = mem_cgroup_from_cont(cur);
+ if (!tmp->use_hierarchy)
+ break;
+ limit = res_counter_read_u64(&tmp->res, RES_LIMIT);
+ if (limit < min_limit)
+ min_limit = limit;
+ cur = cur->parent;
+ }
+ /* new inactive ratio for this hierarchy */
+ ratio = __mem_cgroup_inactive_ratio(min_limit);
+
+ /*
+ * update inactive ratio under this.
+ * all children's inactive_ratio will be updated.
+ */
+ cur = memcg->css.cgroup;
+ rootid = cgroup_id(cur);
+ depth = cgroup_depth(cur);
+ nextid = 0;
+ rcu_read_lock();
+ while (1) {
+ cur = cgroup_get_next(nextid, rootid, depth, &found);
+ if (!cur)
+ break;
+ if (!cgroup_is_removed(cur)) {
+ tmp = mem_cgroup_from_cont(cur);
+ tmp->inactive_ratio = ratio;
+ }
+ nextid = found + 1;
+ }
+ rcu_read_unlock();
}

+
+
static DEFINE_MUTEX(set_limit_mutex);

static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
@@ -1435,8 +1488,11 @@ static int mem_cgroup_resize_limit(struc
if (!progress) retry_count--;
}

- if (!ret)
- mem_cgroup_set_inactive_ratio(memcg);
+ if (!ret) {
+ mutex_lock(&set_limit_mutex);
+ mem_cgroup_update_inactive_ratio(memcg);
+ mutex_unlock(&set_limit_mutex);
+ }

return ret;
}
@@ -2081,11 +2137,12 @@ mem_cgroup_create(struct cgroup_subsys *
if (parent && parent->use_hierarchy) {
res_counter_init(&mem->res, &parent->res);
res_counter_init(&mem->memsw, &parent->memsw);
+ /* min_limit under hierarchy is unchanged.*/
+ mem->inactive_ratio = parent->inactive_ratio;
} else {
res_counter_init(&mem->res, NULL);
res_counter_init(&mem->memsw, NULL);
}
- mem_cgroup_set_inactive_ratio(mem);
mem->last_scanned_child = 0;
mem->scan_age = 0;
spin_lock_init(&mem->reclaim_param_lock);

2008-12-09 11:13:30

by Kamezawa Hiroyuki

Subject: [RFC][PATCH 6/6] fix oom under hierarchy

From: KAMEZAWA Hiroyuki <[email protected]>

Currently, memcg code other than memcontrol.c cannot handle hierarchy
in a correct way.
In the following,
/opt/cgroup/01 limit=1G
/opt/cgroup/01/A limit = unlimited
/opt/cgroup/01/B limit = unlimited

the OOM killer searches tasks under '01' and cannot find the bad process
in 01/A or 01/B.

After this patch:
- The OOM killer checks the hierarchy and can kill a bad process anywhere
under the hierarchy (see the sketch below).
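
A minimal userspace sketch of the membership test this patch introduces
(hypothetical fake_* names, using a simple parent-pointer walk instead of the
ID-based cgroup_is_ancestor()): without use_hierarchy a task belongs to a
memcg only if its cgroup is exactly that memcg's cgroup; with use_hierarchy
any descendant counts, so tasks in 01/A and 01/B are found when 01 hits its
limit.

#include <stdbool.h>
#include <stdio.h>

struct fake_cgroup {
        int id;
        struct fake_cgroup *parent;
};

/* true if "root" is an ancestor of (or equal to) "cg" */
static bool fake_is_ancestor(struct fake_cgroup *cg, struct fake_cgroup *root)
{
        for (; cg; cg = cg->parent)
                if (cg == root)
                        return true;
        return false;
}

static bool fake_task_in_memcg(struct fake_cgroup *task_cg,
                               struct fake_cgroup *memcg_cg, bool use_hierarchy)
{
        return use_hierarchy ? fake_is_ancestor(task_cg, memcg_cg)
                             : task_cg == memcg_cg;
}

int main(void)
{
        struct fake_cgroup top = { 1, NULL };   /* /opt/cgroup/01   */
        struct fake_cgroup a   = { 2, &top };   /* /opt/cgroup/01/A */

        /* a task running in 01/A is found when 01 hits its limit */
        printf("%d\n", fake_task_in_memcg(&a, &top, true));    /* 1 */
        printf("%d\n", fake_task_in_memcg(&a, &top, false));   /* 0 */
        return 0;
}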

Changelog:
- adjusted to the base kernel.

Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>

---
include/linux/memcontrol.h | 6 +++-
mm/memcontrol.c | 57 ++++++++++++++++++++++++++++++++++++++++++---
mm/oom_kill.c | 2 -
3 files changed, 59 insertions(+), 6 deletions(-)

Index: mmotm-2.6.28-Dec08/mm/memcontrol.c
===================================================================
--- mmotm-2.6.28-Dec08.orig/mm/memcontrol.c
+++ mmotm-2.6.28-Dec08/mm/memcontrol.c
@@ -403,12 +403,36 @@ void mem_cgroup_move_lists(struct page *
mem_cgroup_add_lru_list(page, to);
}

-int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
+static int mm_under_cgroup(struct mm_struct *mm, struct mem_cgroup *mcg)
+{
+ struct cgroup *cg, *check;
+ int ret = 0;
+
+ if (!mm)
+ return 0;
+
+ rcu_read_lock();
+
+ cg = task_cgroup(rcu_dereference(mm->owner), mem_cgroup_subsys_id);
+ check = mcg->css.cgroup;
+
+ if (!mcg->use_hierarchy)
+ ret = (cg == check);
+ else
+ ret = cgroup_is_ancestor(cg, check);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+
+int task_in_mem_cgroup(struct task_struct *task,
+ struct mem_cgroup *mem)
{
int ret;

task_lock(task);
- ret = task->mm && mm_match_cgroup(task->mm, mem);
+ ret = mm_under_cgroup(task->mm, mem);
task_unlock(task);
return ret;
}
@@ -662,6 +686,33 @@ static int mem_cgroup_hierarchical_recla
return ret;
}

+void update_oom_jiffies(struct mem_cgroup *mem)
+{
+ struct cgroup *cgroup, *rootcg;
+ struct mem_cgroup *tmp;
+ int rootid, nextid, depth, found;
+
+ if (!mem->use_hierarchy) {
+ mem->last_oom_jiffies = jiffies;
+ return;
+ }
+ rootcg = mem->css.cgroup;
+ rootid = cgroup_id(rootcg);
+ depth = cgroup_depth(rootcg);
+ nextid = 0;
+ rcu_read_lock();
+ while (1) {
+ cgroup = cgroup_get_next(nextid, rootid, depth, &found);
+
+ if (!cgroup)
+ break;
+ tmp = mem_cgroup_from_cont(cgroup);
+ tmp->last_oom_jiffies = jiffies;
+ nextid = found + 1;
+ }
+ rcu_read_unlock();
+}
+
bool mem_cgroup_oom_called(struct task_struct *task)
{
bool ret = false;
@@ -764,7 +815,7 @@ static int __mem_cgroup_try_charge(struc
mutex_lock(&memcg_tasklist);
mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
mutex_unlock(&memcg_tasklist);
- mem_over_limit->last_oom_jiffies = jiffies;
+ update_oom_jiffies(mem_over_limit);
}
goto nomem;
}
Index: mmotm-2.6.28-Dec08/mm/oom_kill.c
===================================================================
--- mmotm-2.6.28-Dec08.orig/mm/oom_kill.c
+++ mmotm-2.6.28-Dec08/mm/oom_kill.c
@@ -279,7 +279,7 @@ static struct task_struct *select_bad_pr
*
* Call with tasklist_lock read-locked.
*/
-static void dump_tasks(const struct mem_cgroup *mem)
+static void dump_tasks(struct mem_cgroup *mem)
{
struct task_struct *g, *p;

Index: mmotm-2.6.28-Dec08/include/linux/memcontrol.h
===================================================================
--- mmotm-2.6.28-Dec08.orig/include/linux/memcontrol.h
+++ mmotm-2.6.28-Dec08/include/linux/memcontrol.h
@@ -65,7 +65,9 @@ extern unsigned long mem_cgroup_isolate_
struct mem_cgroup *mem_cont,
int active, int file);
extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask);
-int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
+
+int task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *mem);
+

extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);

@@ -191,7 +193,7 @@ static inline int mm_match_cgroup(struct
}

static inline int task_in_mem_cgroup(struct task_struct *task,
- const struct mem_cgroup *mem)
+ struct mem_cgroup *mem)
{
return 1;
}

2008-12-09 12:32:49

by Balbir Singh

Subject: Re: [RFC][PATCH 4/6] Flat hierarchical reclaim by ID

* KAMEZAWA Hiroyuki <[email protected]> [2008-12-09 20:09:15]:

>
> From: KAMEZAWA Hiroyuki <[email protected]>
>
> Implement hierarchy reclaim by cgroup_id.
>
> What changes:
> - Page reclaim is not done by tree-walk algorithm
> - mem_cgroup->last_scanned_child is changed to be an ID, not a pointer.
> - no cgroup_lock, done under RCU.
> - scanning order is just defined by ID's order.
> (Scan by round-robin logic.)
>
> Changelog: v3 -> v4
> - adjusted to changes in base kernel.
> - is_ancestor() is moved to another patch.
>
> Changelog: v2 -> v3
> - fixed use_hierarchy==0 case
>
> Changelog: v1 -> v2
> - make use of css_tryget();
> - count # of loops rather than remembering position.
>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>

I have not yet run the patch, but the heuristics seem a lot like
magic. I am not against scanning by order, but is order the right way
to scan groups? Does this order reflect their position in the
hierarchy? Shouldn't IDs belong to cgroups instead of just the memory
controller? I would push the IDs back to the cgroups infrastructure.

--
Balbir

2008-12-09 14:30:26

by Kamezawa Hiroyuki

Subject: Re: [RFC][PATCH 4/6] Flat hierarchical reclaim by ID

Balbir Singh said:
> * KAMEZAWA Hiroyuki <[email protected]> [2008-12-09
> 20:09:15]:
>
>>
>> From: KAMEZAWA Hiroyuki <[email protected]>
>>
>> Implement hierarchy reclaim by cgroup_id.
>>
>> What changes:
>> - Page reclaim is not done by tree-walk algorithm
>> - mem_cgroup->last_scanned_child is changed to be an ID, not a pointer.
>> - no cgroup_lock, done under RCU.
>> - scanning order is just defined by ID's order.
>> (Scan by round-robin logic.)
>>
>> Changelog: v3 -> v4
>> - adjusted to changes in base kernel.
>> - is_ancestor() is moved to another patch.
>>
>> Changelog: v2 -> v3
>> - fixed use_hierarchy==0 case
>>
>> Changelog: v1 -> v2
>> - make use of css_tryget();
>> - count # of loops rather than remembering position.
>>
>> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
>
> I have not yet run the patch, but the heuristics seem a lot like
> magic. I am not against scanning by order, but is order the right way
> to scan groups?
My consideration is:
- Both your current implementation and this round-robin are just
examples. I don't think a search algorithm determined by the
shape of the tree is the best way.
- No one knows what order is best, yet. We have to find it.
- The best order will be determined by some kind of calculation rather
than the shape of the tree, and must pass tons of tests.
This needs a large amount of time and patient work. VM management is not
an easy thing.
I think your soft-limit idea can be easily merged onto this patch set.

> Does this order reflect their position in the hierarchy?
No, it just scans IDs from the last scanned one in round-robin.
BTW, can you show an algorithm that works well in the following case?
ex)
groupA/ limit=1G usage=300M
01/ limit=600M usage=600M
02/ limit=700M usage=70M
03/ limit=100M usage=30M
Which one should be shrunk first, and why?
1) when group_A hits its limit.
2) when group_A/01 hits its limit.
3) when group_A/02 hits its limit.
I can't, right now.

This patch itself uses round-robin and has no special order.
I think implementing a good algorithm under this needs some amount of time.

> Shouldn't IDs belong to cgroups instead of just the memory controller?
If Paul rejects it, I'll move this to memcg. But the bio-cgroup people also
use IDs and, this summer, I posted a swap-cgroup-ID patch and was asked to
implement IDs under cgroup rather than the subsys. (Asked by Paul or you.)

From the implementation point of view, hierarchy code management etc. should
go into cgroup.c; it gives us a clearer view than implementing it under memcg.

-Kame
> I would push back ids to cgroups infrastructure.
>

2008-12-09 15:47:44

by Balbir Singh

[permalink] [raw]
Subject: Re: [RFC][PATCH 4/6] Flat hierarchical reclaim by ID

* KAMEZAWA Hiroyuki <[email protected]> [2008-12-09 23:28:32]:

> Balbir Singh said:
> > * KAMEZAWA Hiroyuki <[email protected]> [2008-12-09
> > 20:09:15]:
> >
> >>
> >> From: KAMEZAWA Hiroyuki <[email protected]>
> >>
> >> Implement hierarchy reclaim by cgroup_id.
> >>
> >> What changes:
> >> - Page reclaim is not done by tree-walk algorithm
> >> - mem_cgroup->last_schan_child is changed to be ID, not pointer.
> >> - no cgroup_lock, done under RCU.
> >> - scanning order is just defined by ID's order.
> >> (Scan by round-robin logic.)
> >>
> >> Changelog: v3 -> v4
> >> - adjusted to changes in base kernel.
> >> - is_acnestor() is moved to other patch.
> >>
> >> Changelog: v2 -> v3
> >> - fixed use_hierarchy==0 case
> >>
> >> Changelog: v1 -> v2
> >> - make use of css_tryget();
> >> - count # of loops rather than remembering position.
> >>
> >> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> >
> > I have not yet run the patch, but the heuristics seem a lot like
> > magic. I am not against scanning by order, but is order the right way
> > to scan groups?
> My consideration is
> - Both of current your implementation and this round robin is just
> an example. I never think some kind of search algorighm detemined by
> shape of tree is the best way.
> - No one knows what order is the best, now. We have to find it.
> - The best order will be determined by some kind of calculation rather
> than shape of tree and must pass by tons of tests.

Yes, the shape of the tree just limits where to reclaim from.

> This needs much amount of time and patient work. VM management is not
> so easy thing.
> I think your soft-limit idea can be easily merged onto this patch set.
>

Yes, potentially. With soft limits, the general expectation is this:

Let us say you have group A and B

groupA, soft limit = 1G
groupB, soft limit = 2G

Now assume the system has 4G. When groupB is not using its memory,
group A can grab all 4G, but when groupB kicks in and tries to use 2G
or more, then the expectation is that

group A will get 1/3 * 4 = 4/3G
group B will get 2/3 * 4 = 8/3G

Similar to CPU shares currently.
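
As a quick back-of-the-envelope check, the split above can be computed like
this (a user-space sketch only, nothing from the patches; the numbers just
encode the groupA/groupB example):

#include <stdio.h>

int main(void)
{
        /* soft limits from the example above, in MB */
        unsigned long soft_limit[2] = { 1024, 2048 };   /* groupA, groupB */
        unsigned long total = 4096;                     /* 4G under pressure */
        unsigned long sum = soft_limit[0] + soft_limit[1];
        int i;

        for (i = 0; i < 2; i++)
                printf("group%c expected share: %lu MB\n",
                       'A' + i, total * soft_limit[i] / sum);
        /* prints ~1365 MB (4/3G) for groupA and ~2730 MB (8/3G) for groupB */
        return 0;
}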

> > Does this order reflect their position in the hierarchy?
> No. just scan IDs from last scannned one in RR.
> BTW, can you show what an algorithm works well in following case ?
> ex)
> groupA/ limit=1G usage=300M
> 01/ limit=600M usage=600M
> 02/ limit=700M usage=70M
> 03/ limit=100M usage=30M
> Which one should be shrinked at first and why ?
> 1) when group_A hit limits.

With tree reclaim, reclaim will first reclaim from A and stop if
successful, otherwise it will go to 01, 02 and 03 and then go back to
A.

> 2) when group_A/01 hit limits.

This will reclaim only from 01, since A is under its limit

> 3) when group_A/02 hit limits.

This will reclaim only from 02 since A is under its limit

Does RR do the same right now?

> I can't now.
>
> This patch itself uses round-robin and have no special order.
> I think implenting good algorithm under this needs some amount of time.
>

I agree that fine-tuning it will require time, but what we need is
something usable that will not have hard-to-debug or hard-to-understand
corner cases.

> > Shouldn't id's belong to cgroups instead of just memory controller?
> If Paul rejects, I'll move this to memcg. But bio-cgroup people also use
> ID and, in this summer, I posted swap-cgroup-ID patch and asked to
> implement IDs under cgroup rather than subsys. (asked by Paul or you.)
>

We should talk to Paul and convince him.

> >From implementation, hierarchy code management at el. should go into
> cgroup.c and it gives us clear view rather than implemented under memcg.
>

cgroup has hierarchy management already, in the form of children and
sibling. Walking those structures is up to us, that is all we do
currently :)
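
For reference, the walk meant here is roughly the following (a sketch only,
not code from these patches, and it still assumes the cgroup_mutex rules this
thread is debating):

#include <linux/cgroup.h>
#include <linux/list.h>

/* Visit the direct children of one cgroup via the children/sibling lists. */
static void visit_children(struct cgroup *parent)
{
        struct cgroup *child;

        /* the children list is only stable while cgroup_mutex is held */
        list_for_each_entry(child, &parent->children, sibling) {
                /* per-child work (e.g. picking a reclaim victim) goes here */
        }
}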

> -Kame
> > I would push back ids to cgroups infrastructure.
> >
>
>
>

--
Balbir

2008-12-09 16:34:26

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [RFC][PATCH 4/6] Flat hierarchical reclaim by ID

Balbir Singh said:

>> I think your soft-limit idea can be easily merged onto this patch
>> set.
>>
>
> Yes, potentially. With soft limit, the general expectation is this
>
> Let us say you have group A and B
>
> groupA, soft limit = 1G
> groupB, soft limit = 2G
>
> Now assume the system has 4G. When groupB is not using its memory,
> group A can grab all 4G, but when groupB kicks in and tries to use 2G
> or more, then the expectation is that
>
> group A will get 1/3 * 4 = 4/3G
> group B will get 2/3 * 4 = 8/3G
>
> Similar to CPU shares currently.
>
I like that idea because it's easy to understand.

>> > Does this order reflect their position in the hierarchy?
>> No. just scan IDs from last scannned one in RR.
>> BTW, can you show what an algorithm works well in following case ?
>> ex)
>> groupA/ limit=1G usage=300M
>> 01/ limit=600M usage=600M
>> 02/ limit=700M usage=70M
>> 03/ limit=100M usage=30M
>> Which one should be shrinked at first and why ?
>> 1) when group_A hit limits.
>
> With tree reclaim, reclaim will first reclaim from A and stop if
> successful, otherwise it will go to 01, 02 and 03 and then go back to
> A.
>
Sorry for my poor example

>> 2) when group_A/01 hit limits.
>
> This will reclaim only from 01, since A is under its limit
>
I should have asked:
2') when a task in group_A/01 hits the limit of group_A

ex)
group_A/  limit=1G        usage~0
  /01     limit=unlimited usage=800M
  /02     limit=unlimited usage=200M
(what limits are allowed for children is another problem to be fixed...)
When a task in 01 hits the limit of group_A, or
when a task in 02 hits the limit of group_A,
where should we start from? (it is unknown)
Currently, this patch uses RR (A->01->02->A->...),
and soft limits or some other good algorithm will give us a better answer.

>> 3) when group_A/02 hit limits.
>
> This will reclaim only from 02 since A is under its limit
>
> Does RR do the same right now?
>
I think so.

Assume
group_A/
  /01
  /02
RR does:
1) when a task under A/01/02 hits the limit at A, shrink A, 01 and 02.
2) when a task under 01 hits the limit at 01, shrink only 01.
3) when a task under 02 hits the limit at 02, shrink only 02.

In case 1), the starting point of shrinking is saved as last_scanned_child.
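
Schematically, the RR above looks something like the sketch below. This is
only an illustration: mem_cgroup_lookup_by_id(), mem_cgroup_is_under(),
memcg_shrink_some(), MAX_CGROUP_ID and MEMCG_SCAN_LIMIT are placeholder names,
not the actual code of patch 3/6 or 4/6. css_tryget() is used as in the v2
changelog, and only the last scanned ID is remembered.

#define MEMCG_SCAN_LIMIT        32      /* arbitrary bound for this example */

static int reclaim_by_id_rr(struct mem_cgroup *root, gfp_t gfp_mask)
{
        int id = root->last_scanned_child;      /* an ID, not a pointer */
        int loop, progress = 0;

        for (loop = 0; loop < MEMCG_SCAN_LIMIT; loop++) {
                struct mem_cgroup *victim;

                id = id % MAX_CGROUP_ID + 1;    /* round-robin over IDs */

                rcu_read_lock();
                victim = mem_cgroup_lookup_by_id(id);
                /* skip unused IDs, groups outside root's tree, dying groups */
                if (!victim || !mem_cgroup_is_under(victim, root) ||
                    !css_tryget(&victim->css)) {
                        rcu_read_unlock();
                        continue;
                }
                rcu_read_unlock();

                progress = memcg_shrink_some(victim, gfp_mask);
                css_put(&victim->css);

                root->last_scanned_child = id;  /* resume point next time */
                if (progress)
                        break;
        }
        return progress;
}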


>> I can't now.
>>
>> This patch itself uses round-robin and have no special order.
>> I think implenting good algorithm under this needs some amount of
>> time.
>>
>
> I agree that fine tuning it will require time, but what we need is
> something usable that will not have hard to debug or understand corner
> cases.

Yes, and we have that now. My point is that "cgroup_lock()" has caused many
problems and, I am convinced, will cause new ones in the future.

And please see 5/6 and 6/6: we need hierarchy consideration in other
places. I think there is more code which should take care of the hierarchy.


> > Shouldn't id's belong to cgroups instead of just memory controller?
>> If Paul rejects, I'll move this to memcg. But bio-cgroup people also use
>> ID and, in this summer, I posted swap-cgroup-ID patch and asked to
>> implement IDs under cgroup rather than subsys. (asked by Paul or you.)
>>
>
> We should talk to Paul and convince him.
>
yes.

>> >From implementation, hierarchy code management at el. should go into
>> cgroup.c and it gives us clear view rather than implemented under memcg.
>>
>
> cgroup has hierarchy management already, in the form of children and
> sibling. Walking those structures is up to us, that is all we do
> currently :)
>
Yes, but that needs cgroup_lock(), and you have to hold a refcount on a
pointer just for remembering it.

This patch doesn't change anything other than removing cgroup_lock() and
removing the refcount kept to remember the start point.

Thanks,
-Kame

2008-12-10 00:28:41

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: Documentation for internal implementation

Paul, Balbir

I have a question.

Why is cgroup's documentation directory divided into 2 places?

Documentation/cgroups
             /controllers

If there is no strong demand, I'd like to remove the "controllers" directory
and move its contents under "cgroups". Some people complain to me that finding
the documentation for memcg is not easy.

On Tue, 9 Dec 2008 20:04:13 +0900
KAMEZAWA Hiroyuki <[email protected]> wrote:

>
> From: KAMEZAWA Hiroyuki <[email protected]>
>
> Update/Fix document about implementation details of memcg.
>
> Changelog:
> - applied Randy Dunlap's comments.
>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> ---
> Documentation/controllers/memcg_test.txt | 240 ++++++++++++++++++++++++++-----
> mm/memcontrol.c | 4
> 2 files changed, 207 insertions(+), 37 deletions(-)
>
> Index: mmotm-2.6.28-Dec03/Documentation/controllers/memcg_test.txt
> ===================================================================
> --- mmotm-2.6.28-Dec03.orig/Documentation/controllers/memcg_test.txt
> +++ mmotm-2.6.28-Dec03/Documentation/controllers/memcg_test.txt
> @@ -1,59 +1,76 @@
> Memory Resource Controller(Memcg) Implementation Memo.
> -Last Updated: 2009/12/03
> +Last Updated: 2008/12/04
> +Base Kernel Version: based on 2.6.28-rc7-mm.
>
> Because VM is getting complex (one of reasons is memcg...), memcg's behavior
> -is complex. This is a document for memcg's internal behavior and some test
> -patterns tend to be racy.
> +is complex. This is a document for memcg's internal behavior.
> +Please note that implementation details can be changed.
>
> -1. charges
> +(*) Topics on API should be in Documentation/controllers/memory.txt)
> +
> +0. How to record usage ?
> + 2 objects are used.
> +
> + page_cgroup ....an object per page.
> + Allocated at boot or memory hotplug. Freed at memory hot removal.
> +
> + swap_cgroup ... an entry per swp_entry.
> + Allocated at swapon(). Freed at swapoff().
> +
> + The page_cgroup has USED bit and double count against a page_cgroup never
> + occurs. swap_cgroup is used only when a charged page is swapped-out.
> +
> +1. Charge
>
> a page/swp_entry may be charged (usage += PAGE_SIZE) at
>
> - mem_cgroup_newpage_newpage()
> - called at new page fault and COW.
> + mem_cgroup_newpage_charge()
> + Called at new page fault and Copy-On-Write.
>
> mem_cgroup_try_charge_swapin()
> - called at do_swap_page() and swapoff.
> - followed by charge-commit-cancel protocol.
> - (With swap accounting) at commit, charges recorded in swap is removed.
> + Called at do_swap_page() (page fault on swap entry) and swapoff.
> + Followed by charge-commit-cancel protocol. (With swap accounting)
> + At commit, a charge recorded in swap_cgroup is removed.
>
> mem_cgroup_cache_charge()
> - called at add_to_page_cache()
> + Called at add_to_page_cache()
>
> - mem_cgroup_cache_charge_swapin)()
> - called by shmem's swapin processing.
> + mem_cgroup_cache_charge_swapin()
> + Called at shmem's swapin.
>
> mem_cgroup_prepare_migration()
> - called before migration. "extra" charge is done
> - followed by charge-commit-cancel protocol.
> + Called before migration. "extra" charge is done and followed by
> + charge-commit-cancel protocol.
> At commit, charge against oldpage or newpage will be committed.
>
> -2. uncharge
> +2. Uncharge
> a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by
>
> mem_cgroup_uncharge_page()
> - called when an anonymous page is unmapped. If the page is SwapCache
> - uncharge is delayed until mem_cgroup_uncharge_swapcache().
> + Called when an anonymous page is fully unmapped. I.e., mapcount goes
> + to 0. If the page is SwapCache, uncharge is delayed until
> + mem_cgroup_uncharge_swapcache().
>
> mem_cgroup_uncharge_cache_page()
> - called when a page-cache is deleted from radix-tree. If the page is
> - SwapCache, uncharge is delayed until mem_cgroup_uncharge_swapcache()
> + Called when a page-cache is deleted from radix-tree. If the page is
> + SwapCache, uncharge is delayed until mem_cgroup_uncharge_swapcache().
>
> mem_cgroup_uncharge_swapcache()
> - called when SwapCache is removed from radix-tree. the charge itself
> + Called when SwapCache is removed from radix-tree. The charge itself
> is moved to swap_cgroup. (If mem+swap controller is disabled, no
> - charge to swap.)
> + charge to swap occurs.)
>
> mem_cgroup_uncharge_swap()
> - called when swp_entry's refcnt goes down to be 0. charge against swap
> + Called when swp_entry's refcnt goes down to 0. A charge against swap
> disappears.
>
> mem_cgroup_end_migration(old, new)
> - at success of migration -> old is uncharged (if necessary), charge
> - to new is committed. at failure, charge to old is committed.
> + At success of migration old is uncharged (if necessary), a charge
> + to new page is committed. At failure, charge to old page is committed.
>
> 3. charge-commit-cancel
> - In some case, we can't know this "charge" is valid or not at charge.
> + In some case, we can't know this "charge" is valid or not at charging
> + (because of races).
> To handle such case, there are charge-commit-cancel functions.
> mem_cgroup_try_charge_XXX
> mem_cgroup_commit_charge_XXX
> @@ -68,24 +85,164 @@ patterns tend to be racy.
>
> At cancel(), simply usage -= PAGE_SIZE.
>
> -4. Typical Tests.
> +Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
> +
> +4. Anonymous
> + Anonymous page is newly allocated at
> + - page fault into MAP_ANONYMOUS mapping.
> + - Copy-On-Write.
> + It is charged right after it's allocated before doing any page table
> + related operations. Of course, it's uncharged when another page is used
> + for the fault address.
> +
> + At freeing anonymous page (by exit() or munmap()), zap_pte() is called
> + and pages for ptes are freed one by one.(see mm/memory.c). Uncharges
> + are done at page_remove_rmap() when page_mapcount() goes down to 0.
> +
> + Another page freeing is by page-reclaim (vmscan.c) and anonymous
> + pages are swapped out. In this case, the page is marked as
> + PageSwapCache(). uncharge() routine doesn't uncharge the page marked
> + as SwapCache(). It's delayed until __delete_from_swap_cache().
> +
> + 4.1 Swap-in.
> + At swap-in, the page is taken from swap-cache. There are 2 cases.
> +
> + (a) If the SwapCache is newly allocated and read, it has no charges.
> + (b) If the SwapCache has been mapped by processes, it has been
> + charged already.
> +
> + In case (a), we charge it. In case (b), we don't charge it.
> + (But racy state between (a) and (b) exists. We do check it.)
> + At charging, a charge recorded in swap_cgroup is moved to page_cgroup.
> +
> + 4.2 Swap-out.
> + At swap-out, typical state transition is below.
> +
> + (a) add to swap cache. (marked as SwapCache)
> + swp_entry's refcnt += 1.
> + (b) fully unmapped.
> + swp_entry's refcnt += # of ptes.
> + (c) write back to swap.
> + (d) delete from swap cache. (remove from SwapCache)
> + swp_entry's refcnt -= 1.
> +
> +
> + At (b), the page is marked as SwapCache and not uncharged.
> + At (d), the page is removed from SwapCache and a charge in page_cgroup
> + is moved to swap_cgroup.
> +
> + Finally, at task exit,
> + (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0.
> + Here, a charge in swap_cgroup disappears.
> +
> +5. Page Cache
> + Page Cache is charged at
> + - add_to_page_cache_locked().
> +
> + uncharged at
> + - __remove_from_page_cache().
> +
> + The logic is very clear. (About migration, see below)
> + Note: __remove_from_page_cache() is called by remove_from_page_cache()
> + and __remove_mapping().
> +
> +6. Shmem(tmpfs) Page Cache
> + Memcg's charge/uncharge have special handlers of shmem. The best way
> + to understand shmem's page state transition is to read mm/shmem.c.
> + But brief explanation of the behavior of memcg around shmem will be
> + helpful to understand the logic.
> +
> + Shmem's page (just leaf page, not direct/indirect block) can be on
> + - radix-tree of shmem's inode.
> + - SwapCache.
> + - Both on radix-tree and SwapCache. This happens at swap-in
> + and swap-out,
> +
> + It's charged when...
> + - A new page is added to shmem's radix-tree.
> + - A swp page is read. (move a charge from swap_cgroup to page_cgroup)
> + It's uncharged when
> + - A page is removed from radix-tree and not SwapCache.
> + - When SwapCache is removed, a charge is moved to swap_cgroup.
> + - When swp_entry's refcnt goes down to 0, a charge in swap_cgroup
> + disappears.
>
> - Tests for racy cases.
> +7. Page Migration
> + One of the most complicated functions is page-migration-handler.
> + Memcg has 2 routines. Assume that we are migrating a page's contents
> + from OLDPAGE to NEWPAGE.
> +
> + Usual migration logic is..
> + (a) remove the page from LRU.
> + (b) allocate NEWPAGE (migration target)
> + (c) lock by lock_page().
> + (d) unmap all mappings.
> + (e-1) If necessary, replace entry in radix-tree.
> + (e-2) move contents of a page.
> + (f) map all mappings again.
> + (g) pushback the page to LRU.
> + (-) OLDPAGE will be freed.
> +
> + Before (g), memcg should complete all necessary charge/uncharge to
> + NEWPAGE/OLDPAGE.
> +
> + The point is....
> + - If OLDPAGE is anonymous, all charges will be dropped at (d) because
> + try_to_unmap() drops all mapcount and the page will not be
> + SwapCache.
> +
> + - If OLDPAGE is SwapCache, charges will be kept at (g) because
> + __delete_from_swap_cache() isn't called at (e-1)
> +
> + - If OLDPAGE is page-cache, charges will be kept at (g) because
> + remove_from_swap_cache() isn't called at (e-1)
> +
> + memcg provides following hooks.
> +
> + - mem_cgroup_prepare_migration(OLDPAGE)
> + Called after (b) to account a charge (usage += PAGE_SIZE) against
> + memcg which OLDPAGE belongs to.
> +
> + - mem_cgroup_end_migration(OLDPAGE, NEWPAGE)
> + Called after (f) before (g).
> + If OLDPAGE is used, commit OLDPAGE again. If OLDPAGE is already
> + charged, a charge by prepare_migration() is automatically canceled.
> + If NEWPAGE is used, commit NEWPAGE and uncharge OLDPAGE.
> +
> + But zap_pte() (by exit or munmap) can be called while migration,
> + we have to check if OLDPAGE/NEWPAGE is a valid page after commit().
> +
> +8. LRU
> + Each memcg has its own private LRU. Now, it's handling is under global
> + VM's control (means that it's handled under global zone->lru_lock).
> + Almost all routines around memcg's LRU is called by global LRU's
> + list management functions under zone->lru_lock().
> +
> + A special function is mem_cgroup_isolate_pages(). This scans
> + memcg's private LRU and call __isolate_lru_page() to extract a page
> + from LRU.
> + (By __isolate_lru_page(), the page is removed from both of global and
> + private LRU.)
>
> - 4.1 small limit to memcg.
> +
> +9. Typical Tests.
> +
> + Tests for racy cases.
> +
> + 9.1 Small limit to memcg.
> When you do test to do racy case, it's good test to set memcg's limit
> to be very small rather than GB. Many races found in the test under
> xKB or xxMB limits.
> (Memory behavior under GB and Memory behavior under MB shows very
> different situation.)
>
> - 4.2 shmem
> + 9.2 Shmem
> Historically, memcg's shmem handling was poor and we saw some amount
> of troubles here. This is because shmem is page-cache but can be
> SwapCache. Test with shmem/tmpfs is always good test.
>
> - 4.3 migration
> - For NUMA, migration is an another special. To do easy test, cpuset
> + 9.3 Migration
> + For NUMA, migration is an another special case. To do easy test, cpuset
> is useful. Following is a sample script to do migration.
>
> mount -t cgroup -o cpuset none /opt/cpuset
> @@ -118,20 +275,20 @@ patterns tend to be racy.
> G2_TASK=`cat ${G2}/tasks`
> move_task "${G1_TASK}" ${G2} &
> --
> - 4.4 memory hotplug.
> + 9.4 Memory hotplug.
> memory hotplug test is one of good test.
> to offline memory, do following.
> # echo offline > /sys/devices/system/memory/memoryXXX/state
> (XXX is the place of memory)
> This is an easy way to test page migration, too.
>
> - 4.5 mkdir/rmdir
> + 9.5 mkdir/rmdir
> When using hierarchy, mkdir/rmdir test should be done.
> - tests like following.
> + Use tests like the following.
>
> - #echo 1 >/opt/cgroup/01/memory/use_hierarchy
> - #mkdir /opt/cgroup/01/child_a
> - #mkdir /opt/cgroup/01/child_b
> + echo 1 >/opt/cgroup/01/memory/use_hierarchy
> + mkdir /opt/cgroup/01/child_a
> + mkdir /opt/cgroup/01/child_b
>
> set limit to 01.
> add limit to 01/child_b
> @@ -143,3 +300,12 @@ patterns tend to be racy.
> /opt/cgroup/01/child_c
>
> running new jobs in new group is also good.
> +
> + 9.6 Mount with other subsystems.
> + Mounting with other subsystems is a good test because there is a
> + race and lock dependency with other cgroup subsystems.
> +
> + example)
> + # mount -t cgroup none /cgroup -t cpuset,memory,cpu,devices
> +
> + and do task move, mkdir, rmdir etc...under this.
> Index: mmotm-2.6.28-Dec03/mm/memcontrol.c
> ===================================================================
> --- mmotm-2.6.28-Dec03.orig/mm/memcontrol.c
> +++ mmotm-2.6.28-Dec03/mm/memcontrol.c
> @@ -6,6 +6,10 @@
> * Copyright 2007 OpenVZ SWsoft Inc
> * Author: Pavel Emelianov <[email protected]>
> *
> + * Documentation is available at
> + * Documentation/controllers/memory.txt
> + * Documentation/controllers/memcg_test.txt
> + *
> * This program is free software; you can redistribute it and/or modify
> * it under the terms of the GNU General Public License as published by
> * the Free Software Foundation; either version 2 of the License, or
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to [email protected]. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: [email protected]
>

2008-12-10 01:04:16

by Li Zefan

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: Documentation for internal implementation

KAMEZAWA Hiroyuki wrote:
> Paul, Balbir
>
> I have a question.
>
> Why cgroup's documentation directroy is divided into 2 places ?
>
> Documentation/cgroups
> /controllers
>

Documentation/cgroups was created by Matt Helsley, when he added freezer-subsystem.txt,
and he also moved cgroups.txt to the new Documentation/cgroups.

> If no strong demands, I'd like to remove "controllers" directroy and move

I prepared a patch to do so long ago, but didn't ever send it out.

> contents under "cgroups". Some people complains me that finding document
> for memcg is not easy.
>

2008-12-10 01:08:22

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: Documentation for internal implementation

On Wed, 10 Dec 2008 09:02:19 +0800
Li Zefan <[email protected]> wrote:

> KAMEZAWA Hiroyuki wrote:
> > Paul, Balbir
> >
> > I have a question.
> >
> > Why cgroup's documentation directroy is divided into 2 places ?
> >
> > Documentation/cgroups
> > /controllers
> >
>
> Documentation/cgroups was created by Matt Helsley, when he added freezer-subsystem.txt,
> and he also moved cgroups.txt to the new Documentation/cgroups.
>
> > If no strong demands, I'd like to remove "controllers" directroy and move
>
> I prepared a patch to do so long ago, but didn't ever send it out.
>
Thank you for the clarification. Then it seems there is no special meaning.
I think people tend to recognize the code as "cgroups" rather than "controllers".
I'd like to prepare a patch after sending this update out.
If you send yours, I'll adjust my updates accordingly.

Thanks,
-Kame

2008-12-10 02:09:19

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: fix pre_destory handler

On Tue, 9 Dec 2008 20:06:47 +0900
KAMEZAWA Hiroyuki <[email protected]> wrote:

> @@ -98,6 +98,8 @@ enum {
> CGRP_RELEASABLE,
> /* Control Group requires release notifications to userspace */
> CGRP_NOTIFY_ON_RELEASE,
> + /* Control Group is preparing for death */
> + CGRP_PRE_REMOVAL,
> };
I'll rename CGRP_PRE_REMOVAL to CGRP_TRY_REMOVE.

Thanks,
-Kame

2008-12-10 02:21:26

by Li Zefan

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: fix pre_destory handler

> +static bool memcg_is_obsolete(struct mem_cgroup *mem)
> +{

Will this function be called with mem->css.refcnt == 0? If yes, then
this function is racy.

  memcg_is_obsolete()                      rmdir()/dput() path

  cg = mem->css.cgroup
                                           cgroup_diput()
                                             mem_cgroup_destroy()
                                               mem->css.cgroup = NULL;
                                             kfree(cg);
  if (!cg || cgroup_is_removed(cg)...)

  (accessing invalid cg)

> + struct cgroup *cg = mem->css.cgroup;
> + /*
> + * "Being Removed" means pre_destroy() handler is called.
> + * After "pre_destroy" handler is called, memcg should not
> + * have any additional charges.
> + * This means there are small races for mis-accounting. But this
> + * mis-accounting should happen only under swap-in opration.
> + * (Attachin new task will fail if cgroup is under rmdir()).
> + */
> +
> + if (!cg || cgroup_is_removed(cg) || cgroup_is_being_removed(cg))
> + return true;
> + return false;
> +}
> +

...

> static void mem_cgroup_destroy(struct cgroup_subsys *ss,
> struct cgroup *cont)
> {
> - mem_cgroup_free(mem_cgroup_from_cont(cont));
> + struct mem_cgroup *mem = mem_cgroup_from_cont(cont):
> + mem_cgroup_free(mem);
> + /* forget */
> + mem->css.cgroup = NULL;

mem might already be destroyed by mem_cgroup_free(mem).

> }

2008-12-10 02:24:53

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: fix pre_destory handler

On Wed, 10 Dec 2008 10:19:35 +0800
Li Zefan <[email protected]> wrote:

> > +static bool memcg_is_obsolete(struct mem_cgroup *mem)
> > +{
>
> Will this function be called with mem->css.refcnt == 0? If yes, then
> this function is racy.
>
> cg = mem->css.cgroup
> cgroup_diput()
> mem_cgroup_destroy()
> mem->css.cgroup = NULL;
> kfree(cg);
> if (!cg || cgroup_is_removed(cg)...)
>
> (accessing invalid cg)
>
Hmm. Then we have to add a flag to the css itself, anyway.


> > + struct cgroup *cg = mem->css.cgroup;
> > + /*
> > + * "Being Removed" means pre_destroy() handler is called.
> > + * After "pre_destroy" handler is called, memcg should not
> > + * have any additional charges.
> > + * This means there are small races for mis-accounting. But this
> > + * mis-accounting should happen only under swap-in opration.
> > + * (Attachin new task will fail if cgroup is under rmdir()).
> > + */
> > +
> > + if (!cg || cgroup_is_removed(cg) || cgroup_is_being_removed(cg))
> > + return true;
> > + return false;
> > +}
> > +
>
> ...
>
> > static void mem_cgroup_destroy(struct cgroup_subsys *ss,
> > struct cgroup *cont)
> > {
> > - mem_cgroup_free(mem_cgroup_from_cont(cont));
> > + struct mem_cgroup *mem = mem_cgroup_from_cont(cont):
> > + mem_cgroup_free(mem);
> > + /* forget */
> > + mem->css.cgroup = NULL;
>
> mem might already be destroyed by mem_cgroup_free(mem).
>
Ah, maybe. will fix.

Thanks,
-Kame

2008-12-10 02:48:52

by Daisuke Nishimura

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: fix pre_destory handler

On Tue, 9 Dec 2008 20:06:47 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
> better name for new flag is welcome.
>
> ==
> Because pre_destroy() handler is moved out to cgroup_lock() for
> avoiding dead-lock, now, cgroup's rmdir() does following sequence.
>
> cgroup_lock()
> check children and tasks.
> (A)
> cgroup_unlock()
> (B)
> pre_destroy() for subsys;-----(1)
> (C)
> cgroup_lock();
> (D)
> Second check:check for -EBUSY again because we released the lock.
> (E)
> mark cgroup as removed.
> (F)
> unlink from lists.
> cgroup_unlock();
> dput()
> => when dentry's refcnt goes down to 0
> destroy() handers for subsys
>
> memcg marks itself as "obsolete" when pre_destroy() is called at (1)
> But rmdir() can fail after pre_destroy(). So marking "obsolete" is bug.
> I'd like to fix sanity of pre_destroy() in cgroup layer.
>
> Considering above sequence, new tasks can be added while
> (B) and (C)
> swap-in recored can be charged back to a cgroup after pre_destroy()
> at (C) and (D), (E)
> (means cgrp's refcnt not comes from task but from other persistent objects.)
>
> This patch adds "cgroup_is_being_removed()" check. (better name is welcome)
> After this,
>
> - cgroup is marked as CGRP_PRE_REMOVAL at (A)
> - If Second check fails, CGRP_PRE_REMOVAL flag is removed.
> - memcg's its own obsolete flag is removed.
> - While CGROUP_PRE_REMOVAL, task attach will fail by -EBUSY.
> (task attach via clone() will not hit the case.)
>
> By this, we can trust pre_restroy()'s result.
>
>
I agree with the direction of this patch, but I think it would be better
to split it into a cgroup part and a memcg part.

> Note: if CGRP_REMOVED can be set and cleared, it should be used instead of
> CGRP_PRE_REMOVAL.
>
>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
>
> ---
> include/linux/cgroup.h | 5 +++++
> kernel/cgroup.c | 18 +++++++++++++++++-
> mm/memcontrol.c | 36 ++++++++++++++++++++++++++----------
> 3 files changed, 48 insertions(+), 11 deletions(-)
>
> Index: mmotm-2.6.28-Dec08/mm/memcontrol.c
> ===================================================================
> --- mmotm-2.6.28-Dec08.orig/mm/memcontrol.c
> +++ mmotm-2.6.28-Dec08/mm/memcontrol.c
> @@ -166,7 +166,6 @@ struct mem_cgroup {
> */
> bool use_hierarchy;
> unsigned long last_oom_jiffies;
> - int obsolete;
> atomic_t refcnt;
>
> unsigned int swappiness;
> @@ -211,6 +210,24 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
> static void mem_cgroup_get(struct mem_cgroup *mem);
> static void mem_cgroup_put(struct mem_cgroup *mem);
>
> +static bool memcg_is_obsolete(struct mem_cgroup *mem)
> +{
> + struct cgroup *cg = mem->css.cgroup;
> + /*
> + * "Being Removed" means pre_destroy() handler is called.
> + * After "pre_destroy" handler is called, memcg should not
> + * have any additional charges.
> + * This means there are small races for mis-accounting. But this
> + * mis-accounting should happen only under swap-in opration.
> + * (Attachin new task will fail if cgroup is under rmdir()).
> + */
> +
> + if (!cg || cgroup_is_removed(cg) || cgroup_is_being_removed(cg))
> + return true;
> + return false;
> +}
> +
> +
> static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
> struct page_cgroup *pc,
> bool charge)
> @@ -597,7 +614,7 @@ mem_cgroup_get_first_node(struct mem_cgr
> struct cgroup *cgroup;
> struct mem_cgroup *ret;
> bool obsolete = (root_mem->last_scanned_child &&
> - root_mem->last_scanned_child->obsolete);
> + memcg_is_obsolete(root_mem->last_scanned_child);
>
> /*
> * Scan all children under the mem_cgroup mem
> @@ -1070,7 +1087,7 @@ int mem_cgroup_try_charge_swapin(struct
> ent.val = page_private(page);
>
> mem = lookup_swap_cgroup(ent);
> - if (!mem || mem->obsolete)
> + if (!mem || memcg_is_obsolete(mem))
> goto charge_cur_mm;
> *ptr = mem;
> return __mem_cgroup_try_charge(NULL, mask, ptr, true);
> @@ -1104,7 +1121,7 @@ int mem_cgroup_cache_charge_swapin(struc
> ent.val = page_private(page);
> if (do_swap_account) {
> mem = lookup_swap_cgroup(ent);
> - if (mem && mem->obsolete)
> + if (mem && memcg_is_obsolete(mem))
> mem = NULL;
> if (mem)
> mm = NULL;
> @@ -2050,9 +2067,6 @@ static struct mem_cgroup *mem_cgroup_all
> * the number of reference from swap_cgroup and free mem_cgroup when
> * it goes down to 0.
> *
> - * When mem_cgroup is destroyed, mem->obsolete will be set to 0 and
> - * entry which points to this memcg will be ignore at swapin.
> - *
> * Removal of cgroup itself succeeds regardless of refs from swap.
> */
>
> @@ -2081,7 +2095,7 @@ static void mem_cgroup_get(struct mem_cg
> static void mem_cgroup_put(struct mem_cgroup *mem)
> {
> if (atomic_dec_and_test(&mem->refcnt)) {
> - if (!mem->obsolete)
> + if (!memcg_is_obsolete(mem))
> return;
> mem_cgroup_free(mem);
> }
> @@ -2148,14 +2162,16 @@ static void mem_cgroup_pre_destroy(struc
> struct cgroup *cont)
> {
> struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
> - mem->obsolete = 1;
> mem_cgroup_force_empty(mem, false);
> }
>
> static void mem_cgroup_destroy(struct cgroup_subsys *ss,
> struct cgroup *cont)
> {
> - mem_cgroup_free(mem_cgroup_from_cont(cont));
> + struct mem_cgroup *mem = mem_cgroup_from_cont(cont):
> + mem_cgroup_free(mem);
> + /* forget */
> + mem->css.cgroup = NULL;
> }
>
Is it OK to access "mem" while it may have been freed ?

> static int mem_cgroup_populate(struct cgroup_subsys *ss,
> Index: mmotm-2.6.28-Dec08/include/linux/cgroup.h
> ===================================================================
> --- mmotm-2.6.28-Dec08.orig/include/linux/cgroup.h
> +++ mmotm-2.6.28-Dec08/include/linux/cgroup.h
> @@ -98,6 +98,8 @@ enum {
> CGRP_RELEASABLE,
> /* Control Group requires release notifications to userspace */
> CGRP_NOTIFY_ON_RELEASE,
> + /* Control Group is preparing for death */
> + CGRP_PRE_REMOVAL,
> };
>
> struct cgroup {
> @@ -303,8 +305,11 @@ int cgroup_add_files(struct cgroup *cgrp
> const struct cftype cft[],
> int count);
>
> +
> int cgroup_is_removed(const struct cgroup *cgrp);
>
> +int cgroup_is_being_removed(const struct cgroup *cgrp);
> +
> int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
>
> int cgroup_task_count(const struct cgroup *cgrp);
> Index: mmotm-2.6.28-Dec08/kernel/cgroup.c
> ===================================================================
> --- mmotm-2.6.28-Dec08.orig/kernel/cgroup.c
> +++ mmotm-2.6.28-Dec08/kernel/cgroup.c
> @@ -123,6 +123,11 @@ inline int cgroup_is_removed(const struc
> return test_bit(CGRP_REMOVED, &cgrp->flags);
> }
>
> +inline int cgroup_is_being_removed(const struct cgroup *cgrp)
> +{
> + return test_bit(CGRP_PRE_REMOVAL, &cgrp->flags);
> +}
> +
> /* bits in struct cgroupfs_root flags field */
> enum {
> ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
> @@ -1217,6 +1222,13 @@ int cgroup_attach_task(struct cgroup *cg
> if (cgrp == oldcgrp)
> return 0;
>
> + /*
> + * This cgroup is under rmdir() operation. Never fails here when this
> + * is called from clone().
> + */
I don't think clone() calls cgroup_attach_task().
Do you mean cgroup_clone()?
(I'm sorry, I'm not very familiar with ns_cgroup.)

> + if (cgroup_is_being_removed(cgrp))
> + return -EBUSY;
> +
> for_each_subsys(root, ss) {
> if (ss->can_attach) {
> retval = ss->can_attach(ss, cgrp, tsk);
> @@ -2469,12 +2481,14 @@ static int cgroup_rmdir(struct inode *un
> mutex_unlock(&cgroup_mutex);
> return -EBUSY;
> }
> - mutex_unlock(&cgroup_mutex);
>
> /*
> * Call pre_destroy handlers of subsys. Notify subsystems
> * that rmdir() request comes.
> */
> + set_bit(CGRP_PRE_REMOVAL, &cgrp->flags);
> + mutex_unlock(&cgroup_mutex);
> +
> cgroup_call_pre_destroy(cgrp);
>
Is there any case where pre_destroy is called simultaneously?

> mutex_lock(&cgroup_mutex);
> @@ -2483,12 +2497,14 @@ static int cgroup_rmdir(struct inode *un
> if (atomic_read(&cgrp->count)
> || !list_empty(&cgrp->children)
> || cgroup_has_css_refs(cgrp)) {
> + clear_bit(CGRP_PRE_REMOVAL, &cgrp->flags);
> mutex_unlock(&cgroup_mutex);
> return -EBUSY;
> }
>
> spin_lock(&release_list_lock);
> set_bit(CGRP_REMOVED, &cgrp->flags);
> + clear_bit(CGRP_PRE_REMOVAL, &cgrp->flags);
> if (!list_empty(&cgrp->release_list))
> list_del(&cgrp->release_list);
> spin_unlock(&release_list_lock);
>


Thanks,
Daisuke Nishimura.

2008-12-10 02:52:20

by Balbir Singh

[permalink] [raw]
Subject: Re: [RFC][PATCH 4/6] Flat hierarchical reclaim by ID

* KAMEZAWA Hiroyuki <[email protected]> [2008-12-10 01:34:14]:

> Balbir Singh said:
>
> >> I think your soft-limit idea can be easily merged onto this patch
> >> set.
> >>
> >
> > Yes, potentially. With soft limit, the general expectation is this
> >
> > Let us say you have group A and B
> >
> > groupA, soft limit = 1G
> > groupB, soft limit = 2G
> >
> > Now assume the system has 4G. When groupB is not using its memory,
> > group A can grab all 4G, but when groupB kicks in and tries to use 2G
> > or more, then the expectation is that
> >
> > group A will get 1/3 * 4 = 4/3G
> > group B will get 2/3 * 4 = 8/3G
> >
> > Similar to CPU shares currently.
> >
> I like that idea because it's easy to understand.
>

Excellent, I'll start looking at how to implement it

> >> > Does this order reflect their position in the hierarchy?
> >> No. just scan IDs from last scannned one in RR.
> >> BTW, can you show what an algorithm works well in following case ?
> >> ex)
> >> groupA/ limit=1G usage=300M
> >> 01/ limit=600M usage=600M
> >> 02/ limit=700M usage=70M
> >> 03/ limit=100M usage=30M
> >> Which one should be shrinked at first and why ?
> >> 1) when group_A hit limits.
> >
> > With tree reclaim, reclaim will first reclaim from A and stop if
> > successful, otherwise it will go to 01, 02 and 03 and then go back to
> > A.
> >
> Sorry for my poor example
>
> >> 2) when group_A/01 hit limits.
> >
> > This will reclaim only from 01, since A is under its limit
> >
> I should ask
> 2') when a task in group_A/01 hit limit in group_A
>
> ex)
> group_A/ limtit=1G, usage~0
> /01 limit= unlimited usage=800M
> /02 limit= unlimited usage=200M
> (what limit is allowed to children is another problem to be fixed...)
> when a task in 01 hits limit of group_A
> when a task in 02 hits limit of group_A
> where we should start from ? (is unknown)
> Currenty , this patch uses RR (in A->01->02->A->...).
> and soft-limit or some good algorithm will give us better view.
>
> >> 3) when group_A/02 hit limits.
> >
> > This will reclaim only from 02 since A is under its limit
> >
> > Does RR do the same right now?
> >
> I think so.
>
> Assume
> group_A/
> /01
> /02
> RR does
> 1) when a task under A/01/02 hit limits at A, shrink A, 01, 02,
> 2) when a task under 01 hit limits at 01, shrink only 01.
> 3) when a task under 02 hit limits at 02, shrink only 02.
>
> When 1), start point of shrinking is saved as last_scanned_child.
>
>
> >> I can't now.
> >>
> >> This patch itself uses round-robin and have no special order.
> >> I think implenting good algorithm under this needs some amount of
> >> time.
> >>
> >
> > I agree that fine tuning it will require time, but what we need is
> > something usable that will not have hard to debug or understand corner
> > cases.
>
> yes, we have now. My point is "cgroup_lock()" caused many problems and
> will cause new ones in future, I convince.
>
> And please see 5/6 and 6/6 we need hierarchy consideration in other
> places. I think there are more codes which should take care of hierarchy.
>

Yes, I do have the patches to remove cgroup_lock(); let me post them
independent of Daisuke's patches.

>
> > > Shouldn't id's belong to cgroups instead of just memory controller?
> >> If Paul rejects, I'll move this to memcg. But bio-cgroup people also use
> >> ID and, in this summer, I posted swap-cgroup-ID patch and asked to
> >> implement IDs under cgroup rather than subsys. (asked by Paul or you.)
> >>
> >
> > We should talk to Paul and convince him.
> >
> yes.
>

Paul, would it be very hard to add IDs to control groups?

> >> >From implementation, hierarchy code management at el. should go into
> >> cgroup.c and it gives us clear view rather than implemented under memcg.
> >>
> >
> > cgroup has hierarchy management already, in the form of children and
> > sibling. Walking those structures is up to us, that is all we do
> > currently :)
> >
> yes, but need cgroup_lock(). and you have to keep refcnt to pointer
> just for rememebring it.
>
> This patch doesn't change anything other than removing cgroup_lock() and
> removing refcnt to remember start point.
>

OK, I'll play with it

--
Balbir

2008-12-10 02:59:41

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: fix pre_destory handler

On Wed, 10 Dec 2008 11:28:15 +0900
Daisuke Nishimura <[email protected]> wrote:

> On Tue, 9 Dec 2008 20:06:47 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
> > better name for new flag is welcome.
> >
> > ==
> > Because pre_destroy() handler is moved out to cgroup_lock() for
> > avoiding dead-lock, now, cgroup's rmdir() does following sequence.
> >
> > cgroup_lock()
> > check children and tasks.
> > (A)
> > cgroup_unlock()
> > (B)
> > pre_destroy() for subsys;-----(1)
> > (C)
> > cgroup_lock();
> > (D)
> > Second check:check for -EBUSY again because we released the lock.
> > (E)
> > mark cgroup as removed.
> > (F)
> > unlink from lists.
> > cgroup_unlock();
> > dput()
> > => when dentry's refcnt goes down to 0
> > destroy() handers for subsys
> >
> > memcg marks itself as "obsolete" when pre_destroy() is called at (1)
> > But rmdir() can fail after pre_destroy(). So marking "obsolete" is bug.
> > I'd like to fix sanity of pre_destroy() in cgroup layer.
> >
> > Considering above sequence, new tasks can be added while
> > (B) and (C)
> > swap-in recored can be charged back to a cgroup after pre_destroy()
> > at (C) and (D), (E)
> > (means cgrp's refcnt not comes from task but from other persistent objects.)
> >
> > This patch adds "cgroup_is_being_removed()" check. (better name is welcome)
> > After this,
> >
> > - cgroup is marked as CGRP_PRE_REMOVAL at (A)
> > - If Second check fails, CGRP_PRE_REMOVAL flag is removed.
> > - memcg's its own obsolete flag is removed.
> > - While CGROUP_PRE_REMOVAL, task attach will fail by -EBUSY.
> > (task attach via clone() will not hit the case.)
> >
> > By this, we can trust pre_restroy()'s result.
> >
> >
> I agrree to the direction of this patch, but I think it would be better
> to split this into cgroup and memcg part.
>
Hmm, but "showing usage" part is necessary for this kind of patches.

> > Note: if CGRP_REMOVED can be set and cleared, it should be used instead of
> > CGRP_PRE_REMOVAL.
> >
> >
> > Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> >
> > ---
> > include/linux/cgroup.h | 5 +++++
> > kernel/cgroup.c | 18 +++++++++++++++++-
> > mm/memcontrol.c | 36 ++++++++++++++++++++++++++----------
> > 3 files changed, 48 insertions(+), 11 deletions(-)
> >
> > Index: mmotm-2.6.28-Dec08/mm/memcontrol.c
> > ===================================================================
> > --- mmotm-2.6.28-Dec08.orig/mm/memcontrol.c
> > +++ mmotm-2.6.28-Dec08/mm/memcontrol.c
> > @@ -166,7 +166,6 @@ struct mem_cgroup {
> > */
> > bool use_hierarchy;
> > unsigned long last_oom_jiffies;
> > - int obsolete;
> > atomic_t refcnt;
> >
> > unsigned int swappiness;
> > @@ -211,6 +210,24 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
> > static void mem_cgroup_get(struct mem_cgroup *mem);
> > static void mem_cgroup_put(struct mem_cgroup *mem);
> >
> > +static bool memcg_is_obsolete(struct mem_cgroup *mem)
> > +{
> > + struct cgroup *cg = mem->css.cgroup;
> > + /*
> > + * "Being Removed" means pre_destroy() handler is called.
> > + * After "pre_destroy" handler is called, memcg should not
> > + * have any additional charges.
> > + * This means there are small races for mis-accounting. But this
> > + * mis-accounting should happen only under swap-in opration.
> > + * (Attachin new task will fail if cgroup is under rmdir()).
> > + */
> > +
> > + if (!cg || cgroup_is_removed(cg) || cgroup_is_being_removed(cg))
> > + return true;
> > + return false;
> > +}
> > +
> > +
> > static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
> > struct page_cgroup *pc,
> > bool charge)
> > @@ -597,7 +614,7 @@ mem_cgroup_get_first_node(struct mem_cgr
> > struct cgroup *cgroup;
> > struct mem_cgroup *ret;
> > bool obsolete = (root_mem->last_scanned_child &&
> > - root_mem->last_scanned_child->obsolete);
> > + memcg_is_obsolete(root_mem->last_scanned_child);
> >
> > /*
> > * Scan all children under the mem_cgroup mem
> > @@ -1070,7 +1087,7 @@ int mem_cgroup_try_charge_swapin(struct
> > ent.val = page_private(page);
> >
> > mem = lookup_swap_cgroup(ent);
> > - if (!mem || mem->obsolete)
> > + if (!mem || memcg_is_obsolete(mem))
> > goto charge_cur_mm;
> > *ptr = mem;
> > return __mem_cgroup_try_charge(NULL, mask, ptr, true);
> > @@ -1104,7 +1121,7 @@ int mem_cgroup_cache_charge_swapin(struc
> > ent.val = page_private(page);
> > if (do_swap_account) {
> > mem = lookup_swap_cgroup(ent);
> > - if (mem && mem->obsolete)
> > + if (mem && memcg_is_obsolete(mem))
> > mem = NULL;
> > if (mem)
> > mm = NULL;
> > @@ -2050,9 +2067,6 @@ static struct mem_cgroup *mem_cgroup_all
> > * the number of reference from swap_cgroup and free mem_cgroup when
> > * it goes down to 0.
> > *
> > - * When mem_cgroup is destroyed, mem->obsolete will be set to 0 and
> > - * entry which points to this memcg will be ignore at swapin.
> > - *
> > * Removal of cgroup itself succeeds regardless of refs from swap.
> > */
> >
> > @@ -2081,7 +2095,7 @@ static void mem_cgroup_get(struct mem_cg
> > static void mem_cgroup_put(struct mem_cgroup *mem)
> > {
> > if (atomic_dec_and_test(&mem->refcnt)) {
> > - if (!mem->obsolete)
> > + if (!memcg_is_obsolete(mem))
> > return;
> > mem_cgroup_free(mem);
> > }
> > @@ -2148,14 +2162,16 @@ static void mem_cgroup_pre_destroy(struc
> > struct cgroup *cont)
> > {
> > struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
> > - mem->obsolete = 1;
> > mem_cgroup_force_empty(mem, false);
> > }
> >
> > static void mem_cgroup_destroy(struct cgroup_subsys *ss,
> > struct cgroup *cont)
> > {
> > - mem_cgroup_free(mem_cgroup_from_cont(cont));
> > + struct mem_cgroup *mem = mem_cgroup_from_cont(cont):
> > + mem_cgroup_free(mem);
> > + /* forget */
> > + mem->css.cgroup = NULL;
> > }
> >
> Is it OK to access "mem" while it may have been freed ?
>
no, will fix.



> > static int mem_cgroup_populate(struct cgroup_subsys *ss,
> > Index: mmotm-2.6.28-Dec08/include/linux/cgroup.h
> > ===================================================================
> > --- mmotm-2.6.28-Dec08.orig/include/linux/cgroup.h
> > +++ mmotm-2.6.28-Dec08/include/linux/cgroup.h
> > @@ -98,6 +98,8 @@ enum {
> > CGRP_RELEASABLE,
> > /* Control Group requires release notifications to userspace */
> > CGRP_NOTIFY_ON_RELEASE,
> > + /* Control Group is preparing for death */
> > + CGRP_PRE_REMOVAL,
> > };
> >
> > struct cgroup {
> > @@ -303,8 +305,11 @@ int cgroup_add_files(struct cgroup *cgrp
> > const struct cftype cft[],
> > int count);
> >
> > +
> > int cgroup_is_removed(const struct cgroup *cgrp);
> >
> > +int cgroup_is_being_removed(const struct cgroup *cgrp);
> > +
> > int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
> >
> > int cgroup_task_count(const struct cgroup *cgrp);
> > Index: mmotm-2.6.28-Dec08/kernel/cgroup.c
> > ===================================================================
> > --- mmotm-2.6.28-Dec08.orig/kernel/cgroup.c
> > +++ mmotm-2.6.28-Dec08/kernel/cgroup.c
> > @@ -123,6 +123,11 @@ inline int cgroup_is_removed(const struc
> > return test_bit(CGRP_REMOVED, &cgrp->flags);
> > }
> >
> > +inline int cgroup_is_being_removed(const struct cgroup *cgrp)
> > +{
> > + return test_bit(CGRP_PRE_REMOVAL, &cgrp->flags);
> > +}
> > +
> > /* bits in struct cgroupfs_root flags field */
> > enum {
> > ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
> > @@ -1217,6 +1222,13 @@ int cgroup_attach_task(struct cgroup *cg
> > if (cgrp == oldcgrp)
> > return 0;
> >
> > + /*
> > + * This cgroup is under rmdir() operation. Never fails here when this
> > + * is called from clone().
> > + */
> I don't think clone() calls cgroup_attach_task().
> Do you mean cgroup_clone() ?
> (I'm sorry that I'm not good at ns_cgroup.)
>
yes, cgroup_clone(). I'll update.


> > + if (cgroup_is_being_removed(cgrp))
> > + return -EBUSY;
> > +
> > for_each_subsys(root, ss) {
> > if (ss->can_attach) {
> > retval = ss->can_attach(ss, cgrp, tsk);
> > @@ -2469,12 +2481,14 @@ static int cgroup_rmdir(struct inode *un
> > mutex_unlock(&cgroup_mutex);
> > return -EBUSY;
> > }
> > - mutex_unlock(&cgroup_mutex);
> >
> > /*
> > * Call pre_destroy handlers of subsys. Notify subsystems
> > * that rmdir() request comes.
> > */
> > + set_bit(CGRP_PRE_REMOVAL, &cgrp->flags);
> > + mutex_unlock(&cgroup_mutex);
> > +
> > cgroup_call_pre_destroy(cgrp);
> >
> Is there any case where pre_destory is called simultaneusly ?
>
I can't quite catch what your concern is.

AFAIK, vfs_rmdir() is done under the parent directory's i_mutex:
==
mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
...
vfs_rmdir()
==
so calls to cgroup_rmdir() will be serialized.

Thanks,
-Kame

2008-12-10 03:04:22

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [RFC][PATCH 4/6] Flat hierarchical reclaim by ID

On Wed, 10 Dec 2008 08:19:29 +0530
Balbir Singh <[email protected]> wrote:
> > >> >From implementation, hierarchy code management at el. should go into
> > >> cgroup.c and it gives us clear view rather than implemented under memcg.
> > >>
> > >
> > > cgroup has hierarchy management already, in the form of children and
> > > sibling. Walking those structures is up to us, that is all we do
> > > currently :)
> > >
> > yes, but need cgroup_lock(). and you have to keep refcnt to pointer
> > just for rememebring it.
> >
> > This patch doesn't change anything other than removing cgroup_lock() and
> > removing refcnt to remember start point.
> >
>
> OK, I'll play with it
>
I hear Kosaki has another idea for 5/6, so please ignore 5/6 for a while.
It's complicated. I'll post updated ones.

-Kame

2008-12-10 04:05:36

by Daisuke Nishimura

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: fix pre_destory handler

On Wed, 10 Dec 2008 11:58:30 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
> On Wed, 10 Dec 2008 11:28:15 +0900
> Daisuke Nishimura <[email protected]> wrote:
>
> > On Tue, 9 Dec 2008 20:06:47 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
> > > better name for new flag is welcome.
> > >
> > > ==
> > > Because pre_destroy() handler is moved out to cgroup_lock() for
> > > avoiding dead-lock, now, cgroup's rmdir() does following sequence.
> > >
> > > cgroup_lock()
> > > check children and tasks.
> > > (A)
> > > cgroup_unlock()
> > > (B)
> > > pre_destroy() for subsys;-----(1)
> > > (C)
> > > cgroup_lock();
> > > (D)
> > > Second check:check for -EBUSY again because we released the lock.
> > > (E)
> > > mark cgroup as removed.
> > > (F)
> > > unlink from lists.
> > > cgroup_unlock();
> > > dput()
> > > => when dentry's refcnt goes down to 0
> > > destroy() handers for subsys
> > >
> > > memcg marks itself as "obsolete" when pre_destroy() is called at (1)
> > > But rmdir() can fail after pre_destroy(). So marking "obsolete" is bug.
> > > I'd like to fix sanity of pre_destroy() in cgroup layer.
> > >
> > > Considering above sequence, new tasks can be added while
> > > (B) and (C)
> > > swap-in recored can be charged back to a cgroup after pre_destroy()
> > > at (C) and (D), (E)
> > > (means cgrp's refcnt not comes from task but from other persistent objects.)
> > >
> > > This patch adds "cgroup_is_being_removed()" check. (better name is welcome)
> > > After this,
> > >
> > > - cgroup is marked as CGRP_PRE_REMOVAL at (A)
> > > - If Second check fails, CGRP_PRE_REMOVAL flag is removed.
> > > - memcg's its own obsolete flag is removed.
> > > - While CGROUP_PRE_REMOVAL, task attach will fail by -EBUSY.
> > > (task attach via clone() will not hit the case.)
> > >
> > > By this, we can trust pre_restroy()'s result.
> > >
> > >
> > I agrree to the direction of this patch, but I think it would be better
> > to split this into cgroup and memcg part.
> >
> Hmm, but "showing usage" part is necessary for this kind of patches.
>
I see.

(snip)
> > > + if (cgroup_is_being_removed(cgrp))
> > > + return -EBUSY;
> > > +
> > > for_each_subsys(root, ss) {
> > > if (ss->can_attach) {
> > > retval = ss->can_attach(ss, cgrp, tsk);
> > > @@ -2469,12 +2481,14 @@ static int cgroup_rmdir(struct inode *un
> > > mutex_unlock(&cgroup_mutex);
> > > return -EBUSY;
> > > }
> > > - mutex_unlock(&cgroup_mutex);
> > >
> > > /*
> > > * Call pre_destroy handlers of subsys. Notify subsystems
> > > * that rmdir() request comes.
> > > */
> > > + set_bit(CGRP_PRE_REMOVAL, &cgrp->flags);
> > > + mutex_unlock(&cgroup_mutex);
> > > +
> > > cgroup_call_pre_destroy(cgrp);
> > >
> > Is there any case where pre_destory is called simultaneusly ?
> >
> I can't catch what is your concern.
>
> AFAIK, vfs_rmdir() is done under mutex to dentry
> ==
> mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
> ...
> vfs_rmdir()
> ==
> And calls to cgroup_rmdir() will be serialized.
>
You're right. I missed that.
Thank you for your clarification.


Thanks,
Daisuke Nishimura.

2008-12-10 04:18:26

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: fix pre_destory handler

On Wed, 10 Dec 2008 12:03:40 +0900
Daisuke Nishimura <[email protected]> wrote:

> On Wed, 10 Dec 2008 11:58:30 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
> > On Wed, 10 Dec 2008 11:28:15 +0900
> > Daisuke Nishimura <[email protected]> wrote:
> >
> > > On Tue, 9 Dec 2008 20:06:47 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
> > > > better name for new flag is welcome.
> > > >
> > > > ==
> > > > Because pre_destroy() handler is moved out to cgroup_lock() for
> > > > avoiding dead-lock, now, cgroup's rmdir() does following sequence.
> > > >
> > > > cgroup_lock()
> > > > check children and tasks.
> > > > (A)
> > > > cgroup_unlock()
> > > > (B)
> > > > pre_destroy() for subsys;-----(1)
> > > > (C)
> > > > cgroup_lock();
> > > > (D)
> > > > Second check:check for -EBUSY again because we released the lock.
> > > > (E)
> > > > mark cgroup as removed.
> > > > (F)
> > > > unlink from lists.
> > > > cgroup_unlock();
> > > > dput()
> > > > => when dentry's refcnt goes down to 0
> > > > destroy() handers for subsys
> > > >
> > > > memcg marks itself as "obsolete" when pre_destroy() is called at (1)
> > > > But rmdir() can fail after pre_destroy(). So marking "obsolete" is bug.
> > > > I'd like to fix sanity of pre_destroy() in cgroup layer.
> > > >
> > > > Considering above sequence, new tasks can be added while
> > > > (B) and (C)
> > > > swap-in recored can be charged back to a cgroup after pre_destroy()
> > > > at (C) and (D), (E)
> > > > (means cgrp's refcnt not comes from task but from other persistent objects.)
> > > >
> > > > This patch adds "cgroup_is_being_removed()" check. (better name is welcome)
> > > > After this,
> > > >
> > > > - cgroup is marked as CGRP_PRE_REMOVAL at (A)
> > > > - If Second check fails, CGRP_PRE_REMOVAL flag is removed.
> > > > - memcg's its own obsolete flag is removed.
> > > > - While CGROUP_PRE_REMOVAL, task attach will fail by -EBUSY.
> > > > (task attach via clone() will not hit the case.)
> > > >
> > > > By this, we can trust pre_restroy()'s result.
> > > >
> > > >
> > > I agrree to the direction of this patch, but I think it would be better
> > > to split this into cgroup and memcg part.
> > >
> > Hmm, but "showing usage" part is necessary for this kind of patches.
> >
> I see.
>
> (snip)
> > > > + if (cgroup_is_being_removed(cgrp))
> > > > + return -EBUSY;
> > > > +
> > > > for_each_subsys(root, ss) {
> > > > if (ss->can_attach) {
> > > > retval = ss->can_attach(ss, cgrp, tsk);
> > > > @@ -2469,12 +2481,14 @@ static int cgroup_rmdir(struct inode *un
> > > > mutex_unlock(&cgroup_mutex);
> > > > return -EBUSY;
> > > > }
> > > > - mutex_unlock(&cgroup_mutex);
> > > >
> > > > /*
> > > > * Call pre_destroy handlers of subsys. Notify subsystems
> > > > * that rmdir() request comes.
> > > > */
> > > > + set_bit(CGRP_PRE_REMOVAL, &cgrp->flags);
> > > > + mutex_unlock(&cgroup_mutex);
> > > > +
> > > > cgroup_call_pre_destroy(cgrp);
> > > >
> > > Is there any case where pre_destory is called simultaneusly ?
> > >
> > I can't catch what is your concern.
> >
> > AFAIK, vfs_rmdir() is done under mutex to dentry
> > ==
> > mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
> > ...
> > vfs_rmdir()
> > ==
> > And calls to cgroup_rmdir() will be serialized.
> >
> You're right. I missed that.
> Thank you for your clarification.
>
>

I'll post an updated one; it should be much clearer.

Thanks,
-Kame



> Thanks,
> Daisuke Nishimura.
>

2008-12-10 10:40:28

by Paul Menage

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: fix pre_destory handler

On Tue, Dec 9, 2008 at 3:06 AM, KAMEZAWA Hiroyuki
<[email protected]> wrote:
> better name for new flag is welcome.
>
> ==
> Because pre_destroy() handler is moved out to cgroup_lock() for
> avoiding dead-lock, now, cgroup's rmdir() does following sequence.
>
> cgroup_lock()
> check children and tasks.
> (A)
> cgroup_unlock()
> (B)
> pre_destroy() for subsys;-----(1)
> (C)
> cgroup_lock();
> (D)
> Second check:check for -EBUSY again because we released the lock.
> (E)
> mark cgroup as removed.
> (F)
> unlink from lists.
> cgroup_unlock();
> dput()
> => when dentry's refcnt goes down to 0
> destroy() handers for subsys

The reason for needing this patch is because of the non-atomic locking
in cgroup_rmdir() that was introduced due to the circular locking
dependency between the hotplug lock and the cgroup_mutex.

But rather than adding a whole bunch more complexity, this looks like
another case that could be solved by the hierarchy_mutex patches that
I posted a while ago.

Those allow the cpuset hotplug notifier (and any other subsystem that
wants a stable hierarchy) to take ss->hierarchy_mutex, to prevent
mkdir/rmdir/bind in its hierarchy, which helps to remove the deadlock
that the above dropping of cgroup_mutex was introduced to work around.

>
> Considering above sequence, new tasks can be added while
> (B) and (C)
> swap-in recored can be charged back to a cgroup after pre_destroy()
> at (C) and (D), (E)
> (means cgrp's refcnt not comes from task but from other persistent objects.)

Which "persistent object" are you getting the css refcount from?

Is the problem that swap references aren't refcounted because you want
to avoid swap from keeping a cgroup alive? But you still want to be
able to do css_get() on the mem_cgroup* obtained from a swap
reference, and be safely synchronized with concurrent rmdir operations
without having to take a heavy lock?

The solution that I originally tried to use for this in an early
version of cgroups (but dropped as I thought it was not needed) was to
treat css->refcount as follows:

0 => trying to remove or removed
1 => normal state with no additional refs

So to get a reference on a possibly removed css we'd have:

int css_tryget(css) {
	while (!atomic_inc_not_zero(&css->refcount)) {
		if (test_bit(CSS_REMOVED, &css->flags)) {
			return 0;
		}
	}
	return 1;
}

and cgroup_rmdir would do:

for_each_subsys() {
	if (cmpxchg(&css->refcount, 0, -1)) {
		// busy, roll back -1s to 0s, give error
		...
	}
}
// success
for_each_subsys() {
	set_bit(CSS_REMOVED, &css->flags);
}

This makes it easy to have weak references to a css that can be
dropped in a destroy() callback. Would this maybe even remove the need
for mem_cgroup_pre_destroy()?
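
(Editor's illustration, not from the mail above: a minimal sketch of how such
a weak swap reference could use the css_tryget() sketched above.
lookup_swap_css() and charge_page_to_css() are hypothetical helper names, not
real kernel APIs.)
==
/* Illustrative sketch only; helper names are hypothetical. */
static int charge_back_from_swap(swp_entry_t ent, struct page *page)
{
	struct cgroup_subsys_state *css = lookup_swap_css(ent);

	if (!css || !css_tryget(css))
		return -ENOENT;	/* the cgroup is gone (or going away) */
	/*
	 * While this reference is held, a concurrent rmdir() sees a
	 * non-zero refcount and fails with -EBUSY instead of racing.
	 */
	charge_page_to_css(css, page);
	css_put(css);
	return 0;
}
==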

Paul

2008-12-10 11:29:54

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: fix pre_destory handler

Paul Menage said:
> The reason for needing this patch is because of the non-atomic locking
> in cgroup_rmdir() that was introduced due to the circular locking
> dependency between the hotplug lock and the cgroup_mutex.
>
> But rather than adding a whole bunch more complexity, this looks like
> another case that could be solved by the hierarchy_mutex patches that
> I posted a while ago.
>
Removing cgroup_lock() from memcg's reclaim path is okay
(and I posted a patch...).
But preventing css_get() after pre_destroy() is another problem.

(BTW, I don't like the hierarchy-walk-by-small-locks approach now because
I'd like to implement a scan-and-stop-continue routine.
See how readdir() against /proc scans PIDs. It's very robust against
very short-lived PIDs.)

> Those allow the cpuset hotplug notifier (and any other subsystem that
> wants a stable hierarchy) to take ss->hierarchy_mutex, to prevent
> mkdir/rmdir/bind in its hierarchy, which helps to remove the deadlock
> that the above dropping of cgroup_mutex was introduced to work around.
>
>>
>> Considering above sequence, new tasks can be added while
>> (B) and (C)
>> swap-in recored can be charged back to a cgroup after pre_destroy()
>> at (C) and (D), (E)
>> (means cgrp's refcnt not comes from task but from other persistent
>> objects.)
>
> Which "persistent object" are you getting the css refcount from?
>
page_cgroup generated from swap_cgroup.

> Is the problem that swap references aren't refcounted because you want
> to avoid swap from keeping a cgroup alive?
Yes. There is no operation that allows users to bring swapped-out entries back into memory.
> But you still want to be able to do css_get() on the mem_cgroup*
obtained from a swap
> reference, and be safely synchronized with concurrent rmdir operations
> without having to take a heavy lock?
>
yes. I don't want any locks.

> The solution that I originally tried to use for this in an early
> version of cgroups (but dropped as I thought it was not needed) was to
> treat css->refcount as follows:
>
> 0 => trying to remove or removed
> 1 => normal state with no additional refs
>
> So to get a reference on a possibly removed css we'd have:
>
> int css_tryget(css) {
> while (!atomic_inc_not_zero(&css->refcount)) {
> if (test_bit(CSS_REMOVED, &css->flags)) {
> return 0;
> }
> }
> return 1;
> }
>
> and cgroup_rmdir would do:
>
> for_each_subsys() {
> if (cmpxchg(&css->refcount, 0, -1)) {
> // busy, roll back -1s to 0s, give error
> ...
> }
> }
> // success
> for_each_subsys() {
> set_bit(CSS_REMOVED, &css->flags);
> }
>
> This makes it easy to have weak references to a css that can be
> dropped in a destroy() callback.
I tried a similar patch and made it use only one shared refcnt
(my previous patch...).

We need a rolling update of refcnts plus rollback. Such code tends to leave
holes (that is what my first patch did...).
And there is no fundamental difference between my shared refcnt and the
css_tryget() patch. Maybe the above will give us better cache locality.

Anyway, I have no objection to a rolling update of refcnts and tryget().
If that's the way to go, I'll go ahead.

> Would this maybe even remove the need for mem_cgroup_pre_destroy()?
>
Yes and no. I'm not sure, but my thinking is no.

1. pre_destroy() is called by rmdir(), in a synchronous manner.
This means that all refs in memcg are removed at rmdir().
If we drop refs at destroy(), that only happens when the dentry's refcnt
finally goes down to 0 at dput(). This asynchronous behavior is not good for users.

2. The current pre_destroy() code is very young, and we haven't found any
corner case in which pre_destroy() can't complete its work.
So I don't want to remove pre_destroy() for a while.

3. Sometimes pre_destroy() has to call try_to_free_pages(), and
we cannot know whether it is safe to call try_to_free_pages() inside dput().

Thanks,
-Kame

2008-12-10 13:26:33

by Balbir Singh

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: fix pre_destory handler

* KAMEZAWA Hiroyuki <[email protected]> [2008-12-10 20:29:41]:

> Paul Menage said:
> > The reason for needing this patch is because of the non-atomic locking
> > in cgroup_rmdir() that was introduced due to the circular locking
> > dependency between the hotplug lock and the cgroup_mutex.
> >
> > But rather than adding a whole bunch more complexity, this looks like
> > another case that could be solved by the hierarchy_mutex patches that
> > I posted a while ago.
> >

Paul, I can't find those patches in -mm. I will try and dig them out
of my mbox. I agree that we need a hierarchy_mutex; cgroup_mutex is
becoming the next BKL.

> removing cgroup_lock() from memcg's reclaim path is okay.
> (and I posted patch...)
> But, prevent css_get() after pre_destroy() is another problem.
>
> (BTW, I don't like hierarchy-walk-by-small-locks approarch now because
> I'd like to implement scan-and-stop-continue routine.
> See how readdir() aginst /proc scans PID. It's very roboust against
> very temporal PIDs.)
>
> > Those allow the cpuset hotplug notifier (and any other subsystem that
> > wants a stable hierarchy) to take ss->hierarchy_mutex, to prevent
> > mkdir/rmdir/bind in its hierarchy, which helps to remove the deadlock
> > that the above dropping of cgroup_mutex was introduced to work around.
> >
> >>
> >> Considering above sequence, new tasks can be added while
> >> (B) and (C)
> >> swap-in recored can be charged back to a cgroup after pre_destroy()
> >> at (C) and (D), (E)
> >> (means cgrp's refcnt not comes from task but from other persistent
> >> objects.)
> >
> > Which "persistent object" are you getting the css refcount from?
> >
> page_cgroup generated from swap_cgroup.
>
> > Is the problem that swap references aren't refcounted because you want
> > to avoid swap from keeping a cgroup alive?
> Yes. There is no operations allows users to make swap on memory.
> > But you still want to be able to do css_get() on the mem_cgroup*
> obtained from a swap
> > reference, and be safely synchronized with concurrent rmdir operations
> > without having to take a heavy lock?
> >
> yes. I don't want any locks.
>
> > The solution that I originally tried to use for this in an early
> > version of cgroups (but dropped as I thought it was not needed) was to
> > treat css->refcount as follows:
> >
> > 0 => trying to remove or removed
> > 1 => normal state with no additional refs
> >
> > So to get a reference on a possibly removed css we'd have:
> >
> > int css_tryget(css) {
> > while (!atomic_inc_not_zero(&css->refcount)) {
> > if (test_bit(CSS_REMOVED, &css->flags)) {
> > return 0;
> > }
> > }
> > return 1;
> > }
> >
> > and cgroup_rmdir would do:
> >
> > for_each_subsys() {
> > if (cmpxchg(&css->refcount, 0, -1)) {
> > // busy, roll back -1s to 0s, give error
> > ...
> > }
> > }
> > // success
> > for_each_subsys() {
> > set_bit(CSS_REMOVED, &css->flags);
> > }
> >
> > This makes it easy to have weak references to a css that can be
> > dropped in a destroy() callback.
> I tried similar patch and made it to use only one shared refcnt.
> (my previous patch...)
>
> We need rolling update of refcnts and rollback. Such code tends to make
> a hole (This was what my first patch did...).
> And there is no fundamental difference between my shared refcnt and
> css_tryget() patch. Maybe above will give us better cache localily.
>
> Anyway, I have no objections to rolling update of refcnt and tryget().
> If it's a way to go, I'll go ahead.
>
> > Would this maybe even remove the need for mem_cgroup_pre_destroy()?
> >
> Yes and No. not sure. but my thinking is No.
>
> 1. pre_destroy() is called by rmdir(), in synchronized manner.
> This means that all refs in memcg will be removed at rmdir().
> If we drop refs at destroy(), it happens when dput()'s refcnt finally
> goes down to 0. This asynchronous manner is not good for users.
>
> 2. Current pre_destroy() code is very young. And we don't find any
> corner case in which pre_destroy() can't complete thier works.
> So, I don't want to remove pre_destroy() for a while.
>
> 3. Sometimes, pre_destroy() have to call try_to_free_pages() and
> we cannot know we can call try_to_free_page() in dput().
>
> Thanks,
> -Kame
>
>
>

--
Balbir

2008-12-10 13:48:24

by Daisuke Nishimura

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: fix pre_destory handler

On Wed, 10 Dec 2008 18:55:59 +0530
Balbir Singh <[email protected]> wrote:

> * KAMEZAWA Hiroyuki <[email protected]> [2008-12-10 20:29:41]:
>
> > Paul Menage said:
> > > The reason for needing this patch is because of the non-atomic locking
> > > in cgroup_rmdir() that was introduced due to the circular locking
> > > dependency between the hotplug lock and the cgroup_mutex.
> > >
> > > But rather than adding a whole bunch more complexity, this looks like
> > > another case that could be solved by the hierarchy_mutex patches that
> > > I posted a while ago.
> > >
>
> Paul, I can't find those patches in -mm. I will try and dig them out
> from my mbox. I agree, we need a hierarchy_mutex, cgroup_mutex is
> becoming the next BKL.
>
Hmm.. but does a per-hierarchy mutex solve the problem if memory and cpuset
are mounted on the same hierarchy?


Thanks,
Daisuke Nishimura.

2008-12-10 18:25:49

by Paul Menage

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: fix pre_destory handler

On Wed, Dec 10, 2008 at 5:25 AM, Balbir Singh <[email protected]> wrote:
>
> Paul, I can't find those patches in -mm. I will try and dig them out
> from my mbox. I agree, we need a hierarchy_mutex, cgroup_mutex is
> becoming the next BKL.

It never actually went into -mm. I'll sync it with the latest tree and
try to send it out today.

Paul

2008-12-10 18:26:42

by Paul Menage

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: fix pre_destory handler

On Wed, Dec 10, 2008 at 5:47 AM, Daisuke Nishimura
<[email protected]> wrote:
> Hmm.. but doesn't per-hierarchy-mutex solve the problem if memory and cpuset
> mounted on the same hierarchy ?
>

It's not a per-hierarchy mutex - it's a per-subsystem lock against
changes on that subsystem's hierarchy. So each subsystem just has to
take its own lock, rather than a global or per-hierarchy lock. The
cgroups code takes care of acquiring the multiple locks in a safe
order when necessary.
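
(Editor's illustration, not code from the hierarchy_mutex patches themselves:
the usage pattern described above would look roughly like this, assuming a
mutex member named hierarchy_mutex in struct cgroup_subsys, i.e. the
ss->hierarchy_mutex mentioned earlier in the thread.)
==
/* Illustrative sketch only. */
mutex_lock(&ss->hierarchy_mutex);	/* blocks mkdir/rmdir/bind in ss's hierarchy */
/* ... walk the cgroups attached to ss with a stable hierarchy ... */
mutex_unlock(&ss->hierarchy_mutex);
==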

Paul

2008-12-10 18:36:14

by Paul Menage

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: fix pre_destory handler

On Wed, Dec 10, 2008 at 3:29 AM, KAMEZAWA Hiroyuki
<[email protected]> wrote:
>
> (BTW, I don't like hierarchy-walk-by-small-locks approarch now because
> I'd like to implement scan-and-stop-continue routine.
> See how readdir() aginst /proc scans PID. It's very roboust against
> very temporal PIDs.)

So you mean that you want to be able to sleep, and then continue
approximately where you left off, without keeping any kind of
reference count on the last cgroup that you touched? OK, so in that
case I agree that you would need some kind of hierarch

> I tried similar patch and made it to use only one shared refcnt.
> (my previous patch...)

A crucial difference is that your css_tryget() fails if the cgroups
framework is trying to remove the cgroup but might abort due to
another subsystem holding a reference, whereas mine spins and, if the
rmdir is aborted, ends up taking a reference.

>
> We need rolling update of refcnts and rollback. Such code tends to make
> a hole (This was what my first patch did...).

Can you clarify what you mean by "rolling update of refcnts"?

>
> 1. pre_destroy() is called by rmdir(), in synchronized manner.
> This means that all refs in memcg will be removed at rmdir().
> If we drop refs at destroy(), it happens when dput()'s refcnt finally
> goes down to 0. This asynchronous manner is not good for users.

OK, fair enough.

Paul

2008-12-10 19:01:20

by Paul Menage

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: fix pre_destory handler

On Wed, Dec 10, 2008 at 10:35 AM, Paul Menage <[email protected]> wrote:
> On Wed, Dec 10, 2008 at 3:29 AM, KAMEZAWA Hiroyuki
> <[email protected]> wrote:
>>
>> (BTW, I don't like hierarchy-walk-by-small-locks approarch now because
>> I'd like to implement scan-and-stop-continue routine.
>> See how readdir() aginst /proc scans PID. It's very roboust against
>> very temporal PIDs.)
>
> So you mean that you want to be able to sleep, and then contine
> approximately where you left off, without keeping any kind of
> reference count on the last cgroup that you touched? OK, so in that
> case I agree that you would need some kind of hierarch

Oops, didn't finish that sentence.

I agree that you'd need some kind of hierarchical-restart. But I'd
like to play with / look at your cgroup-id patch more closely and see
if we can come up with something simpler that still does what you
want.

One particular problem with the patch as it stands is that the ids
should be per-css, not per-cgroup, since a css can move between
hierarchies and hence between cgroups. (Currently only at bind/unbind
time, but it still results in a cgroup change).

Paul

2008-12-11 00:22:54

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: fix pre_destory handler

On Wed, 10 Dec 2008 11:00:35 -0800
Paul Menage <[email protected]> wrote:

> On Wed, Dec 10, 2008 at 10:35 AM, Paul Menage <[email protected]> wrote:
> > On Wed, Dec 10, 2008 at 3:29 AM, KAMEZAWA Hiroyuki
> > <[email protected]> wrote:
> >>
> >> (BTW, I don't like hierarchy-walk-by-small-locks approarch now because
> >> I'd like to implement scan-and-stop-continue routine.
> >> See how readdir() aginst /proc scans PID. It's very roboust against
> >> very temporal PIDs.)
> >
> > So you mean that you want to be able to sleep, and then contine
> > approximately where you left off, without keeping any kind of
> > reference count on the last cgroup that you touched? OK, so in that
> > case I agree that you would need some kind of hierarch
>
> Oops, didn't finish that sentence.
>
> I agree that you'd need some kind of hierarchical-restart. But I'd
> like to play with / look at your cgroup-id patch more closely and see
> if we can come up with something simpler that still does what you
> want.
>
Sure, I need to do that too. It's still too young.

> One particular problem with the patch as it stands is that the ids
> should be per-css, not per-cgroup, since a css can move between
> hierarchies and hence between cgroups. (Currently only at bind/unbind
> time, but it still results in a cgroup change).
>

If it's per-css, the lookup function will be
==
struct cgroup_subsys_state *cgroup_css_lookup(subsys_id, id)
==
Do you mean this?

OK, I'll implement it and see what happens. Maybe I'll move the hooks that prepare/destroy IDs
into the subsys layer and assign IDs only when a subsys wants them.

-Kame



2008-12-11 00:24:57

by Paul Menage

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: fix pre_destory handler

On Wed, Dec 10, 2008 at 4:21 PM, KAMEZAWA Hiroyuki
<[email protected]> wrote:
> If per-css, looking up function will be
> ==
> struct cgroup_subsys_state *cgroup_css_lookup(subsys_id, id)
> ==
> Do you mean this ?

Yes, plausibly. And we can presumably have a separate idr per subsystem.

Paul

2008-12-11 00:26:36

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: fix pre_destory handler

On Wed, 10 Dec 2008 10:35:34 -0800
Paul Menage <[email protected]> wrote:

> On Wed, Dec 10, 2008 at 3:29 AM, KAMEZAWA Hiroyuki
> <[email protected]> wrote:
> >
> > (BTW, I don't like hierarchy-walk-by-small-locks approarch now because
> > I'd like to implement scan-and-stop-continue routine.
> > See how readdir() aginst /proc scans PID. It's very roboust against
> > very temporal PIDs.)
>
> So you mean that you want to be able to sleep, and then contine
> approximately where you left off, without keeping any kind of
> reference count on the last cgroup that you touched? OK, so in that
> case I agree that you would need some kind of hierarch
>
> > I tried similar patch and made it to use only one shared refcnt.
> > (my previous patch...)
>
> A crucial difference is that your css_tryget() fails if the cgroups
> framework is trying to remove the cgroup but might abort due to
> another subsystem holding a reference, whereas mine spins and if the
> rmdir is aborted it will return a refcount.
>
sure.
> >
> > We need rolling update of refcnts and rollback. Such code tends to make
> > a hole (This was what my first patch did...).
>
> Can you clarify what you mean by "rolling update of refcnts"?
>
for (... ; i++)
	atomic_inc/dec(refcnt[i]);

But my first version of this patch did the above. I can write it again easily.
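
(Editor's illustration, not code from either posted patch: a sketch of the
"rolling update with rollback" being discussed, assuming each css refcnt
idles at 0 when only the cgroup itself references it; the function name is
made up.)
==
static int mark_all_css_removed(struct cgroup *cgrp)
{
	struct cgroup_subsys_state *css;
	int i;

	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		css = cgrp->subsys[i];
		if (!css)
			continue;
		/* flip the idle value (0) to -1; any other value means busy */
		if (atomic_cmpxchg(&css->refcnt, 0, -1) != 0)
			goto rollback;
	}
	return 0;

rollback:
	/* roll the -1s already set back to 0 and report busy */
	while (--i >= 0) {
		css = cgrp->subsys[i];
		if (css)
			atomic_set(&css->refcnt, 0);
	}
	return -EBUSY;
}
==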

Thanks,
-Kame

2008-12-11 00:28:50

by Paul Menage

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: fix pre_destory handler

On Wed, Dec 10, 2008 at 4:25 PM, KAMEZAWA Hiroyuki
<[email protected]> wrote:
>>
>> Can you clarify what you mean by "rolling update of refcnts"?
>>
> for(..i++)
> atomic_dec/inc( refcnt[i)
>
> But my first version of this patch did above. I can write it again easily.

I just sent out a small patch collection that had my version of
css_tryget() in it - is that what you had in mind by "rolling update"?

Paul

2008-12-11 01:07:57

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: fix pre_destory handler

On Wed, 10 Dec 2008 16:24:44 -0800
Paul Menage <[email protected]> wrote:

> On Wed, Dec 10, 2008 at 4:21 PM, KAMEZAWA Hiroyuki
> <[email protected]> wrote:
> > If per-css, looking up function will be
> > ==
> > struct cgroup_subsys_state *cgroup_css_lookup(subsys_id, id)
> > ==
> > Do you mean this ?
>
> Yes, plausibly. And we can presumably have a separate idr per subsystem.
>
Okay, I'll try one.

-Kame

2008-12-11 01:10:57

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: fix pre_destory handler

On Wed, 10 Dec 2008 16:28:38 -0800
Paul Menage <[email protected]> wrote:

> On Wed, Dec 10, 2008 at 4:25 PM, KAMEZAWA Hiroyuki
> <[email protected]> wrote:
> >>
> >> Can you clarify what you mean by "rolling update of refcnts"?
> >>
> > for(..i++)
> > atomic_dec/inc( refcnt[i)
> >
> > But my first version of this patch did above. I can write it again easily.
>
> I just sent out a small patch collection that had my version of
> css_tryget() in it - is that what you had in mind by "rolling update"?
>
Yes, but yours seems to be more robust than my version.
I'll try to use yours and prepare a patch for css_tryget() in memcg.

-Kame

2008-12-11 03:14:19

by KOSAKI Motohiro

[permalink] [raw]
Subject: Re: [RFC][PATCH 5/6] fix inactive_ratio under hierarchy

Hi

> ex)In following tree,
> /opt/cgroup/01 limit=1G
> /opt/cgroup/01/A limit=500M
> /opt/cgroup/01/A/B limit=unlimited
> /opt/cgroup/01/A/C limit=50M
> /opt/cgroup/01/Z limit=700M
>
>
> /opt/cgroup/01's inactive_ratio is calculated by limit of 1G.
> /opt/cgroup/01/A's inactive_ratio is calculated by limit of 500M
> /opt/cgroup/01/A/B's inactive_ratio is calculated by limit of 500M.
> /opt/cgroup/01/A/C's inactive_ratio is calculated by limit of 50M.
> /opt/cgroup/01/Z's inactive_ratio is calculated by limit of 700M.

This is a good choice,
but I think it is a bit of a complex rule.


How about this?

===============
Currently, memcg's inactive_ratio is calculated when the limit is set,
because page_alloc.c does so and the current implementation is a straightforward port.

However, memcg recently introduced the hierarchy feature.
Under hierarchical restriction, the effective memory limit is decided not only by memory.limit_in_bytes of the current cgroup,
but also by the parent's limit and sibling memory usage.

The optimal inactive_ratio therefore changes frequently,
so calculating it on every call is better.


Signed-off-by: KOSAKI Motohiro <[email protected]>
CC: KAMEZAWA Hiroyuki <[email protected]>
CC: Balbir Singh <[email protected]>
---
include/linux/memcontrol.h | 3 --
mm/memcontrol.c | 64 +++++++++++++++++++++------------------------
mm/vmscan.c | 2 -
3 files changed, 33 insertions(+), 36 deletions(-)

Index: b/include/linux/memcontrol.h
===================================================================
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -95,8 +95,7 @@ extern void mem_cgroup_note_reclaim_prio
int priority);
extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem,
int priority);
-int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg,
- struct zone *zone);
+int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
struct zone *zone,
enum lru_list lru);
Index: b/mm/memcontrol.c
===================================================================
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -167,9 +167,6 @@ struct mem_cgroup {

unsigned int swappiness;

-
- unsigned int inactive_ratio;
-
/*
* statistics. This must be placed at the end of memcg.
*/
@@ -433,15 +430,43 @@ void mem_cgroup_record_reclaim_priority(
spin_unlock(&mem->reclaim_param_lock);
}

-int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
+static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
{
unsigned long active;
unsigned long inactive;
+ unsigned long gb;
+ unsigned long inactive_ratio;

inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON);
active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON);

- if (inactive * memcg->inactive_ratio < active)
+ gb = (inactive + active) >> (30 - PAGE_SHIFT);
+ if (gb)
+ inactive_ratio = int_sqrt(10 * gb);
+ else
+ inactive_ratio = 1;
+
+ if (present_pages) {
+ present_pages[0] = inactive;
+ present_pages[1] = active;
+ }
+
+ return inactive_ratio;
+}
+
+int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
+{
+ unsigned long active;
+ unsigned long inactive;
+ unsigned long present_pages[2];
+ unsigned long inactive_ratio;
+
+ inactive_ratio = calc_inactive_ratio(memcg, present_pages);
+
+ inactive = present_pages[0];
+ active = present_pages[1];
+
+ if (inactive * inactive_ratio < active)
return 1;

return 0;
@@ -1410,29 +1435,6 @@ int mem_cgroup_shrink_usage(struct mm_st
return 0;
}

-/*
- * The inactive anon list should be small enough that the VM never has to
- * do too much work, but large enough that each inactive page has a chance
- * to be referenced again before it is swapped out.
- *
- * this calculation is straightforward porting from
- * page_alloc.c::setup_per_zone_inactive_ratio().
- * it describe more detail.
- */
-static void mem_cgroup_set_inactive_ratio(struct mem_cgroup *memcg)
-{
- unsigned int gb, ratio;
-
- gb = res_counter_read_u64(&memcg->res, RES_LIMIT) >> 30;
- if (gb)
- ratio = int_sqrt(10 * gb);
- else
- ratio = 1;
-
- memcg->inactive_ratio = ratio;
-
-}
-
static DEFINE_MUTEX(set_limit_mutex);

static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
@@ -1472,9 +1474,6 @@ static int mem_cgroup_resize_limit(struc
if (!progress) retry_count--;
}

- if (!ret)
- mem_cgroup_set_inactive_ratio(memcg);
-
return ret;
}

@@ -1833,7 +1832,7 @@ static int mem_control_stat_show(struct
}

#ifdef CONFIG_DEBUG_VM
- cb->fill(cb, "inactive_ratio", mem_cont->inactive_ratio);
+ cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));

{
int nid, zid;
@@ -2125,7 +2124,6 @@ mem_cgroup_create(struct cgroup_subsys *
res_counter_init(&mem->res, NULL);
res_counter_init(&mem->memsw, NULL);
}
- mem_cgroup_set_inactive_ratio(mem);
mem->last_scanned_child = NULL;
spin_lock_init(&mem->reclaim_param_lock);

Index: b/mm/vmscan.c
===================================================================
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1394,7 +1394,7 @@ static int inactive_anon_is_low(struct z
if (scanning_global_lru(sc))
low = inactive_anon_is_low_global(zone);
else
- low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone);
+ low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
return low;
}
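
(Editor's note: as a quick sanity check of the ratio computed by
calc_inactive_ratio() above, here is a small standalone C sketch of the same
arithmetic; sqrt() stands in for the kernel's int_sqrt(), and truncation
matches for the values shown.)
==
#include <math.h>
#include <stdio.h>

/* inactive_ratio = int_sqrt(10 * gb), or 1 when below 1GB of anon pages */
static unsigned long inactive_ratio(unsigned long anon_pages, int page_shift)
{
	unsigned long gb = anon_pages >> (30 - page_shift);
	return gb ? (unsigned long)sqrt(10.0 * gb) : 1;
}

int main(void)
{
	/* with 4K pages: 1GB -> 3, 4GB -> 6, 16GB -> 12 */
	printf("%lu %lu %lu\n",
	       inactive_ratio(1UL << 18, 12),
	       inactive_ratio(1UL << 20, 12),
	       inactive_ratio(1UL << 22, 12));
	return 0;
}
==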


2008-12-11 03:20:42

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [RFC][PATCH 5/6] fix inactive_ratio under hierarchy

On Thu, 11 Dec 2008 12:14:04 +0900 (JST)
KOSAKI Motohiro <[email protected]> wrote:

> Hi
>
> > ex)In following tree,
> > /opt/cgroup/01 limit=1G
> > /opt/cgroup/01/A limit=500M
> > /opt/cgroup/01/A/B limit=unlimited
> > /opt/cgroup/01/A/C limit=50M
> > /opt/cgroup/01/Z limit=700M
> >
> >
> > /opt/cgroup/01's inactive_ratio is calculated by limit of 1G.
> > /opt/cgroup/01/A's inactive_ratio is calculated by limit of 500M
> > /opt/cgroup/01/A/B's inactive_ratio is calculated by limit of 500M.
> > /opt/cgroup/01/A/C's inactive_ratio is calculated by limit of 50M.
> > /opt/cgroup/01/Z's inactive_ratio is calculated by limit of 700M.
>
> this is one of good choice.
> but I think it is a bit complex rule.
>
>
> How about this?
>
> ===============
> Currently, inactive_ratio of memcg is calculated at setting limit.
> because page_alloc.c does so and current implementation is straightforward porting.
>
> However, memcg introduced hierarchy feature recently.
> In hierarchy restriction, memory limit is not only decided memory.limit_in_bytes of current cgroup,
> but also parent limit and sibling memory usage.
>
> Then, The optimal inactive_ratio is changed frequently.
> So, everytime calculation is better.
>
Thanks, I'll test this.

-Kame



>
> Signed-off-by: KOSAKI Motohiro <[email protected]>
> CC: KAMEZAWA Hiroyuki <[email protected]>
> CC: Balbir Singh <[email protected]>
> ---
> include/linux/memcontrol.h | 3 --
> mm/memcontrol.c | 64 +++++++++++++++++++++------------------------
> mm/vmscan.c | 2 -
> 3 files changed, 33 insertions(+), 36 deletions(-)
>
> Index: b/include/linux/memcontrol.h
> ===================================================================
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -95,8 +95,7 @@ extern void mem_cgroup_note_reclaim_prio
> int priority);
> extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem,
> int priority);
> -int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg,
> - struct zone *zone);
> +int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
> unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
> struct zone *zone,
> enum lru_list lru);
> Index: b/mm/memcontrol.c
> ===================================================================
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -167,9 +167,6 @@ struct mem_cgroup {
>
> unsigned int swappiness;
>
> -
> - unsigned int inactive_ratio;
> -
> /*
> * statistics. This must be placed at the end of memcg.
> */
> @@ -433,15 +430,43 @@ void mem_cgroup_record_reclaim_priority(
> spin_unlock(&mem->reclaim_param_lock);
> }
>
> -int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
> +static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
> {
> unsigned long active;
> unsigned long inactive;
> + unsigned long gb;
> + unsigned long inactive_ratio;
>
> inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON);
> active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON);
>
> - if (inactive * memcg->inactive_ratio < active)
> + gb = (inactive + active) >> (30 - PAGE_SHIFT);
> + if (gb)
> + inactive_ratio = int_sqrt(10 * gb);
> + else
> + inactive_ratio = 1;
> +
> + if (present_pages) {
> + present_pages[0] = inactive;
> + present_pages[1] = active;
> + }
> +
> + return inactive_ratio;
> +}
> +
> +int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
> +{
> + unsigned long active;
> + unsigned long inactive;
> + unsigned long present_pages[2];
> + unsigned long inactive_ratio;
> +
> + inactive_ratio = calc_inactive_ratio(memcg, present_pages);
> +
> + inactive = present_pages[0];
> + active = present_pages[1];
> +
> + if (inactive * inactive_ratio < active)
> return 1;
>
> return 0;
> @@ -1410,29 +1435,6 @@ int mem_cgroup_shrink_usage(struct mm_st
> return 0;
> }
>
> -/*
> - * The inactive anon list should be small enough that the VM never has to
> - * do too much work, but large enough that each inactive page has a chance
> - * to be referenced again before it is swapped out.
> - *
> - * this calculation is straightforward porting from
> - * page_alloc.c::setup_per_zone_inactive_ratio().
> - * it describe more detail.
> - */
> -static void mem_cgroup_set_inactive_ratio(struct mem_cgroup *memcg)
> -{
> - unsigned int gb, ratio;
> -
> - gb = res_counter_read_u64(&memcg->res, RES_LIMIT) >> 30;
> - if (gb)
> - ratio = int_sqrt(10 * gb);
> - else
> - ratio = 1;
> -
> - memcg->inactive_ratio = ratio;
> -
> -}
> -
> static DEFINE_MUTEX(set_limit_mutex);
>
> static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
> @@ -1472,9 +1474,6 @@ static int mem_cgroup_resize_limit(struc
> if (!progress) retry_count--;
> }
>
> - if (!ret)
> - mem_cgroup_set_inactive_ratio(memcg);
> -
> return ret;
> }
>
> @@ -1833,7 +1832,7 @@ static int mem_control_stat_show(struct
> }
>
> #ifdef CONFIG_DEBUG_VM
> - cb->fill(cb, "inactive_ratio", mem_cont->inactive_ratio);
> + cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
>
> {
> int nid, zid;
> @@ -2125,7 +2124,6 @@ mem_cgroup_create(struct cgroup_subsys *
> res_counter_init(&mem->res, NULL);
> res_counter_init(&mem->memsw, NULL);
> }
> - mem_cgroup_set_inactive_ratio(mem);
> mem->last_scanned_child = NULL;
> spin_lock_init(&mem->reclaim_param_lock);
>
> Index: b/mm/vmscan.c
> ===================================================================
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1394,7 +1394,7 @@ static int inactive_anon_is_low(struct z
> if (scanning_global_lru(sc))
> low = inactive_anon_is_low_global(zone);
> else
> - low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone);
> + low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
> return low;
> }
>
>
>
>

2008-12-11 12:44:34

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/6] memcg: fix pre_destory handler

On Wed, 10 Dec 2008 16:24:44 -0800
Paul Menage <[email protected]> wrote:

> On Wed, Dec 10, 2008 at 4:21 PM, KAMEZAWA Hiroyuki
> <[email protected]> wrote:
> > If per-css, looking up function will be
> > ==
> > struct cgroup_subsys_state *cgroup_css_lookup(subsys_id, id)
> > ==
> > Do you mean this ?
>
> Yes, plausibly. And we can presumably have a separate idr per subsystem.
>

This is my tentative patch. Does this look and interface seem OK?
(still under test)
==

From: KAMEZAWA Hiroyuki <[email protected]>

Patch for Per-CSS ID and private hierarchy code.

This patch tries to assign an ID to each css. It attaches a unique ID to each
css and provides the following functions.

 - css_lookup(subsys, id)
   returns the struct cgroup_subsys_state with that id.
 - css_get_next(subsys, id, rootid, depth, foundid)
   returns the next css under "root" by scanning a bitmap (not by tree walk).

When cgroup_subsys->use_id is set, the id field of each css is maintained.
kernel/cgroup.c just prepares
 - the css_id of the root css for the subsys
 - alloc/free ID functions.
So, each subsys should allocate an ID in its attach() callback if necessary.

There are several reasons to develop this.

- While trying to implement hierarchy in memory cgroup, we have to
implement "walk under hierarchy" code.
Now it consists of cgroup_lock and tree up-down code. Because
memory cgroup has to do hierarchy walks in other places for
intelligent processing, we'll reuse the "walk" code.
But taking "cgroup_lock" while walking the tree can cause deadlocks.
An easier way is helpful.

- SwapCgroup uses an array of "pointer" to record the owner of each swap entry.
With IDs, we can reduce this to a "short" or "int", as sketched just below.
This means IDs are useful for reducing the space consumed by pointers
if the access cost is not a problem.
(I hear bio-cgroup will use the same kind of...)
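
(Editor's illustration of the space saving meant above; the array names and
MAX_SWAP_ENTRIES are hypothetical.)
==
struct mem_cgroup *swap_owner_ptr[MAX_SWAP_ENTRIES]; /* 8 bytes per entry on 64-bit */
unsigned short     swap_owner_id[MAX_SWAP_ENTRIES];  /* 2 bytes per entry with CSS IDs */
==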

Example) OOM-Killer under hierarchy.
	do {
		rcu_read_lock();
		next = css_get_next(ss, nextid, rootid, rootdepth, &found);
		/* check sanity of next here */
		if (next)
			css_tryget(next);
		rcu_read_unlock();
		if (!next)
			break;
		cgroup_scan_tasks(...);	/* e.g. select_bad_process */
		/* record score here... */
		nextid = found + 1;
	} while (1);


Characteristics:
- Each css has a unique ID under its subsys.
- A css ID contains the "ID", its "depth in hierarchy", and the stack of ancestor IDs.
- Allowed IDs are 1-65535; ID 0 is the UNUSED ID.

Design Choices:
- scan-by-ID vs. scan-by-tree-walk.
As /proc's PID scan does, scan-by-ID is robust when scanning is done
by the following kind of routine:
scan -> rest a while (release a lock) -> continue from where it was interrupted.
memcg's hierarchical reclaim does this.

- When subsys->use_id is set, the number of css objects in the system is limited to
65534.
- The max depth of the hierarchy is also limited.

Changelog: (v4) -> (v5)
- Totally re-designed as per-css ID.
Changelog:(v3) -> (v4)
- updated comments.
- renamed hierarchy_code[] to stack[]
- merged prepare_id routines.

Changelog (v2) -> (v3)
- removed cgroup_id_getref().
- added cgroup_id_tryget().

Changelog (v1) -> (v2):
- Design change: show only ID(integer) to outside of cgroup.c
- moved cgroup ID definition from include/ to kernel/cgroup.c
- struct cgroup_id is freed by RCU.
- changed interface from pointer to "int"
- kill_sb() is handled.
- ID 0 as unused ID.

Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
include/linux/cgroup.h | 51 +++++++++
include/linux/idr.h | 1
kernel/cgroup.c | 251 ++++++++++++++++++++++++++++++++++++++++++++++++-
lib/idr.c | 46 ++++++++
4 files changed, 347 insertions(+), 2 deletions(-)

Index: mmotm-2.6.28-Dec10/include/linux/cgroup.h
===================================================================
--- mmotm-2.6.28-Dec10.orig/include/linux/cgroup.h
+++ mmotm-2.6.28-Dec10/include/linux/cgroup.h
@@ -15,13 +15,14 @@
#include <linux/cgroupstats.h>
#include <linux/prio_heap.h>
#include <linux/rwsem.h>
-
+#include <linux/idr.h>
#ifdef CONFIG_CGROUPS

struct cgroupfs_root;
struct cgroup_subsys;
struct inode;
struct cgroup;
+struct css_id;

extern int cgroup_init_early(void);
extern int cgroup_init(void);
@@ -59,6 +60,8 @@ struct cgroup_subsys_state {
atomic_t refcnt;

unsigned long flags;
+ /* ID for this css, if possible */
+ struct css_id *id;
};

/* bits in struct cgroup_subsys_state flags field */
@@ -360,6 +363,13 @@ struct cgroup_subsys {
int active;
int disabled;
int early_init;
+ /*
+ * set 1 if the subsys uses IDs. IDs are not available before cgroup_init()
+ * (i.e. not available at early_init time).
+ */
+ int use_id;
+ /* this defines max depth of hierarchy of this subsys if using ID. */
+ int max_depth;
#define MAX_CGROUP_TYPE_NAMELEN 32
const char *name;

@@ -373,6 +383,9 @@ struct cgroup_subsys {
/* Protected by this->hierarchy_mutex and cgroup_lock() */
struct cgroupfs_root *root;
struct list_head sibling;
+ /* used when use_id == 1 */
+ struct idr idr;
+ spinlock_t id_lock;
};

#define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
@@ -426,6 +439,42 @@ void cgroup_iter_end(struct cgroup *cgrp
int cgroup_scan_tasks(struct cgroup_scanner *scan);
int cgroup_attach_task(struct cgroup *, struct task_struct *);

+/*
+ * A CSS ID is an ID for each css struct under a subsys. It only works when
+ * cgroup_subsys->use_id != 0. It can be used for lookup and scanning.
+ * A CSS ID is assigned at css allocation (create) and removed when the
+ * refcnt of the ID goes down to 0. The refcnt is incremented when a subsys
+ * wants to avoid reuse of the ID for persistent objects. Usually, the refcnt
+ * of an ID is already 0 when the cgroup is removed.
+ * These look-up and scan functions should be called under rcu_read_lock().
+ * cgroup_lock() is not necessary.
+ *
+ * Note: When using IDs, the max depth of the hierarchy is determined by
+ * cgroup_subsys->max_depth.
+ */
+
+/* called at create() */
+int css_id_alloc(struct cgroup_subsys *ss, struct cgroup_subsys_state *parent,
+ struct cgroup_subsys_state *css);
+/* called at destroy(), or somewhere we can free ID */
+void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css);
+
+/* Find a cgroup which the "ID" is attached. */
+struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id);
+/*
+ * Get next cgroup under tree. Returning a cgroup which has equal or greater
+ * ID than "id" in argument.
+ */
+struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss,
+ int id, int rootid, int depth, int *foundid);
+
+/* get id and depth of css */
+unsigned short css_id(struct cgroup_subsys_state *css);
+unsigned short css_depth(struct cgroup_subsys_state *css);
+/* returns non-zero if root is ancestor of cg */
+int css_is_ancestor(struct cgroup_subsys_state *cg,
+ struct cgroup_subsys_state *root);
+
#else /* !CONFIG_CGROUPS */

static inline int cgroup_init_early(void) { return 0; }
Index: mmotm-2.6.28-Dec10/kernel/cgroup.c
===================================================================
--- mmotm-2.6.28-Dec10.orig/kernel/cgroup.c
+++ mmotm-2.6.28-Dec10/kernel/cgroup.c
@@ -46,7 +46,7 @@
#include <linux/cgroupstats.h>
#include <linux/hash.h>
#include <linux/namei.h>
-
+#include <linux/idr.h>
#include <asm/atomic.h>

static DEFINE_MUTEX(cgroup_mutex);
@@ -185,6 +185,8 @@ struct cg_cgroup_link {
static struct css_set init_css_set;
static struct cg_cgroup_link init_css_set_link;

+static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
+
/* css_set_lock protects the list of css_set objects, and the
* chain of tasks off each css_set. Nests outside task->alloc_lock
* due to cgroup_iter_start() */
@@ -2684,6 +2686,8 @@ int __init cgroup_init(void)
struct cgroup_subsys *ss = subsys[i];
if (!ss->early_init)
cgroup_init_subsys(ss);
+ if (ss->use_id)
+ cgroup_subsys_init_idr(ss);
}

/* Add init_css_set to the hash table */
@@ -3215,3 +3219,248 @@ static int __init cgroup_disable(char *s
return 1;
}
__setup("cgroup_disable=", cgroup_disable);
+
+/*
+ * CSS ID
+ */
+struct css_id {
+ /*
+ * The css to which this ID points. If the cgroup is removed,
+ * this will point to NULL.
+ */
+ struct cgroup_subsys_state *css;
+ /*
+ * ID of this css.
+ */
+ unsigned short id;
+ /*
+ * Depth in hierarchy which this ID belongs to.
+ */
+ unsigned short depth;
+ /*
+ * ID is freed by RCU. (and lookup routine is RCU safe.)
+ */
+ struct rcu_head rcu_head;
+ /*
+ * Hierarchy of CSS ID belongs to.
+ */
+ unsigned short stack[0];
+};
+
+/*
+ * To get an ID other than 0, this should be called when !cgroup_is_removed().
+ */
+unsigned short css_id(struct cgroup_subsys_state *css)
+{
+ struct css_id *cssid = rcu_dereference(css->id);
+
+ if (cssid)
+ return cssid->id;
+ return 0;
+}
+
+unsigned short css_depth(struct cgroup_subsys_state *css)
+{
+ struct css_id *cssid = rcu_dereference(css->id);
+
+ if (cssid)
+ return cssid->depth;
+ return 0;
+}
+
+int css_is_ancestor(struct cgroup_subsys_state *css,
+ struct cgroup_subsys_state *root)
+{
+ struct css_id *id = css->id;
+ struct css_id *ans = root->id;
+
+ if (!id || !ans)
+ return 0;
+ return id->stack[ans->depth] == ans->id;
+}
+
+static void __free_css_id_cb(struct rcu_head *head)
+{
+ struct css_id *id;
+
+ id = container_of(head, struct css_id, rcu_head);
+ kfree(id);
+}
+
+void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
+{
+ struct css_id *id = css->id;
+
+ BUG_ON(!ss->use_id);
+
+ rcu_assign_pointer(id->css, NULL);
+ rcu_assign_pointer(css->id, NULL);
+ spin_lock(&ss->id_lock);
+ idr_remove(&ss->idr, id->id);
+ spin_unlock(&ss->id_lock);
+ call_rcu(&id->rcu_head, __free_css_id_cb);
+}
+
+
+static int __get_and_prepare_newid(struct cgroup_subsys *ss,
+ struct css_id **ret)
+{
+ struct css_id *newid;
+ int myid, error, size;
+
+ BUG_ON(!ss->use_id);
+
+ size = sizeof(*newid) + sizeof(unsigned short) * ss->max_depth;
+ newid = kzalloc(size, GFP_KERNEL);
+ if (!newid)
+ return -ENOMEM;
+ /* get id */
+ if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
+ error = -ENOMEM;
+ goto err_out;
+ }
+ spin_lock(&ss->id_lock);
+ /* Don't use 0 */
+ error = idr_get_new_above(&ss->idr, newid, 1, &myid);
+ spin_unlock(&ss->id_lock);
+
+ /* Returns an error when there is no free space for a new ID. */
+ if (error) {
+ error = -ENOSPC;
+ goto err_out;
+ }
+
+ newid->id = myid;
+ *ret = newid;
+ return 0;
+err_out:
+ kfree(newid);
+ return error;
+
+}
+
+
+static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss)
+{
+ struct css_id *newid;
+ struct cgroup_subsys_state *rootcss;
+ int err = -ENOMEM;
+
+ spin_lock_init(&ss->id_lock);
+ idr_init(&ss->idr);
+
+ rootcss = init_css_set.subsys[ss->subsys_id];
+ err = __get_and_prepare_newid(ss, &newid);
+ if (err)
+ return err;
+
+ newid->depth = 0;
+ newid->stack[0] = newid->id;
+ newid->css = rootcss;
+ rootcss->id = newid;
+ return 0;
+}
+
+int css_id_alloc(struct cgroup_subsys *ss,
+ struct cgroup_subsys_state *parent,
+ struct cgroup_subsys_state *css)
+{
+ int i, depth = 0;
+ struct css_id *cssid, *parent_id = NULL;
+ int error;
+
+ if (parent) {
+ parent_id = parent->id;
+ depth = parent_id->depth + 1;
+ }
+
+ if (depth >= ss->max_depth)
+ return -ENOSPC;
+
+ error = __get_and_prepare_newid(ss, &cssid);
+ if (error)
+ return error;
+
+ for (i = 0; i < depth; i++)
+ cssid->stack[i] = parent_id->stack[i];
+ cssid->stack[depth] = cssid->id;
+ cssid->depth = depth;
+
+ rcu_assign_pointer(cssid->css, css);
+ rcu_assign_pointer(css->id, cssid);
+
+ return 0;
+}
+
+/**
+ * css_lookup - lookup css by id
+ * @ss: pointer to the subsystem
+ * @id: the id of the css to be looked up
+ *
+ * Returns a pointer to the css if there is a valid css with this id, NULL if not.
+ * Should be called under rcu_read_lock().
+ */
+
+struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
+{
+ struct cgroup_subsys_state *css = NULL;
+ struct css_id *cssid = NULL;
+
+ BUG_ON(!ss->use_id);
+ rcu_read_lock();
+ cssid = idr_find(&ss->idr, id);
+
+ if (unlikely(!cssid))
+ goto out;
+
+ css = rcu_dereference(cssid->css);
+out:
+ rcu_read_unlock();
+ return css;
+}
+
+/**
+ * css_get_next - lookup next cgroup under specified hierarchy.
+ * @ss: pointer to subsystem
+ * @id: current position of iteration.
+ * @rootid: search tree under this.
+ * @depth: depth of root id.
+ * @foundid: position of found object.
+ *
+ * Search next css under the specified hierarchy of rootid. Calling under
+ * rcu_read_lock() is necessary. Returns NULL if it reaches the end.
+ */
+struct cgroup_subsys_state *
+css_get_next(struct cgroup_subsys *ss,
+ int id, int rootid, int depth, int *foundid)
+{
+ struct cgroup_subsys_state *ret = NULL;
+ struct css_id *tmp;
+ int tmpid;
+
+ BUG_ON(!ss->use_id);
+ rcu_read_lock();
+ tmpid = id;
+ while (1) {
+ /* scan next entry from bitmap(tree) */
+ spin_lock(&ss->id_lock);
+ tmp = idr_get_next(&ss->idr, &tmpid);
+ spin_unlock(&ss->id_lock);
+
+ if (!tmp) {
+ ret = NULL;
+ break;
+ }
+ if (tmp->stack[depth] == rootid) {
+ ret = rcu_dereference(tmp->css);
+ /* Sanity check and check hierarchy */
+ if (ret && !css_is_removed(ret))
+ break;
+ }
+ tmpid = tmpid + 1;
+ }
+
+ rcu_read_unlock();
+ *foundid = tmpid;
+ return ret;
+}
+
Index: mmotm-2.6.28-Dec10/include/linux/idr.h
===================================================================
--- mmotm-2.6.28-Dec10.orig/include/linux/idr.h
+++ mmotm-2.6.28-Dec10/include/linux/idr.h
@@ -106,6 +106,7 @@ int idr_get_new(struct idr *idp, void *p
int idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id);
int idr_for_each(struct idr *idp,
int (*fn)(int id, void *p, void *data), void *data);
+void *idr_get_next(struct idr *idp, int *nextid);
void *idr_replace(struct idr *idp, void *ptr, int id);
void idr_remove(struct idr *idp, int id);
void idr_remove_all(struct idr *idp);
Index: mmotm-2.6.28-Dec10/lib/idr.c
===================================================================
--- mmotm-2.6.28-Dec10.orig/lib/idr.c
+++ mmotm-2.6.28-Dec10/lib/idr.c
@@ -573,6 +573,52 @@ int idr_for_each(struct idr *idp,
EXPORT_SYMBOL(idr_for_each);

/**
+ * idr_get_next - lookup the next object with an id >= the given id
+ * @idp: idr handle
+ * @nextidp: pointer to the lookup key, updated to the id actually found
+ *
+ * Returns a pointer to the registered object with the smallest id greater
+ * than or equal to the given id, or NULL if there is none.
+ */
+
+void *idr_get_next(struct idr *idp, int *nextidp)
+{
+ struct idr_layer *p, *pa[MAX_LEVEL];
+ struct idr_layer **paa = &pa[0];
+ int id = *nextidp;
+ int n, max;
+
+ /* find first ent */
+ n = idp->layers * IDR_BITS;
+ max = 1 << n;
+ p = rcu_dereference(idp->top);
+ if (!p)
+ return NULL;
+
+ while (id < max) {
+ while (n > 0 && p) {
+ n -= IDR_BITS;
+ *paa++ = p;
+ p = rcu_dereference(p->ary[(id >> n) & IDR_MASK]);
+ }
+
+ if (p) {
+ *nextidp = id;
+ return p;
+ }
+
+ id += 1 << n;
+ while (n < fls(id)) {
+ n += IDR_BITS;
+ p = *--paa;
+ }
+ }
+ return NULL;
+}
+
+
+
+/**
* idr_replace - replace pointer for given id
* @idp: idr handle
* @ptr: pointer you want associated with the id