2008-02-25 12:01:17

by Balbir Singh

[permalink] [raw]
Subject: [PATCH] Memory controller rename to Memory Resource Controller



Rename Memory Controller to Memory Resource Controller. Reflect the same
changes in the CONFIG definition for the Memory Resource Controller.
Group together the config options for Resource Counters and Memory
Resource Controller.

This code has been compile tested with the Memory Resource Controller on and off.

Signed-off-by: Balbir Singh <[email protected]>
---

Documentation/controllers/memory.txt | 8 ++++++--
include/linux/cgroup_subsys.h | 2 +-
include/linux/memcontrol.h | 4 ++--
include/linux/mm_types.h | 4 ++--
init/Kconfig | 30 +++++++++++++++---------------
mm/Makefile | 2 +-
mm/oom_kill.c | 2 +-
mm/vmscan.c | 4 ++--
8 files changed, 30 insertions(+), 26 deletions(-)

diff -puN Documentation/controllers/memory.txt~memory-controller-naming-fixes Documentation/controllers/memory.txt
--- linux-2.6.25-rc3/Documentation/controllers/memory.txt~memory-controller-naming-fixes 2008-02-25 14:22:36.000000000 +0530
+++ linux-2.6.25-rc3-balbir/Documentation/controllers/memory.txt 2008-02-25 14:53:49.000000000 +0530
@@ -1,4 +1,8 @@
-Memory Controller
+Memory Resource Controller
+
+NOTE: The Memory Resource Controller has been generically been referred
+to as the memory controller in this document. Do not confuse memory controller
+used here with the memory controller that is used in hardware.

Salient features

@@ -152,7 +156,7 @@ The memory controller uses the following

a. Enable CONFIG_CGROUPS
b. Enable CONFIG_RESOURCE_COUNTERS
-c. Enable CONFIG_CGROUP_MEM_CONT
+c. Enable CONFIG_CGROUP_MEM_RES_CTLR

1. Prepare the cgroups
# mkdir -p /cgroups
diff -puN init/Kconfig~memory-controller-naming-fixes init/Kconfig
--- linux-2.6.25-rc3/init/Kconfig~memory-controller-naming-fixes 2008-02-25 14:22:36.000000000 +0530
+++ linux-2.6.25-rc3-balbir/init/Kconfig 2008-02-25 15:03:43.000000000 +0530
@@ -366,6 +366,21 @@ config RESOURCE_COUNTERS
infrastructure that works with cgroups
depends on CGROUPS

+config CGROUP_MEM_RES_CTLR
+ bool "Memory Resource Controller for Control Groups"
+ depends on CGROUPS && RESOURCE_COUNTERS
+ help
+ Provides a memory resource controller that manages both page cache and
+ RSS memory.
+
+ Note that setting this option increases fixed memory overhead
+ associated with each page of memory in the system by 4/8 bytes
+ and also increases cache misses because struct page on many 64bit
+ systems will not fit into a single cache line anymore.
+
+ Only enable when you're ok with these trade offs and really
+ sure you need the memory resource controller.
+
config SYSFS_DEPRECATED
bool "Create deprecated sysfs files"
depends on SYSFS
@@ -387,21 +402,6 @@ config SYSFS_DEPRECATED
If you are using a distro that was released in 2006 or later,
it should be safe to say N here.

-config CGROUP_MEM_CONT
- bool "Memory controller for cgroups"
- depends on CGROUPS && RESOURCE_COUNTERS
- help
- Provides a memory controller that manages both page cache and
- RSS memory.
-
- Note that setting this option increases fixed memory overhead
- associated with each page of memory in the system by 4/8 bytes
- and also increases cache misses because struct page on many 64bit
- systems will not fit into a single cache line anymore.
-
- Only enable when you're ok with these trade offs and really
- sure you need the memory controller.
-
config PROC_PID_CPUSET
bool "Include legacy /proc/<pid>/cpuset file"
depends on CPUSETS
diff -puN include/linux/memcontrol.h~memory-controller-naming-fixes include/linux/memcontrol.h
--- linux-2.6.25-rc3/include/linux/memcontrol.h~memory-controller-naming-fixes 2008-02-25 14:22:36.000000000 +0530
+++ linux-2.6.25-rc3-balbir/include/linux/memcontrol.h 2008-02-25 15:55:44.000000000 +0530
@@ -25,7 +25,7 @@ struct page_cgroup;
struct page;
struct mm_struct;

-#ifdef CONFIG_CGROUP_MEM_CONT
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR

extern void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p);
extern void mm_free_cgroup(struct mm_struct *mm);
@@ -72,7 +72,7 @@ extern long mem_cgroup_calc_reclaim_acti
extern long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
struct zone *zone, int priority);

-#else /* CONFIG_CGROUP_MEM_CONT */
+#else /* CONFIG_CGROUP_MEM_RES_CTLR */
static inline void mm_init_cgroup(struct mm_struct *mm,
struct task_struct *p)
{
diff -puN mm/memcontrol.c~memory-controller-naming-fixes mm/memcontrol.c
diff -puN mm/vmscan.c~memory-controller-naming-fixes mm/vmscan.c
--- linux-2.6.25-rc3/mm/vmscan.c~memory-controller-naming-fixes 2008-02-25 14:22:36.000000000 +0530
+++ linux-2.6.25-rc3-balbir/mm/vmscan.c 2008-02-25 14:32:55.000000000 +0530
@@ -126,7 +126,7 @@ long vm_total_pages; /* The total number
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

-#ifdef CONFIG_CGROUP_MEM_CONT
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
#define scan_global_lru(sc) (!(sc)->mem_cgroup)
#else
#define scan_global_lru(sc) (1)
@@ -1427,7 +1427,7 @@ unsigned long try_to_free_pages(struct z
return do_try_to_free_pages(zones, gfp_mask, &sc);
}

-#ifdef CONFIG_CGROUP_MEM_CONT
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR

unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
gfp_t gfp_mask)
diff -puN mm/rmap.c~memory-controller-naming-fixes mm/rmap.c
diff -puN mm/Makefile~memory-controller-naming-fixes mm/Makefile
--- linux-2.6.25-rc3/mm/Makefile~memory-controller-naming-fixes 2008-02-25 14:22:36.000000000 +0530
+++ linux-2.6.25-rc3-balbir/mm/Makefile 2008-02-25 14:33:10.000000000 +0530
@@ -32,5 +32,5 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
-obj-$(CONFIG_CGROUP_MEM_CONT) += memcontrol.o
+obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o

diff -puN include/linux/cgroup_subsys.h~memory-controller-naming-fixes include/linux/cgroup_subsys.h
--- linux-2.6.25-rc3/include/linux/cgroup_subsys.h~memory-controller-naming-fixes 2008-02-25 14:31:04.000000000 +0530
+++ linux-2.6.25-rc3-balbir/include/linux/cgroup_subsys.h 2008-02-25 14:31:18.000000000 +0530
@@ -37,7 +37,7 @@ SUBSYS(cpuacct)

/* */

-#ifdef CONFIG_CGROUP_MEM_CONT
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
SUBSYS(mem_cgroup)
#endif

diff -puN include/linux/mm_types.h~memory-controller-naming-fixes include/linux/mm_types.h
--- linux-2.6.25-rc3/include/linux/mm_types.h~memory-controller-naming-fixes 2008-02-25 14:32:18.000000000 +0530
+++ linux-2.6.25-rc3-balbir/include/linux/mm_types.h 2008-02-25 15:13:53.000000000 +0530
@@ -91,7 +91,7 @@ struct page {
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
-#ifdef CONFIG_CGROUP_MEM_CONT
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
unsigned long page_cgroup;
#endif
};
@@ -225,7 +225,7 @@ struct mm_struct {
/* aio bits */
rwlock_t ioctx_list_lock;
struct kioctx *ioctx_list;
-#ifdef CONFIG_CGROUP_MEM_CONT
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
struct mem_cgroup *mem_cgroup;
#endif
};
diff -puN mm/oom_kill.c~memory-controller-naming-fixes mm/oom_kill.c
--- linux-2.6.25-rc3/mm/oom_kill.c~memory-controller-naming-fixes 2008-02-25 14:33:17.000000000 +0530
+++ linux-2.6.25-rc3-balbir/mm/oom_kill.c 2008-02-25 14:33:27.000000000 +0530
@@ -412,7 +412,7 @@ static int oom_kill_process(struct task_
return oom_kill_task(p);
}

-#ifdef CONFIG_CGROUP_MEM_CONT
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
{
unsigned long points = 0;
_

--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL


2008-02-25 12:01:42

by Balbir Singh

[permalink] [raw]
Subject: [PATCH] Memory Resource Controller Add Boot Option



A boot option for the memory controller was discussed on lkml. It is a good
idea to add it, since it saves memory for people who want to turn off the
memory controller.

By default the option is on for the following two reasons

1. It provides compatibility with the current scheme where the memory
controller turns on if the config option is enabled
2. It allows for wider testing of the memory controller, once the config
option is enabled

We still allow the create, destroy callbacks to succeed, since they are
not aware of boot options. We do not populate the directory will
memory resource controller specific files.

Signed-off-by: Balbir Singh <[email protected]>
---

Documentation/kernel-parameters.txt | 2 ++
mm/memcontrol.c | 33 ++++++++++++++++++++++++++++++++-
2 files changed, 34 insertions(+), 1 deletion(-)

diff -L mm/memcontrol.h -puN /dev/null /dev/null
diff -puN include/linux/memcontrol.h~memory-controller-add-boot-option include/linux/memcontrol.h
diff -puN mm/memcontrol.c~memory-controller-add-boot-option mm/memcontrol.c
--- linux-2.6.25-rc3/mm/memcontrol.c~memory-controller-add-boot-option 2008-02-25 15:55:58.000000000 +0530
+++ linux-2.6.25-rc3-balbir/mm/memcontrol.c 2008-02-25 17:10:50.000000000 +0530
@@ -35,6 +35,7 @@

struct cgroup_subsys mem_cgroup_subsys;
static const int MEM_CGROUP_RECLAIM_RETRIES = 5;
+static int mem_cgroup_on __read_mostly = 1; /* turned on/off */

/*
* Statistics for memory cgroup.
@@ -578,6 +579,9 @@ static int mem_cgroup_charge_common(stru
unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct mem_cgroup_per_zone *mz;

+ if (!mem_cgroup_on)
+ return 0;
+
/*
* Should page_cgroup's go to their own slab?
* One could optimize the performance of the charging routine
@@ -729,7 +733,7 @@ void mem_cgroup_uncharge(struct page_cgr
/*
* Check if our page_cgroup is valid
*/
- if (!pc)
+ if (!pc || !mem_cgroup_on)
return;

if (atomic_dec_and_test(&pc->ref_cnt)) {
@@ -755,6 +759,9 @@ void mem_cgroup_uncharge(struct page_cgr

void mem_cgroup_uncharge_page(struct page *page)
{
+ if (!mem_cgroup_on)
+ return;
+
lock_page_cgroup(page);
mem_cgroup_uncharge(page_get_page_cgroup(page));
unlock_page_cgroup(page);
@@ -769,6 +776,10 @@ int mem_cgroup_prepare_migration(struct
{
struct page_cgroup *pc;
int ret = 0;
+
+ if (!mem_cgroup_on)
+ return 0;
+
lock_page_cgroup(page);
pc = page_get_page_cgroup(page);
if (pc && atomic_inc_not_zero(&pc->ref_cnt))
@@ -781,6 +792,9 @@ void mem_cgroup_end_migration(struct pag
{
struct page_cgroup *pc;

+ if (!mem_cgroup_on)
+ return;
+
lock_page_cgroup(page);
pc = page_get_page_cgroup(page);
mem_cgroup_uncharge(pc);
@@ -881,6 +895,10 @@ int mem_cgroup_force_empty(struct mem_cg
{
int ret = -EBUSY;
int node, zid;
+
+ if (!mem_cgroup_on)
+ return 0;
+
css_get(&mem->css);
/*
* page reclaim code (kswapd etc..) will move pages between
@@ -1141,6 +1159,9 @@ static void mem_cgroup_destroy(struct cg
static int mem_cgroup_populate(struct cgroup_subsys *ss,
struct cgroup *cont)
{
+ if (!mem_cgroup_on)
+ return 0;
+
return cgroup_add_files(cont, ss, mem_cgroup_files,
ARRAY_SIZE(mem_cgroup_files));
}
@@ -1153,6 +1174,9 @@ static void mem_cgroup_move_task(struct
struct mm_struct *mm;
struct mem_cgroup *mem, *old_mem;

+ if (!mem_cgroup_on)
+ return;
+
mm = get_task_mm(p);
if (mm == NULL)
return;
@@ -1189,3 +1213,10 @@ struct cgroup_subsys mem_cgroup_subsys =
.attach = mem_cgroup_move_task,
.early_init = 0,
};
+
+static int __init mem_cgroup_startup_disable(char *str)
+{
+ mem_cgroup_on = 0;
+ return 1;
+}
+__setup("memcgroupoff", mem_cgroup_startup_disable);
diff -puN Documentation/kernel-parameters.txt~memory-controller-add-boot-option Documentation/kernel-parameters.txt
--- linux-2.6.25-rc3/Documentation/kernel-parameters.txt~memory-controller-add-boot-option 2008-02-25 15:55:58.000000000 +0530
+++ linux-2.6.25-rc3-balbir/Documentation/kernel-parameters.txt 2008-02-25 15:56:01.000000000 +0530
@@ -1114,6 +1114,8 @@ and is between 256 and 4096 characters.
mem=nopentium [BUGS=X86-32] Disable usage of 4MB pages for kernel
memory.

+ memcgroupoff [KNL] Disable memory resource controller
+
memmap=exactmap [KNL,X86-32,X86_64] Enable setting of an exact
E820 memory map, as specified by the user.
Such memmap=exactmap lines can be constructed based on
_

--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL

2008-02-25 16:17:18

by Paul Menage

[permalink] [raw]
Subject: Re: [PATCH] Memory Resource Controller Add Boot Option

On Mon, Feb 25, 2008 at 3:55 AM, Balbir Singh <[email protected]> wrote:
>
>
> A boot option for the memory controller was discussed on lkml. It is a good
> idea to add it, since it saves memory for people who want to turn off the
> memory controller.
>
> By default the option is on for the following two reasons
>
> 1. It provides compatibility with the current scheme where the memory
> controller turns on if the config option is enabled
> 2. It allows for wider testing of the memory controller, once the config
> option is enabled
>
> We still allow the create, destroy callbacks to succeed, since they are
> not aware of boot options. We do not populate the directory will
> memory resource controller specific files.

Would it make more sense to have a generic cgroups boot option for this?

Something like cgroup_disable=xxx, which would be parsed by cgroups
and would cause:

- a "disabled" flag to be set to true in the subsys object (you could
use this in place of the mem_cgroup_on flag)

- prevent the disabled cgroup from being bound to any mounted
hierarchy (so it would be ignored in a mount with no subsystem
options, and a mount with options that specifically pick that
subsystem would give an error)

Paul

2008-02-25 17:24:14

by Balbir Singh

[permalink] [raw]
Subject: Re: [PATCH] Memory Resource Controller Add Boot Option

Paul Menage wrote:
> On Mon, Feb 25, 2008 at 3:55 AM, Balbir Singh <[email protected]> wrote:
>>
>> A boot option for the memory controller was discussed on lkml. It is a good
>> idea to add it, since it saves memory for people who want to turn off the
>> memory controller.
>>
>> By default the option is on for the following two reasons
>>
>> 1. It provides compatibility with the current scheme where the memory
>> controller turns on if the config option is enabled
>> 2. It allows for wider testing of the memory controller, once the config
>> option is enabled
>>
>> We still allow the create, destroy callbacks to succeed, since they are
>> not aware of boot options. We do not populate the directory will
>> memory resource controller specific files.
>
> Would it make more sense to have a generic cgroups boot option for this?
>
> Something like cgroup_disable=xxx, which would be parsed by cgroups
> and would cause:
>
> - a "disabled" flag to be set to true in the subsys object (you could
> use this in place of the mem_cgroup_on flag)
>

I thought about it, but it did not work out all that well. The reason being,
that the memory controller is called in from places besides cgroup.
mem_cgroup_charge_common() for example is called from several places in mm.
Calling into cgroups to check, enabled/disabled did not seem right.

Hence I put the boot option in mm/memcontrol.c

> - prevent the disabled cgroup from being bound to any mounted
> hierarchy (so it would be ignored in a mount with no subsystem
> options, and a mount with options that specifically pick that
> subsystem would give an error)
>

The controller can be bound, but I just don't populate the files associated with
the controller

> Paul


--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL

2008-02-25 17:32:20

by Paul Menage

[permalink] [raw]
Subject: Re: [PATCH] Memory Resource Controller Add Boot Option

On Mon, Feb 25, 2008 at 9:18 AM, Balbir Singh <[email protected]> wrote:
>
> I thought about it, but it did not work out all that well. The reason being,
> that the memory controller is called in from places besides cgroup.
> mem_cgroup_charge_common() for example is called from several places in mm.
> Calling into cgroups to check, enabled/disabled did not seem right.

You wouldn't need to call into cgroups - if it's a flag in the subsys
object (which is defined in memcontrol.c) you'd just say

if (mem_cgroup_subsys.disabled) {
...
}

I'll send out a prototype for comment.

Paul

2008-02-25 17:42:40

by Balbir Singh

[permalink] [raw]
Subject: Re: [PATCH] Memory Resource Controller Add Boot Option

Paul Menage wrote:
> On Mon, Feb 25, 2008 at 9:18 AM, Balbir Singh <[email protected]> wrote:
>> I thought about it, but it did not work out all that well. The reason being,
>> that the memory controller is called in from places besides cgroup.
>> mem_cgroup_charge_common() for example is called from several places in mm.
>> Calling into cgroups to check, enabled/disabled did not seem right.
>
> You wouldn't need to call into cgroups - if it's a flag in the subsys
> object (which is defined in memcontrol.c) you'd just say
>
> if (mem_cgroup_subsys.disabled) {
> ...
> }
>
> I'll send out a prototype for comment.

Sure thing, if css has the flag, then it would nice. Could you wrap it up to say
something like css_disabled(&mem_cgroup_subsys)


--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL

2008-02-25 18:54:49

by Paul Menage

[permalink] [raw]
Subject: Re: [PATCH] Memory Resource Controller Add Boot Option

>> I'll send out a prototype for comment.

Something like the patch below. The effects of cgroup_disable=foo are:

- foo doesn't show up in /proc/cgroups
- foo isn't auto-mounted if you mount all cgroups in a single hierarchy
- foo isn't visible as an individually mountable subsystem

As a result there will only ever be one call to foo->create(), at init time; all processes will stay in this group, and the group will never be mounted on a visible hierarchy. Any additional effects (e.g. not allocating metadata) are up to the foo subsystem.

This doesn't handle early_init subsystems (their "disabled" bit isn't set be, but it could easily be extended to do so if any of the early_init systems wanted it - I think it would just involve some nastier parameter processing since it would occur before the command-line argument parser had been run.

include/linux/cgroup.h | 1 +
kernel/cgroup.c | 29 +++++++++++++++++++++++++++--
2 files changed, 28 insertions(+), 2 deletions(-)

Index: cgroup_disable-2.6.25-rc2-mm1/include/linux/cgroup.h
===================================================================
--- cgroup_disable-2.6.25-rc2-mm1.orig/include/linux/cgroup.h
+++ cgroup_disable-2.6.25-rc2-mm1/include/linux/cgroup.h
@@ -256,6 +256,7 @@ struct cgroup_subsys {
void (*bind)(struct cgroup_subsys *ss, struct cgroup *root);
int subsys_id;
int active;
+ int disabled;
int early_init;
#define MAX_CGROUP_TYPE_NAMELEN 32
const char *name;
Index: cgroup_disable-2.6.25-rc2-mm1/kernel/cgroup.c
===================================================================
--- cgroup_disable-2.6.25-rc2-mm1.orig/kernel/cgroup.c
+++ cgroup_disable-2.6.25-rc2-mm1/kernel/cgroup.c
@@ -790,7 +790,14 @@ static int parse_cgroupfs_options(char *
if (!*token)
return -EINVAL;
if (!strcmp(token, "all")) {
- opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1;
+ /* Add all non-disabled subsystems */
+ int i;
+ opts->subsys_bits = 0;
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (!ss->disabled)
+ opts->subsys_bits |= 1ul << i;
+ }
} else if (!strcmp(token, "noprefix")) {
set_bit(ROOT_NOPREFIX, &opts->flags);
} else if (!strncmp(token, "release_agent=", 14)) {
@@ -808,7 +815,8 @@ static int parse_cgroupfs_options(char *
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
ss = subsys[i];
if (!strcmp(token, ss->name)) {
- set_bit(i, &opts->subsys_bits);
+ if (!ss->disabled)
+ set_bit(i, &opts->subsys_bits);
break;
}
}
@@ -2596,6 +2606,8 @@ static int proc_cgroupstats_show(struct
mutex_lock(&cgroup_mutex);
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
+ if (ss->disabled)
+ continue;
seq_printf(m, "%s\t%lu\t%d\n",
ss->name, ss->root->subsys_bits,
ss->root->number_of_cgroups);
@@ -2991,3 +3003,16 @@ static void cgroup_release_agent(struct
spin_unlock(&release_list_lock);
mutex_unlock(&cgroup_mutex);
}
+
+static int __init cgroup_disable(char *str)
+{
+ int i;
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (!strcmp(str, ss->name)) {
+ ss->disabled = 1;
+ break;
+ }
+ }
+}
+__setup("cgroup_disable=", cgroup_disable);


>
> Sure thing, if css has the flag, then it would nice. Could you wrap it up to say
> something like css_disabled(&mem_cgroup_subsys)
>
>

It's the subsys object rather than the css (cgroup_subsys_state).

We could have something like:

#define cgroup_subsys_disabled(_ss) ((ss_)->disabled)

but I don't see that

cgroup_subsys_disabled(&mem_cgroup_subsys)

is better than just putting

mem_cgroup_subsys.disabled

Paul

2008-02-26 03:03:51

by Li Zefan

[permalink] [raw]
Subject: Re: [PATCH] Memory Resource Controller Add Boot Option

Paul Menage wrote:
>>> I'll send out a prototype for comment.
>
> Something like the patch below. The effects of cgroup_disable=foo are:
>
> - foo doesn't show up in /proc/cgroups

Or we can print out the disable flag, maybe this will be better?
Because we can distinguish from disabled and not compiled in from
/proc/cgroups.

> - foo isn't auto-mounted if you mount all cgroups in a single hierarchy
> - foo isn't visible as an individually mountable subsystem

You mentioned in a previous mail if we mount a disabled subsystem we
will get an error. Here we just ignore the mount option. Which makes
more sense ?

>
> As a result there will only ever be one call to foo->create(), at init
> time; all processes will stay in this group, and the group will never be
> mounted on a visible hierarchy. Any additional effects (e.g. not
> allocating metadata) are up to the foo subsystem.
>
> This doesn't handle early_init subsystems (their "disabled" bit isn't
> set be, but it could easily be extended to do so if any of the
> early_init systems wanted it - I think it would just involve some
> nastier parameter processing since it would occur before the
> command-line argument parser had been run.
>
> include/linux/cgroup.h | 1 +
> kernel/cgroup.c | 29 +++++++++++++++++++++++++++--
> 2 files changed, 28 insertions(+), 2 deletions(-)
>
> Index: cgroup_disable-2.6.25-rc2-mm1/include/linux/cgroup.h
> ===================================================================
> --- cgroup_disable-2.6.25-rc2-mm1.orig/include/linux/cgroup.h
> +++ cgroup_disable-2.6.25-rc2-mm1/include/linux/cgroup.h
> @@ -256,6 +256,7 @@ struct cgroup_subsys {
> void (*bind)(struct cgroup_subsys *ss, struct cgroup *root);
> int subsys_id;
> int active;
> + int disabled;
> int early_init;
> #define MAX_CGROUP_TYPE_NAMELEN 32
> const char *name;
> Index: cgroup_disable-2.6.25-rc2-mm1/kernel/cgroup.c
> ===================================================================
> --- cgroup_disable-2.6.25-rc2-mm1.orig/kernel/cgroup.c
> +++ cgroup_disable-2.6.25-rc2-mm1/kernel/cgroup.c
> @@ -790,7 +790,14 @@ static int parse_cgroupfs_options(char *
> if (!*token)
> return -EINVAL;
> if (!strcmp(token, "all")) {
> - opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1;
> + /* Add all non-disabled subsystems */
> + int i;
> + opts->subsys_bits = 0;
> + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> + struct cgroup_subsys *ss = subsys[i];
> + if (!ss->disabled)
> + opts->subsys_bits |= 1ul << i;
> + }
> } else if (!strcmp(token, "noprefix")) {
> set_bit(ROOT_NOPREFIX, &opts->flags);
> } else if (!strncmp(token, "release_agent=", 14)) {
> @@ -808,7 +815,8 @@ static int parse_cgroupfs_options(char *
> for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> ss = subsys[i];
> if (!strcmp(token, ss->name)) {
> - set_bit(i, &opts->subsys_bits);
> + if (!ss->disabled)
> + set_bit(i, &opts->subsys_bits);
> break;
> }
> }
> @@ -2596,6 +2606,8 @@ static int proc_cgroupstats_show(struct
> mutex_lock(&cgroup_mutex);
> for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> struct cgroup_subsys *ss = subsys[i];
> + if (ss->disabled)
> + continue;
> seq_printf(m, "%s\t%lu\t%d\n",
> ss->name, ss->root->subsys_bits,
> ss->root->number_of_cgroups);
> @@ -2991,3 +3003,16 @@ static void cgroup_release_agent(struct
> spin_unlock(&release_list_lock);
> mutex_unlock(&cgroup_mutex);
> }
> +
> +static int __init cgroup_disable(char *str)
> +{
> + int i;
> + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> + struct cgroup_subsys *ss = subsys[i];
> + if (!strcmp(str, ss->name)) {
> + ss->disabled = 1;
> + break;
> + }
> + }
> +}
> +__setup("cgroup_disable=", cgroup_disable);
>
>
>>
>> Sure thing, if css has the flag, then it would nice. Could you wrap it
>> up to say
>> something like css_disabled(&mem_cgroup_subsys)
>>
>>
>
> It's the subsys object rather than the css (cgroup_subsys_state).
>
> We could have something like:
>
> #define cgroup_subsys_disabled(_ss) ((ss_)->disabled)
>
> but I don't see that
> cgroup_subsys_disabled(&mem_cgroup_subsys)
> is better than just putting
>
> mem_cgroup_subsys.disabled
>
> Paul
>
>

2008-02-26 06:59:59

by Hirokazu Takahashi

[permalink] [raw]
Subject: Re: [PATCH] Memory Resource Controller Add Boot Option

Hi,

> >>> I'll send out a prototype for comment.
> >
> > Something like the patch below. The effects of cgroup_disable=foo are:
> >
> > - foo doesn't show up in /proc/cgroups
>
> Or we can print out the disable flag, maybe this will be better?
> Because we can distinguish from disabled and not compiled in from
> /proc/cgroups.

It would be neat if the disable flag /proc/cgroups can be cleared/set
on demand. It will depend on the implementation of each controller
whether it works or not.

> > - foo isn't auto-mounted if you mount all cgroups in a single hierarchy
> > - foo isn't visible as an individually mountable subsystem
>
> You mentioned in a previous mail if we mount a disabled subsystem we
> will get an error. Here we just ignore the mount option. Which makes
> more sense ?
>
> >
> > As a result there will only ever be one call to foo->create(), at init
> > time; all processes will stay in this group, and the group will never be
> > mounted on a visible hierarchy. Any additional effects (e.g. not
> > allocating metadata) are up to the foo subsystem.
> >
> > This doesn't handle early_init subsystems (their "disabled" bit isn't
> > set be, but it could easily be extended to do so if any of the
> > early_init systems wanted it - I think it would just involve some
> > nastier parameter processing since it would occur before the
> > command-line argument parser had been run.
> >
> > include/linux/cgroup.h | 1 +
> > kernel/cgroup.c | 29 +++++++++++++++++++++++++++--
> > 2 files changed, 28 insertions(+), 2 deletions(-)
> >
> > Index: cgroup_disable-2.6.25-rc2-mm1/include/linux/cgroup.h
> > ===================================================================
> > --- cgroup_disable-2.6.25-rc2-mm1.orig/include/linux/cgroup.h
> > +++ cgroup_disable-2.6.25-rc2-mm1/include/linux/cgroup.h
> > @@ -256,6 +256,7 @@ struct cgroup_subsys {
> > void (*bind)(struct cgroup_subsys *ss, struct cgroup *root);
> > int subsys_id;
> > int active;
> > + int disabled;
> > int early_init;
> > #define MAX_CGROUP_TYPE_NAMELEN 32
> > const char *name;
> > Index: cgroup_disable-2.6.25-rc2-mm1/kernel/cgroup.c
> > ===================================================================
> > --- cgroup_disable-2.6.25-rc2-mm1.orig/kernel/cgroup.c
> > +++ cgroup_disable-2.6.25-rc2-mm1/kernel/cgroup.c
> > @@ -790,7 +790,14 @@ static int parse_cgroupfs_options(char *
> > if (!*token)
> > return -EINVAL;
> > if (!strcmp(token, "all")) {
> > - opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1;
> > + /* Add all non-disabled subsystems */
> > + int i;
> > + opts->subsys_bits = 0;
> > + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> > + struct cgroup_subsys *ss = subsys[i];
> > + if (!ss->disabled)
> > + opts->subsys_bits |= 1ul << i;
> > + }
> > } else if (!strcmp(token, "noprefix")) {
> > set_bit(ROOT_NOPREFIX, &opts->flags);
> > } else if (!strncmp(token, "release_agent=", 14)) {
> > @@ -808,7 +815,8 @@ static int parse_cgroupfs_options(char *
> > for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> > ss = subsys[i];
> > if (!strcmp(token, ss->name)) {
> > - set_bit(i, &opts->subsys_bits);
> > + if (!ss->disabled)
> > + set_bit(i, &opts->subsys_bits);
> > break;
> > }
> > }
> > @@ -2596,6 +2606,8 @@ static int proc_cgroupstats_show(struct
> > mutex_lock(&cgroup_mutex);
> > for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> > struct cgroup_subsys *ss = subsys[i];
> > + if (ss->disabled)
> > + continue;
> > seq_printf(m, "%s\t%lu\t%d\n",
> > ss->name, ss->root->subsys_bits,
> > ss->root->number_of_cgroups);
> > @@ -2991,3 +3003,16 @@ static void cgroup_release_agent(struct
> > spin_unlock(&release_list_lock);
> > mutex_unlock(&cgroup_mutex);
> > }
> > +
> > +static int __init cgroup_disable(char *str)
> > +{
> > + int i;
> > + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> > + struct cgroup_subsys *ss = subsys[i];
> > + if (!strcmp(str, ss->name)) {
> > + ss->disabled = 1;
> > + break;
> > + }
> > + }
> > +}
> > +__setup("cgroup_disable=", cgroup_disable);
> >
> >
> >>
> >> Sure thing, if css has the flag, then it would nice. Could you wrap it
> >> up to say
> >> something like css_disabled(&mem_cgroup_subsys)
> >>
> >>
> >
> > It's the subsys object rather than the css (cgroup_subsys_state).
> >
> > We could have something like:
> >
> > #define cgroup_subsys_disabled(_ss) ((ss_)->disabled)
> >
> > but I don't see that
> > cgroup_subsys_disabled(&mem_cgroup_subsys)
> > is better than just putting
> >
> > mem_cgroup_subsys.disabled
> >
> > Paul
> >
> >
>
>

2008-02-26 08:58:38

by Paul Menage

[permalink] [raw]
Subject: Re: [PATCH] Memory Resource Controller Add Boot Option

On Mon, Feb 25, 2008 at 7:01 PM, Li Zefan <[email protected]> wrote:
> >
> > - foo doesn't show up in /proc/cgroups
>
> Or we can print out the disable flag, maybe this will be better?
> Because we can distinguish from disabled and not compiled in from
>
> /proc/cgroups.

Certainly possible, if people felt it was useful.

>
> > - foo isn't auto-mounted if you mount all cgroups in a single hierarchy
> > - foo isn't visible as an individually mountable subsystem
>
> You mentioned in a previous mail if we mount a disabled subsystem we
> will get an error. Here we just ignore the mount option. Which makes
> more sense ?
>

No, we don't ignore the mount option - we give an error since it
doesn't refer to a valid subsystem. (And in the first case there is no
mount option).

Paul

2008-02-26 09:07:40

by Li Zefan

[permalink] [raw]
Subject: Re: [PATCH] Memory Resource Controller Add Boot Option

Paul Menage wrote:
> On Mon, Feb 25, 2008 at 7:01 PM, Li Zefan <[email protected]> wrote:
>> >
>> > - foo doesn't show up in /proc/cgroups
>>
>> Or we can print out the disable flag, maybe this will be better?
>> Because we can distinguish from disabled and not compiled in from
>>
>> /proc/cgroups.
>
> Certainly possible, if people felt it was useful.
>
>> > - foo isn't auto-mounted if you mount all cgroups in a single hierarchy
>> > - foo isn't visible as an individually mountable subsystem
>>
>> You mentioned in a previous mail if we mount a disabled subsystem we
>> will get an error. Here we just ignore the mount option. Which makes
>> more sense ?
>>
>
> No, we don't ignore the mount option - we give an error since it
> doesn't refer to a valid subsystem. (And in the first case there is no
> mount option).
>

You are write, -ENOENT will be returned in this case. I made a mistake when
reading the prototype patch, thanks for the clarification. :)

2008-03-05 16:12:41

by Balbir Singh

[permalink] [raw]
Subject: Re: [PATCH] Memory Resource Controller Add Boot Option

Paul Menage wrote:
>>> I'll send out a prototype for comment.
>
> Something like the patch below. The effects of cgroup_disable=foo are:
>
> - foo doesn't show up in /proc/cgroups
> - foo isn't auto-mounted if you mount all cgroups in a single hierarchy
> - foo isn't visible as an individually mountable subsystem
>
> As a result there will only ever be one call to foo->create(), at init
> time; all processes will stay in this group, and the group will never be
> mounted on a visible hierarchy. Any additional effects (e.g. not
> allocating metadata) are up to the foo subsystem.
>
> This doesn't handle early_init subsystems (their "disabled" bit isn't
> set be, but it could easily be extended to do so if any of the
> early_init systems wanted it - I think it would just involve some
> nastier parameter processing since it would occur before the
> command-line argument parser had been run.
>
> include/linux/cgroup.h | 1 +
> kernel/cgroup.c | 29 +++++++++++++++++++++++++++--
> 2 files changed, 28 insertions(+), 2 deletions(-)
>
> Index: cgroup_disable-2.6.25-rc2-mm1/include/linux/cgroup.h
> ===================================================================
> --- cgroup_disable-2.6.25-rc2-mm1.orig/include/linux/cgroup.h
> +++ cgroup_disable-2.6.25-rc2-mm1/include/linux/cgroup.h
> @@ -256,6 +256,7 @@ struct cgroup_subsys {
> void (*bind)(struct cgroup_subsys *ss, struct cgroup *root);
> int subsys_id;
> int active;
> + int disabled;
> int early_init;
> #define MAX_CGROUP_TYPE_NAMELEN 32
> const char *name;
> Index: cgroup_disable-2.6.25-rc2-mm1/kernel/cgroup.c
> ===================================================================
> --- cgroup_disable-2.6.25-rc2-mm1.orig/kernel/cgroup.c
> +++ cgroup_disable-2.6.25-rc2-mm1/kernel/cgroup.c
> @@ -790,7 +790,14 @@ static int parse_cgroupfs_options(char *
> if (!*token)
> return -EINVAL;
> if (!strcmp(token, "all")) {
> - opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1;
> + /* Add all non-disabled subsystems */
> + int i;
> + opts->subsys_bits = 0;
> + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> + struct cgroup_subsys *ss = subsys[i];
> + if (!ss->disabled)
> + opts->subsys_bits |= 1ul << i;
> + }
> } else if (!strcmp(token, "noprefix")) {
> set_bit(ROOT_NOPREFIX, &opts->flags);
> } else if (!strncmp(token, "release_agent=", 14)) {
> @@ -808,7 +815,8 @@ static int parse_cgroupfs_options(char *
> for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> ss = subsys[i];
> if (!strcmp(token, ss->name)) {
> - set_bit(i, &opts->subsys_bits);
> + if (!ss->disabled)
> + set_bit(i, &opts->subsys_bits);
> break;
> }
> }
> @@ -2596,6 +2606,8 @@ static int proc_cgroupstats_show(struct
> mutex_lock(&cgroup_mutex);
> for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> struct cgroup_subsys *ss = subsys[i];
> + if (ss->disabled)
> + continue;
> seq_printf(m, "%s\t%lu\t%d\n",
> ss->name, ss->root->subsys_bits,
> ss->root->number_of_cgroups);
> @@ -2991,3 +3003,16 @@ static void cgroup_release_agent(struct
> spin_unlock(&release_list_lock);
> mutex_unlock(&cgroup_mutex);
> }
> +
> +static int __init cgroup_disable(char *str)
> +{
> + int i;
> + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> + struct cgroup_subsys *ss = subsys[i];
> + if (!strcmp(str, ss->name)) {
> + ss->disabled = 1;
> + break;
> + }
> + }
> +}
> +__setup("cgroup_disable=", cgroup_disable);
>

Hi, Paul,

I am going to go ahead and test this patch. If they work fine, I'll request you
to send them out, so that we can get them in by 2.6.25.

--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL