This patch series implements simple support for building and loading
subsystems as modules, both within and outside the kernel source tree.
Module unloading is as yet unimplemented - it will require more advanced
reference counting in the mount/unmount code, and I plan to work on this
soon.
Patch #1 sets up the subsys[] array so its contents can be dynamic as
modules appear and (eventually) disappear. I introduce an rwsem called
subsys_mutex to protect against concurrent loads/unloads/reads, and
modify iterations over the array to handle when subsystems are absent.
Patch #2 implements an interface for modules to load subsystems, called
cgroup_load_subsys, similar to cgroup_init_subsys, and adds a module
pointer in struct cgroup_subsys.
Patch #3 modifies the net_cls subsystem, which already had some module
declarations, to be configurable as a module, which also serves as a
simple proof-of-concept.
-- bblum
Make subsys[] able to be dynamically populated to support modular subsystems
From: Ben Blum <[email protected]>
This patch reworks the way the subsys[] array is used so that subsystems can
register themselves after boot time, and enables the internals of cgroups to
be able to handle when subsystems are not present or may appear/disappear.
Signed-off-by: Ben Blum <[email protected]>
---
include/linux/cgroup.h | 8 +++-
kernel/cgroup.c | 91 ++++++++++++++++++++++++++++++++++++++++--------
2 files changed, 81 insertions(+), 18 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 642a47f..d7f1545 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -41,13 +41,17 @@ extern int cgroupstats_build(struct cgroupstats *stats,
extern struct file_operations proc_cgroup_operations;
-/* Define the enumeration of all cgroup subsystems */
+/* Define the enumeration of all builtin cgroup subsystems */
#define SUBSYS(_x) _x ## _subsys_id,
enum cgroup_subsys_id {
#include <linux/cgroup_subsys.h>
- CGROUP_SUBSYS_COUNT
+ CGROUP_BUILTIN_SUBSYS_COUNT
};
#undef SUBSYS
+/* This define indicates the maximum number of subsystems that can be loaded
+ * at once. We limit to this many since cgroupfs_root has subsys_bits to keep
+ * track of all of them. */
+#define CGROUP_SUBSYS_COUNT (BITS_PER_BYTE*sizeof(unsigned long))
/* Per-subsystem/per-cgroup state maintained by the system. */
struct cgroup_subsys_state {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7af6168..9966dfe 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -58,10 +58,15 @@ static DEFINE_MUTEX(cgroup_mutex);
/* Generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) &_x ## _subsys,
-
-static struct cgroup_subsys *subsys[] = {
+static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
#include <linux/cgroup_subsys.h>
};
+/*
+ * this lock protects the dynamic section of the array, in which modular
+ * subsystems are loaded. nests outside of cgroup_mutex, because of both
+ * cgroup_get_sb and cgroup_load_subsys.
+ */
+static DECLARE_RWSEM(subsys_mutex);
#define MAX_CGROUP_ROOT_NAMELEN 64
@@ -433,8 +438,9 @@ static struct css_set *find_existing_css_set(
struct hlist_node *node;
struct css_set *cg;
- /* Built the set of subsystem state objects that we want to
- * see in the new css_set */
+ /* Built the set of subsystem state objects that we want to see in the
+ * new css_set. while subsystems can change globally, the entries here
+ * won't change, so no need for subsys_mutex. */
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
if (root->subsys_bits & (1UL << i)) {
/* Subsystem is in this hierarchy. So we want
@@ -869,7 +875,9 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
css_put(css);
}
-
+/*
+ * Call with subsys_mutex held - see parse_cgroupfs_options.
+ */
static int rebind_subsystems(struct cgroupfs_root *root,
unsigned long final_bits)
{
@@ -877,6 +885,8 @@ static int rebind_subsystems(struct cgroupfs_root *root,
struct cgroup *cgrp = &root->top_cgroup;
int i;
+ BUG_ON(!rwsem_is_locked(&subsys_mutex));
+
removed_bits = root->actual_subsys_bits & ~final_bits;
added_bits = final_bits & ~root->actual_subsys_bits;
/* Check that any added subsystems are currently free */
@@ -885,6 +895,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
struct cgroup_subsys *ss = subsys[i];
if (!(bit & added_bits))
continue;
+ /* nobody should tell us to do a subsys that doesn't exist:
+ * parse_cgroupfs_options should catch that case (subsys_mutex
+ * prevents state from changing in between), and the only time
+ * we're called without that beforehand is when the bitmask is
+ * 0 in cgroup_kill_sb. */
+ BUG_ON(ss == NULL);
if (ss->root != &rootnode) {
/* Subsystem isn't free */
return -EBUSY;
@@ -904,6 +920,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
unsigned long bit = 1UL << i;
if (bit & added_bits) {
/* We're binding this subsystem to this hierarchy */
+ BUG_ON(ss == NULL);
BUG_ON(cgrp->subsys[i]);
BUG_ON(!dummytop->subsys[i]);
BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
@@ -915,8 +932,11 @@ static int rebind_subsystems(struct cgroupfs_root *root,
if (ss->bind)
ss->bind(ss, cgrp);
mutex_unlock(&ss->hierarchy_mutex);
+ /* TODO: If subsystem unloading support, need to take
+ * a reference on the subsystem here. */
} else if (bit & removed_bits) {
/* We're removing this subsystem */
+ BUG_ON(ss == NULL);
BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
mutex_lock(&ss->hierarchy_mutex);
@@ -927,8 +947,10 @@ static int rebind_subsystems(struct cgroupfs_root *root,
subsys[i]->root = &rootnode;
list_move(&ss->sibling, &rootnode.subsys_list);
mutex_unlock(&ss->hierarchy_mutex);
+ /* TODO: If subsystem unloading support, drop refcount */
} else if (bit & final_bits) {
/* Subsystem state should already exist */
+ BUG_ON(ss == NULL);
BUG_ON(!cgrp->subsys[i]);
} else {
/* Subsystem state shouldn't exist */
@@ -971,14 +993,18 @@ struct cgroup_sb_opts {
};
-/* Convert a hierarchy specifier into a bitmask of subsystems and
- * flags. */
+/*
+ * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
+ * with subsys_mutex held to guard subsys[] between us and rebind_subsystems.
+ */
static int parse_cgroupfs_options(char *data,
struct cgroup_sb_opts *opts)
{
char *token, *o = data ?: "all";
unsigned long mask = (unsigned long)-1;
+ BUG_ON(!rwsem_is_locked(&subsys_mutex));
+
#ifdef CONFIG_CPUSETS
mask = ~(1UL << cpuset_subsys_id);
#endif
@@ -994,6 +1020,8 @@ static int parse_cgroupfs_options(char *data,
opts->subsys_bits = 0;
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
+ if (ss == NULL)
+ continue;
if (!ss->disabled)
opts->subsys_bits |= 1ul << i;
}
@@ -1038,6 +1066,8 @@ static int parse_cgroupfs_options(char *data,
int i;
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
ss = subsys[i];
+ if (ss == NULL)
+ continue;
if (!strcmp(token, ss->name)) {
if (!ss->disabled)
set_bit(i, &opts->subsys_bits);
@@ -1084,6 +1114,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
lock_kernel();
mutex_lock(&cgrp->dentry->d_inode->i_mutex);
+ down_read(&subsys_mutex);
mutex_lock(&cgroup_mutex);
/* See what subsystems are wanted */
@@ -1116,6 +1147,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
kfree(opts.release_agent);
kfree(opts.name);
mutex_unlock(&cgroup_mutex);
+ up_read(&subsys_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
unlock_kernel();
return ret;
@@ -1291,6 +1323,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
struct cgroupfs_root *new_root;
/* First find the desired set of subsystems */
+ down_read(&subsys_mutex);
ret = parse_cgroupfs_options(data, &opts);
if (ret)
goto out_err;
@@ -1367,6 +1400,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
free_cg_links(&tmp_cg_links);
goto drop_new_super;
}
+ /* done with subsys stuff and no other failure case */
+ up_read(&subsys_mutex);
/* EBUSY should be the only error here */
BUG_ON(ret);
@@ -1415,6 +1450,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
drop_new_super:
deactivate_locked_super(sb);
out_err:
+ up_read(&subsys_mutex);
kfree(opts.release_agent);
kfree(opts.name);
@@ -1434,10 +1470,12 @@ static void cgroup_kill_sb(struct super_block *sb) {
BUG_ON(!list_empty(&cgrp->children));
BUG_ON(!list_empty(&cgrp->sibling));
+ down_read(&subsys_mutex);
mutex_lock(&cgroup_mutex);
/* Rebind all subsystems back to the default hierarchy */
ret = rebind_subsystems(root, 0);
+ up_read(&subsys_mutex);
/* Shouldn't be able to fail ... */
BUG_ON(ret);
@@ -3276,9 +3314,12 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
{
/* We need to take each hierarchy_mutex in a consistent order */
int i;
-
+ /* no worry about a race with rebind_subsystems that might mess up the
+ * locking order, since both parties are under cgroup_mutex. */
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
+ if (ss == NULL)
+ continue;
if (ss->root == root)
mutex_lock(&ss->hierarchy_mutex);
}
@@ -3287,9 +3328,10 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
{
int i;
-
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
+ if (ss == NULL)
+ continue;
if (ss->root == root)
mutex_unlock(&ss->hierarchy_mutex);
}
@@ -3410,11 +3452,14 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
* synchronization other than RCU, and the subsystem linked
* list isn't RCU-safe */
int i;
+ /* we won't need to lock the subsys array, because the subsystems
+ * we're concerned about aren't going anywhere since our cgroup root
+ * has a reference on them. */
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
struct cgroup_subsys_state *css;
- /* Skip subsystems not in this hierarchy */
- if (ss->root != cgrp->root)
+ /* Skip subsystems not present or not in this hierarchy */
+ if (ss == NULL || ss->root != cgrp->root)
continue;
css = cgrp->subsys[ss->subsys_id];
/* When called from check_for_release() it's possible
@@ -3635,7 +3680,8 @@ int __init cgroup_init_early(void)
for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
INIT_HLIST_HEAD(&css_set_table[i]);
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ /* at bootup time, we don't worry about modular subsystems */
+ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
BUG_ON(!ss->name);
@@ -3670,7 +3716,8 @@ int __init cgroup_init(void)
if (err)
return err;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ /* at bootup time, we don't worry about modular subsystems */
+ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
if (!ss->early_init)
cgroup_init_subsys(ss);
@@ -3779,14 +3826,19 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
int i;
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
+ /* ideally we don't want subsystems moving around while we do this */
+ down_read(&subsys_mutex);
mutex_lock(&cgroup_mutex);
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
+ if (ss == NULL)
+ continue;
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->name, ss->root->hierarchy_id,
ss->root->number_of_cgroups, !ss->disabled);
}
mutex_unlock(&cgroup_mutex);
+ up_read(&subsys_mutex);
return 0;
}
@@ -3841,7 +3893,10 @@ void cgroup_fork_callbacks(struct task_struct *child)
{
if (need_forkexit_callback) {
int i;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ /* forkexit callbacks are only supported for builtin modules,
+ * and the builtin section of the subsys array is immutable,
+ * so we don't need the subsys_lock here. */
+ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
if (ss->fork)
ss->fork(ss, child);
@@ -3912,7 +3967,9 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
struct css_set *cg;
if (run_callbacks && need_forkexit_callback) {
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ /* modular subsystems can't use callbacks, so no need to take
+ * subsys_lock */
+ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
if (ss->exit)
ss->exit(ss, tsk);
@@ -4221,7 +4278,9 @@ static int __init cgroup_disable(char *str)
if (!*token)
continue;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ /* cgroup_disable, being at boot time, can't know about module
+ * subsystems, so we don't worry about them. */
+ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
if (!strcmp(token, ss->name)) {
Add interface between cgroups subsystem management and module loading
From: Ben Blum <[email protected]>
This patch implements rudimentary module-loading support for cgroups - namely,
a cgroup_load_subsys (similar to cgroup_init_subsys) for use as a module
initcall, and a struct module pointer in struct cgroup_subsys. Support for
subsystem unloading not here, yet to be done (requires more advanced reference
counting, for one thing).
I tested compiling a subsystem outside of the kernel source tree, and it was
apparent that I wanted the EXPORT_SYMBOL for several functions, but I'm not
sure what else it might be good to put them on. Suggestions welcome.
Signed-off-by: Ben Blum <[email protected]>
---
include/linux/cgroup.h | 4 ++
kernel/cgroup.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 96 insertions(+), 0 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index d7f1545..c8474c4 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -38,6 +38,7 @@ extern void cgroup_fork_failed(struct task_struct *p, int run_callbacks,
unsigned long clone_flags);
extern int cgroupstats_build(struct cgroupstats *stats,
struct dentry *dentry);
+extern int cgroup_load_subsys(struct cgroup_subsys *ss);
extern struct file_operations proc_cgroup_operations;
@@ -477,6 +478,9 @@ struct cgroup_subsys {
/* used when use_id == true */
struct idr idr;
spinlock_t id_lock;
+
+ /* should be defined only by modular subsystems */
+ struct module *module;
};
#define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9966dfe..8eba8a5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2491,6 +2491,7 @@ int cgroup_add_file(struct cgroup *cgrp,
error = PTR_ERR(dentry);
return error;
}
+EXPORT_SYMBOL_GPL(cgroup_add_file);
int cgroup_add_files(struct cgroup *cgrp,
struct cgroup_subsys *subsys,
@@ -2505,6 +2506,7 @@ int cgroup_add_files(struct cgroup *cgrp,
}
return 0;
}
+EXPORT_SYMBOL_GPL(cgroup_add_files);
/**
* cgroup_task_count - count the number of tasks in a cgroup.
@@ -3650,7 +3652,97 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
mutex_init(&ss->hierarchy_mutex);
lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
ss->active = 1;
+
+ /* this function shouldn't be used with modular subsystems, since they
+ * need to register a subsys_id, among other things */
+ BUG_ON(ss->module);
+}
+
+/**
+ * cgroup_load_subsys: load and register a modular subsystem at runtime
+ * @ss: the subsystem to load
+ *
+ * This function should be called in a modular subsystem's initcall. If the
+ * subsytem is built as a module, it will be assigned a new subsys_id and set
+ * up for use. If the subsystem is built-in anyway, work is delegated to the
+ * simpler cgroup_init_subsys.
+ */
+int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
+{
+ int i;
+ struct cgroup_subsys_state *css;
+
+ /* check name validity */
+ if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN)
+ return -EINVAL;
+
+ /* we don't support callbacks in modular subsystems. this check is
+ * before the ss->module check for consistency - a module that *could*
+ * be a module should still have no callbacks for consistency. */
+ if (ss->fork || ss->exit)
+ return -EINVAL;
+
+ /* an optionally modular subsystem is built-in: we want to do nothing,
+ * since cgroup_init_subsys will take care of it. */
+ if (ss->module == NULL) {
+ /* sanity: ss->module NULL only if the subsys is built-in and
+ * appears in subsys[] already. */
+ BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
+ BUG_ON(subsys[ss->subsys_id] != ss);
+ return 0;
+ }
+
+ /* need to register a subsys id before anything else - for example,
+ * init_cgroup_css needs it. also, subsys_mutex needs to nest outside
+ * cgroup_mutex. */
+ down_write(&subsys_mutex);
+ /* find the first empty slot in the array */
+ for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
+ if (subsys[i] == NULL)
+ break;
+ }
+ if (i == CGROUP_SUBSYS_COUNT) {
+ /* maximum number of subsystems already registered! */
+ up_write(&subsys_mutex);
+ return -EBUSY;
+ }
+ /* assign ourselves the subsys_id */
+ ss->subsys_id = i;
+ subsys[i] = ss;
+
+ mutex_lock(&cgroup_mutex);
+ /* no ss->create seems to need anything important in the ss struct, so
+ * this can happen first (i.e. before the rootnode attachment). */
+ css = ss->create(ss, dummytop);
+ if (IS_ERR(css)) {
+ /* failure case - need to deassign the subsys[] slot. */
+ mutex_unlock(&cgroup_mutex);
+ subsys[i] = NULL;
+ up_write(&subsys_mutex);
+ return PTR_ERR(css);
+ }
+
+ list_add(&ss->sibling, &rootnode.subsys_list);
+ ss->root = &rootnode;
+
+ init_cgroup_css(css, ss, dummytop);
+ init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
+
+ mutex_init(&ss->hierarchy_mutex);
+ lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
+ ss->active = 1;
+
+ /* pin the subsystem's module so it doesn't go away. this shouldn't
+ * fail, since the module's initcall calls us.
+ * TODO: with module unloading, move this elsewhere */
+ BUG_ON(!try_module_get(ss->module));
+
+ /* success! */
+ mutex_unlock(&cgroup_mutex);
+ up_write(&subsys_mutex);
+ return 0;
}
+EXPORT_SYMBOL_GPL(cgroup_load_subsys);
/**
* cgroup_init_early - cgroup initialization at system boot
Allows the net_cls cgroup subsystem to be compiled as a module
From: Ben Blum <[email protected]>
This patch modifies net/sched/cls_cgroup.c to allow the net_cls subsystem to
be optionally compiled as a module instead of builtin. The cgroup_subsys
struct is moved around a bit to allow the subsys_id to be either declared as a
compile-time constant by the cgroup_subsys.h include in cgroup.h, or, if it's
a module, initialized within the struct by cgroup_load_subsys.
Signed-off-by: Ben Blum <[email protected]>
---
net/sched/Kconfig | 2 +-
net/sched/cls_cgroup.c | 37 ++++++++++++++++++++++++++++---------
2 files changed, 29 insertions(+), 10 deletions(-)
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 929218a..8489951 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -328,7 +328,7 @@ config NET_CLS_FLOW
module will be called cls_flow.
config NET_CLS_CGROUP
- bool "Control Group Classifier"
+ tristate "Control Group Classifier"
select NET_CLS
depends on CGROUPS
---help---
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index e4877ca..df9723b 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -24,6 +24,25 @@ struct cgroup_cls_state
u32 classid;
};
+static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
+ struct cgroup *cgrp);
+static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
+static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
+
+struct cgroup_subsys net_cls_subsys = {
+ .name = "net_cls",
+ .create = cgrp_create,
+ .destroy = cgrp_destroy,
+ .populate = cgrp_populate,
+#ifdef CONFIG_NET_CLS_CGROUP
+ .subsys_id = net_cls_subsys_id,
+#else
+#define net_cls_subsys_id net_cls_subsys.subsys_id
+#endif
+ .module = THIS_MODULE,
+};
+
+
static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp)
{
return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id),
@@ -79,14 +98,6 @@ static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files));
}
-struct cgroup_subsys net_cls_subsys = {
- .name = "net_cls",
- .create = cgrp_create,
- .destroy = cgrp_destroy,
- .populate = cgrp_populate,
- .subsys_id = net_cls_subsys_id,
-};
-
struct cls_cgroup_head
{
u32 handle;
@@ -277,12 +288,20 @@ static struct tcf_proto_ops cls_cgroup_ops __read_mostly = {
static int __init init_cgroup_cls(void)
{
- return register_tcf_proto_ops(&cls_cgroup_ops);
+ int ret = register_tcf_proto_ops(&cls_cgroup_ops);
+ if (ret)
+ return ret;
+ ret = cgroup_load_subsys(&net_cls_subsys);
+ if (ret)
+ unregister_tcf_proto_ops(&cls_cgroup_ops);
+ return ret;
}
static void __exit exit_cgroup_cls(void)
{
unregister_tcf_proto_ops(&cls_cgroup_ops);
+ /* TODO: unload subsystem. for now, the try_module_get in load_subsys
+ * prevents us from getting here. */
}
module_init(init_cgroup_cls);
On Mon, 2 Nov 2009 16:31:03 -0500 Ben Blum wrote:
> Allows the net_cls cgroup subsystem to be compiled as a module
Please cc: netdev on networking patches. [done]
> From: Ben Blum <[email protected]>
>
> This patch modifies net/sched/cls_cgroup.c to allow the net_cls subsystem to
> be optionally compiled as a module instead of builtin. The cgroup_subsys
> struct is moved around a bit to allow the subsys_id to be either declared as a
> compile-time constant by the cgroup_subsys.h include in cgroup.h, or, if it's
> a module, initialized within the struct by cgroup_load_subsys.
>
> Signed-off-by: Ben Blum <[email protected]>
> ---
>
> net/sched/Kconfig | 2 +-
> net/sched/cls_cgroup.c | 37 ++++++++++++++++++++++++++++---------
> 2 files changed, 29 insertions(+), 10 deletions(-)
>
>
> diff --git a/net/sched/Kconfig b/net/sched/Kconfig
> index 929218a..8489951 100644
> --- a/net/sched/Kconfig
> +++ b/net/sched/Kconfig
> @@ -328,7 +328,7 @@ config NET_CLS_FLOW
> module will be called cls_flow.
>
> config NET_CLS_CGROUP
> - bool "Control Group Classifier"
> + tristate "Control Group Classifier"
> select NET_CLS
> depends on CGROUPS
> ---help---
> diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
> index e4877ca..df9723b 100644
> --- a/net/sched/cls_cgroup.c
> +++ b/net/sched/cls_cgroup.c
> @@ -24,6 +24,25 @@ struct cgroup_cls_state
> u32 classid;
> };
>
> +static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
> + struct cgroup *cgrp);
> +static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
> +static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
> +
> +struct cgroup_subsys net_cls_subsys = {
> + .name = "net_cls",
> + .create = cgrp_create,
> + .destroy = cgrp_destroy,
> + .populate = cgrp_populate,
> +#ifdef CONFIG_NET_CLS_CGROUP
> + .subsys_id = net_cls_subsys_id,
> +#else
> +#define net_cls_subsys_id net_cls_subsys.subsys_id
> +#endif
> + .module = THIS_MODULE,
> +};
> +
> +
> static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp)
> {
> return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id),
> @@ -79,14 +98,6 @@ static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
> return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files));
> }
>
> -struct cgroup_subsys net_cls_subsys = {
> - .name = "net_cls",
> - .create = cgrp_create,
> - .destroy = cgrp_destroy,
> - .populate = cgrp_populate,
> - .subsys_id = net_cls_subsys_id,
> -};
> -
> struct cls_cgroup_head
> {
> u32 handle;
> @@ -277,12 +288,20 @@ static struct tcf_proto_ops cls_cgroup_ops __read_mostly = {
>
> static int __init init_cgroup_cls(void)
> {
> - return register_tcf_proto_ops(&cls_cgroup_ops);
> + int ret = register_tcf_proto_ops(&cls_cgroup_ops);
> + if (ret)
> + return ret;
> + ret = cgroup_load_subsys(&net_cls_subsys);
> + if (ret)
> + unregister_tcf_proto_ops(&cls_cgroup_ops);
> + return ret;
> }
>
> static void __exit exit_cgroup_cls(void)
> {
> unregister_tcf_proto_ops(&cls_cgroup_ops);
> + /* TODO: unload subsystem. for now, the try_module_get in load_subsys
> + * prevents us from getting here. */
> }
>
> module_init(init_cgroup_cls);
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
---
~Randy
Ben Blum wrote:
> This patch series implements simple support for building and loading
> subsystems as modules, both within and outside the kernel source tree.
> Module unloading is as yet unimplemented - it will require more advanced
> reference counting in the mount/unmount code, and I plan to work on this
> soon.
>
I doubt the value of module-loadable subsystems. A cgroup subsystem
is usually a kernel resource controller, and normally it needs to
add some hooks in some in-kernel structures and functions, which
makes it impossible to be a module.
In fact, net_cls is the only subsystem that can be a module, and
make it a module doesn't seem to have real benefit?
> Patch #1 sets up the subsys[] array so its contents can be dynamic as
> modules appear and (eventually) disappear. I introduce an rwsem called
> subsys_mutex to protect against concurrent loads/unloads/reads, and
> modify iterations over the array to handle when subsystems are absent.
>
> Patch #2 implements an interface for modules to load subsystems, called
> cgroup_load_subsys, similar to cgroup_init_subsys, and adds a module
> pointer in struct cgroup_subsys.
>
> Patch #3 modifies the net_cls subsystem, which already had some module
> declarations, to be configurable as a module, which also serves as a
> simple proof-of-concept.
>
> -- bblum
>
On Mon, Nov 2, 2009 at 6:03 PM, Li Zefan <[email protected]> wrote:
>
> I doubt the value of module-loadable subsystems. A cgroup subsystem
> is usually a kernel resource controller, and normally it needs to
> add some hooks in some in-kernel structures and functions, which
> makes it impossible to be a module.
Not true - as the classifier example shows, if there's already a
framework that supports hooking loadable modules into some kernel
system, a resource controller can be a module.
>
> In fact, net_cls is the only subsystem that can be a module, and
> make it a module doesn't seem to have real benefit?
Wouldn't it be just as useful as any other module-loadable classifier,
of which there are quite a few?
It's true that net_cls is currently the only subsystem that makes
sense to use as a module, but its existence shows that the concept
isn't too outlandish.
Paul
Paul Menage wrote:
> On Mon, Nov 2, 2009 at 6:03 PM, Li Zefan <[email protected]> wrote:
>> I doubt the value of module-loadable subsystems. A cgroup subsystem
>> is usually a kernel resource controller, and normally it needs to
>> add some hooks in some in-kernel structures and functions, which
>> makes it impossible to be a module.
>
> Not true - as the classifier example shows, if there's already a
> framework that supports hooking loadable modules into some kernel
> system, a resource controller can be a module.
>
True. Just reminds me a pure cfq-based io controller can be a
module too, though the io controller discussed in the mini summit
won't be a pure cfq-based one.
>> In fact, net_cls is the only subsystem that can be a module, and
>> make it a module doesn't seem to have real benefit?
>
> Wouldn't it be just as useful as any other module-loadable classifier,
> of which there are quite a few?
>
> It's true that net_cls is currently the only subsystem that makes
> sense to use as a module, but its existence shows that the concept
> isn't too outlandish.
>
Yes, sounds reasonable.
On Mon, Nov 02, 2009 at 04:30:22PM -0500, Ben Blum wrote:
> Add interface between cgroups subsystem management and module loading
>
> From: Ben Blum <[email protected]>
>
> This patch implements rudimentary module-loading support for cgroups - namely,
> a cgroup_load_subsys (similar to cgroup_init_subsys) for use as a module
> initcall, and a struct module pointer in struct cgroup_subsys. Support for
> subsystem unloading not here, yet to be done (requires more advanced reference
> counting, for one thing).
>
> I tested compiling a subsystem outside of the kernel source tree, and it was
> apparent that I wanted the EXPORT_SYMBOL for several functions, but I'm not
> sure what else it might be good to put them on. Suggestions welcome.
>
> Signed-off-by: Ben Blum <[email protected]>
> ---
>
> include/linux/cgroup.h | 4 ++
> kernel/cgroup.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 96 insertions(+), 0 deletions(-)
>
>
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index d7f1545..c8474c4 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -38,6 +38,7 @@ extern void cgroup_fork_failed(struct task_struct *p, int run_callbacks,
> unsigned long clone_flags);
> extern int cgroupstats_build(struct cgroupstats *stats,
> struct dentry *dentry);
> +extern int cgroup_load_subsys(struct cgroup_subsys *ss);
>
> extern struct file_operations proc_cgroup_operations;
>
> @@ -477,6 +478,9 @@ struct cgroup_subsys {
> /* used when use_id == true */
> struct idr idr;
> spinlock_t id_lock;
> +
> + /* should be defined only by modular subsystems */
> + struct module *module;
> };
>
> #define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 9966dfe..8eba8a5 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -2491,6 +2491,7 @@ int cgroup_add_file(struct cgroup *cgrp,
> error = PTR_ERR(dentry);
> return error;
> }
> +EXPORT_SYMBOL_GPL(cgroup_add_file);
>
> int cgroup_add_files(struct cgroup *cgrp,
> struct cgroup_subsys *subsys,
> @@ -2505,6 +2506,7 @@ int cgroup_add_files(struct cgroup *cgrp,
> }
> return 0;
> }
> +EXPORT_SYMBOL_GPL(cgroup_add_files);
>
> /**
> * cgroup_task_count - count the number of tasks in a cgroup.
> @@ -3650,7 +3652,97 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
> mutex_init(&ss->hierarchy_mutex);
> lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
> ss->active = 1;
> +
> + /* this function shouldn't be used with modular subsystems, since they
> + * need to register a subsys_id, among other things */
> + BUG_ON(ss->module);
> +}
> +
> +/**
> + * cgroup_load_subsys: load and register a modular subsystem at runtime
> + * @ss: the subsystem to load
> + *
> + * This function should be called in a modular subsystem's initcall. If the
> + * subsytem is built as a module, it will be assigned a new subsys_id and set
> + * up for use. If the subsystem is built-in anyway, work is delegated to the
> + * simpler cgroup_init_subsys.
> + */
> +int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
> +{
> + int i;
> + struct cgroup_subsys_state *css;
> +
> + /* check name validity */
> + if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN)
> + return -EINVAL;
> +
> + /* we don't support callbacks in modular subsystems. this check is
> + * before the ss->module check for consistency - a module that *could*
> + * be a module should still have no callbacks for consistency. */
> + if (ss->fork || ss->exit)
> + return -EINVAL;
> +
> + /* an optionally modular subsystem is built-in: we want to do nothing,
> + * since cgroup_init_subsys will take care of it. */
> + if (ss->module == NULL) {
> + /* sanity: ss->module NULL only if the subsys is built-in and
> + * appears in subsys[] already. */
> + BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
> + BUG_ON(subsys[ss->subsys_id] != ss);
> + return 0;
> + }
> +
> + /* need to register a subsys id before anything else - for example,
> + * init_cgroup_css needs it. also, subsys_mutex needs to nest outside
> + * cgroup_mutex. */
> + down_write(&subsys_mutex);
> + /* find the first empty slot in the array */
> + for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
> + if (subsys[i] == NULL)
> + break;
> + }
> + if (i == CGROUP_SUBSYS_COUNT) {
> + /* maximum number of subsystems already registered! */
> + up_write(&subsys_mutex);
> + return -EBUSY;
> + }
> + /* assign ourselves the subsys_id */
> + ss->subsys_id = i;
> + subsys[i] = ss;
> +
> + mutex_lock(&cgroup_mutex);
> + /* no ss->create seems to need anything important in the ss struct, so
> + * this can happen first (i.e. before the rootnode attachment). */
> + css = ss->create(ss, dummytop);
> + if (IS_ERR(css)) {
> + /* failure case - need to deassign the subsys[] slot. */
> + mutex_unlock(&cgroup_mutex);
> + subsys[i] = NULL;
> + up_write(&subsys_mutex);
> + return PTR_ERR(css);
> + }
> +
> + list_add(&ss->sibling, &rootnode.subsys_list);
> + ss->root = &rootnode;
> +
> + init_cgroup_css(css, ss, dummytop);
> + init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
There is a problem here; namely, the init_css_set is supposed to be
immutable after boot. Putting a new subsystem state in it seems like it
will be impossible to get right, since a hierarchy might be mounted and
already using the init_css_set. Perhaps it would be better to use
find_css_set for this?
> +
> + mutex_init(&ss->hierarchy_mutex);
> + lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
> + ss->active = 1;
> +
> + /* pin the subsystem's module so it doesn't go away. this shouldn't
> + * fail, since the module's initcall calls us.
> + * TODO: with module unloading, move this elsewhere */
> + BUG_ON(!try_module_get(ss->module));
> +
> + /* success! */
> + mutex_unlock(&cgroup_mutex);
> + up_write(&subsys_mutex);
> + return 0;
> }
> +EXPORT_SYMBOL_GPL(cgroup_load_subsys);
>
> /**
> * cgroup_init_early - cgroup initialization at system boot
>
On Wed, Nov 4, 2009 at 12:30 PM, Ben Blum <[email protected]> wrote:
>> + ? ? init_cgroup_css(css, ss, dummytop);
>> + ? ? init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
>
> There is a problem here; namely, the init_css_set is supposed to be
> immutable after boot. Putting a new subsystem state in it seems like it
> will be impossible to get right, since a hierarchy might be mounted and
> already using the init_css_set. Perhaps it would be better to use
> find_css_set for this?
Hmm, it's not just init_css_set that needs to get a pointer to the new
css, it's all existing css_sets - since every task has to start off in
the root cgroup for the new subsystem. A css_set is just a flattened
cache of the css pointers from all the cgroups that a task is a member
of, so when we add a css pointer to the unmount cgroup, we need to
either update all css_set objects, or else run through every task in
the system and update its cgroups pointer to a new css_set that's
equivalent to its old css_set plus the new css pointer.
Given the races inherent in trying to find and update all tasks in the
system, I think the cleanest way to solve this is:
- copy the css_set_table hash table and clear the original table.
- for each bucket in the hash table:
- for each css_set in the bucket:
- add the new root css to the css_set
- rehash the css_set into the hash table
Unfortunately this issue also makes it a bit harder to support
modular-subsystem unloading.
Paul