This starts a basic rlimit cgroup subsystem with only the
equivalent of RLIMIT_NPROC yet. This can be useful to limit
the global effects of a local fork bomb for example (local
in term of a cgroup).
The thing is further expandable to host more general resource
limitations in the scope of a cgroup.
Frederic Weisbecker (4):
cgroups: Allow a cgroup subsys to reject a fork
cgroups: Add res_counter_write_u64() API
cgroups: New resource counter inheritance API
cgroups: Add an rlimit subsystem
include/linux/cgroup.h | 9 ++-
include/linux/cgroup_subsys.h | 8 ++
include/linux/res_counter.h | 4 +
init/Kconfig | 6 ++
kernel/Makefile | 1 +
kernel/cgroup.c | 13 +++-
kernel/cgroup_freezer.c | 6 +-
kernel/cgroup_rlim.c | 142 +++++++++++++++++++++++++++++++++++++++++
kernel/fork.c | 5 +-
kernel/res_counter.c | 46 ++++++++++++--
10 files changed, 225 insertions(+), 15 deletions(-)
create mode 100644 kernel/cgroup_rlim.c
--
1.7.5.4
Make the cgroup subsystem fork callback return a value
so that subsystems are able to accept or reject a fork
completion with a custom error value.
This prepares for implementing rlimit in cgroups scope.
Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Paul Menage <[email protected]>
Cc: Li Zefan <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Andrew Morton <[email protected]>
---
include/linux/cgroup.h | 9 ++++++---
kernel/cgroup.c | 13 ++++++++++---
kernel/cgroup_freezer.c | 6 ++++--
kernel/fork.c | 5 ++++-
4 files changed, 24 insertions(+), 9 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ab4ac0c..07213af 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -32,7 +32,7 @@ extern int cgroup_lock_is_held(void);
extern bool cgroup_lock_live_group(struct cgroup *cgrp);
extern void cgroup_unlock(void);
extern void cgroup_fork(struct task_struct *p);
-extern void cgroup_fork_callbacks(struct task_struct *p);
+extern int cgroup_fork_callbacks(struct task_struct *p);
extern void cgroup_post_fork(struct task_struct *p);
extern void cgroup_exit(struct task_struct *p, int run_callbacks);
extern int cgroupstats_build(struct cgroupstats *stats,
@@ -475,7 +475,7 @@ struct cgroup_subsys {
void (*attach_task)(struct cgroup *cgrp, struct task_struct *tsk);
void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct cgroup *old_cgrp, struct task_struct *tsk);
- void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
+ int (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct cgroup *old_cgrp, struct task_struct *task);
int (*populate)(struct cgroup_subsys *ss,
@@ -633,7 +633,10 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id);
static inline int cgroup_init_early(void) { return 0; }
static inline int cgroup_init(void) { return 0; }
static inline void cgroup_fork(struct task_struct *p) {}
-static inline void cgroup_fork_callbacks(struct task_struct *p) {}
+static inline int cgroup_fork_callbacks(struct task_struct *p)
+{
+ return 0;
+}
static inline void cgroup_post_fork(struct task_struct *p) {}
static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2731d11..688be3d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4514,10 +4514,12 @@ void cgroup_fork(struct task_struct *child)
* tasklist. No need to take any locks since no-one can
* be operating on this task.
*/
-void cgroup_fork_callbacks(struct task_struct *child)
+int cgroup_fork_callbacks(struct task_struct *child)
{
if (need_forkexit_callback) {
int i;
+ int ret;
+
/*
* forkexit callbacks are only supported for builtin
* subsystems, and the builtin section of the subsys array is
@@ -4525,10 +4527,15 @@ void cgroup_fork_callbacks(struct task_struct *child)
*/
for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
- if (ss->fork)
- ss->fork(ss, child);
+ if (ss->fork) {
+ ret = ss->fork(ss, child);
+ if (ret)
+ return ret;
+ }
}
}
+
+ return 0;
}
/**
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e691818..e704be6 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -186,7 +186,7 @@ static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
return 0;
}
-static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
+static int freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
{
struct freezer *freezer;
@@ -206,7 +206,7 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
* following check.
*/
if (!freezer->css.cgroup->parent)
- return;
+ return 0;
spin_lock_irq(&freezer->lock);
BUG_ON(freezer->state == CGROUP_FROZEN);
@@ -215,6 +215,8 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
if (freezer->state == CGROUP_FREEZING)
freeze_task(task, true);
spin_unlock_irq(&freezer->lock);
+
+ return 0;
}
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 0276c30..2625461 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1298,7 +1298,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
/* Now that the task is set up, run cgroup callbacks if
* necessary. We need to run them before the task is visible
* on the tasklist. */
- cgroup_fork_callbacks(p);
+ retval = cgroup_fork_callbacks(p);
+ if (retval)
+ goto bad_fork_free_pid;
+
cgroup_callbacks_done = 1;
/* Need tasklist lock for parent etc handling! */
--
1.7.5.4
Extend the resource counter API with a mirror of
res_counter_read_u64() to make it handy to update a resource
counter value from a cgroup subsystem u64 value file.
Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Paul Menage <[email protected]>
Cc: Li Zefan <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Andrew Morton <[email protected]>
---
include/linux/res_counter.h | 2 ++
kernel/res_counter.c | 21 +++++++++++++++------
2 files changed, 17 insertions(+), 6 deletions(-)
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index c9d625c..1b3fe05 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -82,6 +82,8 @@ int res_counter_memparse_write_strategy(const char *buf,
int res_counter_write(struct res_counter *counter, int member,
const char *buffer, write_strategy_fn write_strategy);
+void res_counter_write_u64(struct res_counter *counter, int member, u64 val);
+
/*
* the field descriptors. one for each member of res_counter
*/
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 34683ef..806d041 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -168,12 +168,22 @@ int res_counter_memparse_write_strategy(const char *buf,
return 0;
}
+void res_counter_write_u64(struct res_counter *counter, int member, u64 val)
+{
+ unsigned long long *target;
+ unsigned long flags;
+
+ spin_lock_irqsave(&counter->lock, flags);
+ target = res_counter_member(counter, member);
+ *target = val;
+ spin_unlock_irqrestore(&counter->lock, flags);
+}
+
int res_counter_write(struct res_counter *counter, int member,
const char *buf, write_strategy_fn write_strategy)
{
char *end;
- unsigned long flags;
- unsigned long long tmp, *val;
+ unsigned long long tmp;
if (write_strategy) {
if (write_strategy(buf, &tmp))
@@ -183,9 +193,8 @@ int res_counter_write(struct res_counter *counter, int member,
if (*end != '\0')
return -EINVAL;
}
- spin_lock_irqsave(&counter->lock, flags);
- val = res_counter_member(counter, member);
- *val = tmp;
- spin_unlock_irqrestore(&counter->lock, flags);
+
+ res_counter_write_u64(counter, member, tmp);
+
return 0;
}
--
1.7.5.4
Provide an API to inherit a counter value from a parent.
This can be useful to implement cgroup.clone_children on
a resource counter.
Still the resources of the children are limited by those
of the parent, so this is only to provide a default setting
behaviour when clone_children is set.
Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Paul Menage <[email protected]>
Cc: Li Zefan <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Andrew Morton <[email protected]>
---
include/linux/res_counter.h | 2 ++
kernel/res_counter.c | 25 +++++++++++++++++++++++++
2 files changed, 27 insertions(+), 0 deletions(-)
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index 1b3fe05..109d118 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -84,6 +84,8 @@ int res_counter_write(struct res_counter *counter, int member,
void res_counter_write_u64(struct res_counter *counter, int member, u64 val);
+void res_counter_inherit(struct res_counter *counter, int member);
+
/*
* the field descriptors. one for each member of res_counter
*/
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 806d041..8cb0362 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -198,3 +198,28 @@ int res_counter_write(struct res_counter *counter, int member,
return 0;
}
+
+void res_counter_inherit(struct res_counter *counter, int member)
+{
+ struct res_counter *parent;
+ unsigned long long *pval, val;
+ unsigned long flags;
+
+ parent = counter->parent;
+ if (!parent)
+ return;
+
+ local_irq_save(flags);
+
+ spin_lock(&counter->lock);
+ pval = res_counter_member(counter, member);
+ val = *pval;
+ spin_unlock(&counter->lock);
+
+ spin_lock(&parent->lock);
+ pval = res_counter_member(parent, member);
+ *pval = val;
+ spin_unlock(&parent->lock);
+
+ local_irq_restore(flags);
+}
--
1.7.5.4
Add a new rlimit subsystem with a first support to limit the
number of tasks that can run inside a cgroup.
This is a step to be able to isolate a bit more a cgroup against
the rest of the system and limit the global impact of a fork bomb
or other resource abuse inside a given cgroup.
Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Paul Menage <[email protected]>
Cc: Li Zefan <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Andrew Morton <[email protected]>
---
include/linux/cgroup_subsys.h | 8 ++
init/Kconfig | 6 ++
kernel/Makefile | 1 +
kernel/cgroup_rlim.c | 142 +++++++++++++++++++++++++++++++++++++++++
4 files changed, 157 insertions(+), 0 deletions(-)
create mode 100644 kernel/cgroup_rlim.c
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index ac663c1..aea8cd5 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -59,8 +59,16 @@ SUBSYS(net_cls)
SUBSYS(blkio)
#endif
+/* */
+
#ifdef CONFIG_CGROUP_PERF
SUBSYS(perf)
#endif
/* */
+
+#ifdef CONFIG_CGROUP_RLIM
+SUBSYS(rlim)
+#endif
+
+/* */
diff --git a/init/Kconfig b/init/Kconfig
index 412c21b..1a48719 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -690,6 +690,12 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
select this option (if, for some reason, they need to disable it
then noswapaccount does the trick).
+config CGROUP_RLIM
+ bool "Rlimit on cgroups"
+ depends on RESOURCE_COUNTERS
+ help
+ This option enables the rlimit settings in the scope of a cgroup.
+
config CGROUP_PERF
bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
depends on PERF_EVENTS && CGROUPS
diff --git a/kernel/Makefile b/kernel/Makefile
index 2d64cfc..6713b38 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -60,6 +60,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
+obj-$(CONFIG_CGROUP_RLIM) += cgroup_rlim.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
diff --git a/kernel/cgroup_rlim.c b/kernel/cgroup_rlim.c
new file mode 100644
index 0000000..af69892
--- /dev/null
+++ b/kernel/cgroup_rlim.c
@@ -0,0 +1,142 @@
+/*
+ * Resource limits subsystem for cgroups
+ *
+ * Copyright (C) 2011 Red Hat, Inc., Frederic Weisbecker <[email protected]>
+ *
+ * Thanks to Johannes Weiner for his suggestions on doing this and how to.
+ *
+ */
+
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/res_counter.h>
+
+
+struct rlim {
+ struct res_counter proc_counter;
+ struct cgroup_subsys_state css;
+};
+
+static struct rlim root_rlim;
+
+
+static inline struct rlim *cgroup_rlim(struct cgroup *cont)
+{
+ return container_of(cgroup_subsys_state(cont, rlim_subsys_id),
+ struct rlim, css);
+}
+
+static struct cgroup_subsys_state *
+rlim_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ struct rlim *rlim, *parent_rlim;
+
+ if (!cgrp->parent) {
+ res_counter_init(&root_rlim.proc_counter, NULL);
+ return &root_rlim.css;
+ }
+
+ rlim = kzalloc(sizeof(*rlim), GFP_KERNEL);
+ if (!rlim)
+ return ERR_PTR(-ENOMEM);
+
+ parent_rlim = cgroup_rlim(cgrp->parent);
+ res_counter_init(&rlim->proc_counter, &parent_rlim->proc_counter);
+
+ return &rlim->css;
+}
+
+static void rlim_post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ struct rlim *rlim = cgroup_rlim(cgrp);
+
+ res_counter_inherit(&rlim->proc_counter, RES_LIMIT);
+}
+
+static void rlim_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ struct rlim *rlim = cgroup_rlim(cont);
+
+ kfree(rlim);
+}
+
+static void rlim_remove_proc(struct cgroup *cgrp)
+{
+ struct rlim *rlim = cgroup_rlim(cgrp);
+
+ res_counter_uncharge(&rlim->proc_counter, 1);
+}
+
+static void rlim_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup *old_cgrp, struct task_struct *task)
+{
+ rlim_remove_proc(old_cgrp);
+}
+
+/*
+ * This is actually where we do the attachment. We need to check if we can
+ * attach and eventually attach all in once.
+ */
+static int rlim_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+{
+ struct rlim *rlim = cgroup_rlim(cgrp);
+ struct res_counter *limit_fail_at;
+
+ return res_counter_charge(&rlim->proc_counter, 1, &limit_fail_at);
+}
+
+static void rlim_cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct task_struct *tsk)
+{
+ rlim_remove_proc(cgrp);
+}
+
+static int rlim_fork(struct cgroup_subsys *ss, struct task_struct *task)
+{
+ struct cgroup_subsys_state *css = task->cgroups->subsys[rlim_subsys_id];
+ struct cgroup *cgrp = css->cgroup;
+
+ return rlim_can_attach_task(cgrp, task);
+}
+
+static u64 max_proc_read_u64(struct cgroup *cont, struct cftype *cft)
+{
+ struct rlim *rlim = cgroup_rlim(cont);
+
+ return res_counter_read_u64(&rlim->proc_counter, RES_LIMIT);
+}
+
+static int max_proc_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+ struct rlim *rlim = cgroup_rlim(cgrp);
+
+ res_counter_write_u64(&rlim->proc_counter, RES_LIMIT, val);
+
+ return 0;
+}
+
+static struct cftype cft_max_proc = {
+ .name = "max_proc",
+ .read_u64 = max_proc_read_u64,
+ .write_u64 = max_proc_write_u64,
+};
+
+static int rlim_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ return cgroup_add_file(cgrp, ss, &cft_max_proc);
+}
+
+struct cgroup_subsys rlim_subsys = {
+ .name = "rlim",
+ .subsys_id = rlim_subsys_id,
+ .create = rlim_cgroup_create,
+ .post_clone = rlim_post_clone,
+ .destroy = rlim_cgroup_destroy,
+ .exit = rlim_cgroup_exit,
+ .can_attach_task = rlim_can_attach_task,
+ .cancel_attach = rlim_cancel_attach,
+ .attach = rlim_cgroup_exit,
+ .fork = rlim_fork,
+ .populate = rlim_populate,
+ .early_init = 1,
+};
--
1.7.5.4
Frederic Weisbecker wrote:
> This starts a basic rlimit cgroup subsystem with only the
> equivalent of RLIMIT_NPROC yet. This can be useful to limit
> the global effects of a local fork bomb for example (local
> in term of a cgroup).
>
> The thing is further expandable to host more general resource
> limitations in the scope of a cgroup.
>
As this subsystem is named "rlimit", I think we should have a bigger
picture about how this subsystem will be.
For example, which of other RLIMIT_XXX can be make cgroup-aware in
a meaningful way and which can't.
Another issue is, we can apply the limit of RLIMIT_NPROC as the sum
of the tasks' limit in a cgroup, but some other RLIMIT_XXX can't
work in this way. Take RLIMIT_NICE for example, we can apply this
limit to each of the tasks in the cgroup.
On Mon, Jun 20, 2011 at 02:33:34PM +0800, Li Zefan wrote:
> Frederic Weisbecker wrote:
> > This starts a basic rlimit cgroup subsystem with only the
> > equivalent of RLIMIT_NPROC yet. This can be useful to limit
> > the global effects of a local fork bomb for example (local
> > in term of a cgroup).
> >
> > The thing is further expandable to host more general resource
> > limitations in the scope of a cgroup.
> >
>
> As this subsystem is named "rlimit", I think we should have a bigger
> picture about how this subsystem will be.
>
> For example, which of other RLIMIT_XXX can be make cgroup-aware in
> a meaningful way and which can't.
>
> Another issue is, we can apply the limit of RLIMIT_NPROC as the sum
> of the tasks' limit in a cgroup, but some other RLIMIT_XXX can't
> work in this way. Take RLIMIT_NICE for example, we can apply this
> limit to each of the tasks in the cgroup.
Looking at the other rlimit options, it seems all of them can be applied
to a cgroup. They just won't all be implemented the same way.
Those that count and limit a global user resource, like NPROC, can be
implemented using the res_counter charge/uncharge that propagate the
resource consuming to the parent cgroups.
Other rlimits that are traditionally only process wide can be implemented
here as a simple limit applied to all processes in the whole cgroup.
For example RLIMIT_CORE would be a limit in any core dump on
the whole cgroup.
RLIMIT_NOFILE would be a limit on the number of files opened by the whole
cgroup.
etc...
I think they all need to be treated case by case when/if users come and propose
more rlimit options. These don't all necessary need to mirror the setrlimit
options. We could pick existing ones but change a bit their semantics to fit
more into the cgroups meaning (as a general rule any rlimit.* file must be a
cgroup wide limitation), or create new rlimit options if specific needs arise.
There can be another kind of rlimit options that can be cgroup wide but apply
per process, in which case we should tweak a bit the name of the rlimit option file.
Consider RLIMIT_STACK for example.
If we want a cgroup option that applies to the total of stack used by the whole
cgroup, the file name would be rlim.stack. If we want that limitation to happen
to the whole cgroup but per process, it would be rlim.stack_per_process.
03:11, Frederic Weisbecker wrote:
> On Mon, Jun 20, 2011 at 02:33:34PM +0800, Li Zefan wrote:
>> Frederic Weisbecker wrote:
>>> This starts a basic rlimit cgroup subsystem with only the
>>> equivalent of RLIMIT_NPROC yet. This can be useful to limit
>>> the global effects of a local fork bomb for example (local
>>> in term of a cgroup).
>>>
>>> The thing is further expandable to host more general resource
>>> limitations in the scope of a cgroup.
>>>
>>
>> As this subsystem is named "rlimit", I think we should have a bigger
>> picture about how this subsystem will be.
>>
>> For example, which of other RLIMIT_XXX can be make cgroup-aware in
>> a meaningful way and which can't.
>>
>> Another issue is, we can apply the limit of RLIMIT_NPROC as the sum
>> of the tasks' limit in a cgroup, but some other RLIMIT_XXX can't
>> work in this way. Take RLIMIT_NICE for example, we can apply this
>> limit to each of the tasks in the cgroup.
>
> Looking at the other rlimit options, it seems all of them can be applied
> to a cgroup. They just won't all be implemented the same way.
>
> Those that count and limit a global user resource, like NPROC, can be
> implemented using the res_counter charge/uncharge that propagate the
> resource consuming to the parent cgroups.
>
res_counter seems a bit overkill while atomic should be sufficient for
NPROC? Especially when it affects the fork path.
Normally we want to make the impact minimal when cgroup is not used,
so we may treat the root cgroup somewhat special, and one choice is to
always make it resource unlimited.
> Other rlimits that are traditionally only process wide can be implemented
> here as a simple limit applied to all processes in the whole cgroup.
>
> For example RLIMIT_CORE would be a limit in any core dump on
> the whole cgroup.
>
> RLIMIT_NOFILE would be a limit on the number of files opened by the whole
> cgroup.
>
> etc...
>
> I think they all need to be treated case by case when/if users come and propose
> more rlimit options. These don't all necessary need to mirror the setrlimit
> options. We could pick existing ones but change a bit their semantics to fit
> more into the cgroups meaning (as a general rule any rlimit.* file must be a
> cgroup wide limitation), or create new rlimit options if specific needs arise.
>
> There can be another kind of rlimit options that can be cgroup wide but apply
> per process, in which case we should tweak a bit the name of the rlimit option file.
> Consider RLIMIT_STACK for example.
> If we want a cgroup option that applies to the total of stack used by the whole
> cgroup, the file name would be rlim.stack. If we want that limitation to happen
> to the whole cgroup but per process, it would be rlim.stack_per_process.
>
Or use a single cgroup interface for different rlimits, since all rlmits can be
applied per process.
On Tue, Jun 21, 2011 at 04:09:25PM +0800, Li Zefan wrote:
> 03:11, Frederic Weisbecker wrote:
> > On Mon, Jun 20, 2011 at 02:33:34PM +0800, Li Zefan wrote:
> >> Frederic Weisbecker wrote:
> >>> This starts a basic rlimit cgroup subsystem with only the
> >>> equivalent of RLIMIT_NPROC yet. This can be useful to limit
> >>> the global effects of a local fork bomb for example (local
> >>> in term of a cgroup).
> >>>
> >>> The thing is further expandable to host more general resource
> >>> limitations in the scope of a cgroup.
> >>>
> >>
> >> As this subsystem is named "rlimit", I think we should have a bigger
> >> picture about how this subsystem will be.
> >>
> >> For example, which of other RLIMIT_XXX can be make cgroup-aware in
> >> a meaningful way and which can't.
> >>
> >> Another issue is, we can apply the limit of RLIMIT_NPROC as the sum
> >> of the tasks' limit in a cgroup, but some other RLIMIT_XXX can't
> >> work in this way. Take RLIMIT_NICE for example, we can apply this
> >> limit to each of the tasks in the cgroup.
> >
> > Looking at the other rlimit options, it seems all of them can be applied
> > to a cgroup. They just won't all be implemented the same way.
> >
> > Those that count and limit a global user resource, like NPROC, can be
> > implemented using the res_counter charge/uncharge that propagate the
> > resource consuming to the parent cgroups.
> >
>
> res_counter seems a bit overkill while atomic should be sufficient for
> NPROC? Especially when it affects the fork path.
Agreed. And my first home version of this patchset was not using res_counter
but atomic ops, just because I didn't know res_counter in the beginning :)
So I have that code about ready.
That said res_counter API is still a perfect fit for this: it handles all the
tracking to the parents, the failure path, etc... It may be an overkill for
this subsystem in the implementation level, but not semantically.
Would it make sense to eventually optimize res_counter rather than creating
an ad hoc clone of it that uses atomic ops?
Note it means that instead of having this:
spin_lock();
if (counter + val < limit)
counter += val;
spin_unlock();
We'll have this:
if (atomic_add_return(counter, val) >= limit)
atomic_sub(counter, val)
It is fine for proc counting. But is it fine to have temporary wrong counter
for other users of res_counter? If not we can still use something based on
atomic_cmpxchg() but then I'm not sure it's worth instead of using spinlock.
>
> Normally we want to make the impact minimal when cgroup is not used,
> so we may treat the root cgroup somewhat special, and one choice is to
> always make it resource unlimited.
Makes sense. But then it would be wiser not to create the rlim.nr_proc file
for the root cgroup. Is that possible with the current API? If not I can extend it
if needed.
> > Other rlimits that are traditionally only process wide can be implemented
> > here as a simple limit applied to all processes in the whole cgroup.
> >
> > For example RLIMIT_CORE would be a limit in any core dump on
> > the whole cgroup.
> >
> > RLIMIT_NOFILE would be a limit on the number of files opened by the whole
> > cgroup.
> >
> > etc...
> >
> > I think they all need to be treated case by case when/if users come and propose
> > more rlimit options. These don't all necessary need to mirror the setrlimit
> > options. We could pick existing ones but change a bit their semantics to fit
> > more into the cgroups meaning (as a general rule any rlimit.* file must be a
> > cgroup wide limitation), or create new rlimit options if specific needs arise.
> >
> > There can be another kind of rlimit options that can be cgroup wide but apply
> > per process, in which case we should tweak a bit the name of the rlimit option file.
> > Consider RLIMIT_STACK for example.
> > If we want a cgroup option that applies to the total of stack used by the whole
> > cgroup, the file name would be rlim.stack. If we want that limitation to happen
> > to the whole cgroup but per process, it would be rlim.stack_per_process.
> >
>
> Or use a single cgroup interface for different rlimits, since all rlmits can be
> applied per process.
I'm not sure what you mean there.
Hi Frederick,
On Sun, Jun 19, 2011 at 4:51 PM, Frederic Weisbecker <[email protected]> wrote:
> This starts a basic rlimit cgroup subsystem with only the
> equivalent of RLIMIT_NPROC yet. This can be useful to limit
> the global effects of a local fork bomb for example (local
> in term of a cgroup).
My general thoughts on this are:
- do we really want an "rlimit" subsystem rather than grouping things
functionally? We definitely shouldn't just stuff things in here
because they happen to be controlled via setrlimit currently. Also,
some limits might fit more appropriately in other subsystems. (E.g.
max locked memory should be a memcg field, and real-time priority
should be in the cpu subsystem if it's not already subsumed by
existing functionality). Grouping "rlimit" things together in a single
subsystem reduces flexibility, since you can't then mount them on
separate hierarchies. (This is actually related to one of my regrets
about the original implementation of cgroups - the cpuset subsystem
should have been split into a "cpunode" subsystem and a "memnode"
subsystem, since the two parts of cpusets had no requirement to be
located together - they were only linked since before cgroups there
was no way to mount them separately).
A lot of the rlimit values are more for the benefit of the process (to
prevent runaways) rather than for resource isolation - data segment
size, file size, stack size, pending signals, virtual memory limits
fall into that category, i think - they're all resource usage that
falls under existing cgroup resource limits, such as
memory.limit_in_bytes.
Task count is a little blurry in this regard - the main resources that
you can consume with a fork bomb are CPU cycles and memory, both of
which are already isolated by existing subsystems, so arguably there
shouldn't be a need to control the number of tasks itself. But I'm
prepared to believe that there are still bits of the kernel that have
arbitrary machine-wide limits that can be hit simply by forking a
massive number of processes, even if they're not using much memory or
CPU cycles.
So for this case, I'd suggest that the best option is to have a
numtasks subsystem with "count" and "limit" files. Future rlimit
options can go in their own subsystems or be attached to existing
subsystems if that makes sense.
Paul
On Sun, Jun 19, 2011 at 4:51 PM, Frederic Weisbecker <[email protected]> wrote:
> +static void rlim_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
> + struct cgroup *old_cgrp, struct task_struct *task)
> +{
> + rlim_remove_proc(old_cgrp);
> +}
Since this is used for both the exit callback and the attach callback,
it should have a more generic name.
> +static int rlim_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
> +{
> + ? ? ? struct rlim *rlim = cgroup_rlim(cgrp);
> + ? ? ? struct res_counter *limit_fail_at;
> +
> + ? ? ? return res_counter_charge(&rlim->proc_counter, 1, &limit_fail_at);
> +}
Can't this fail spuriously in the presence of hierarchies?
E.g. if cgroup A has children, and A is at its limit, then moving
tasks around between A and its children, or between different children
of A, seems like it would fail due to the temporary double counting.
Paul
On Sun, Jun 19, 2011 at 4:51 PM, Frederic Weisbecker <[email protected]> wrote:
> Make the cgroup subsystem fork callback return a value
> so that subsystems are able to accept or reject a fork
> completion with a custom error value.
This is unnecessary complexity in the cgroup subsystem (and seems to
miss cleanup for subsystems that have previously had their fork()
method return success).
If you want a subsystem to be able to reject a fork, I think it's
better to have that subsystem be called explicitly from do_fork(), and
keep that logic out of cgroups.
Paul
On Tue, Jun 21, 2011 at 10:08:26AM -0700, Paul Menage wrote:
> Hi Frederick,
>
> On Sun, Jun 19, 2011 at 4:51 PM, Frederic Weisbecker <[email protected]> wrote:
> > This starts a basic rlimit cgroup subsystem with only the
> > equivalent of RLIMIT_NPROC yet. This can be useful to limit
> > the global effects of a local fork bomb for example (local
> > in term of a cgroup).
>
> My general thoughts on this are:
>
> - do we really want an "rlimit" subsystem rather than grouping things
> functionally? We definitely shouldn't just stuff things in here
> because they happen to be controlled via setrlimit currently. Also,
> some limits might fit more appropriately in other subsystems. (E.g.
> max locked memory should be a memcg field, and real-time priority
> should be in the cpu subsystem if it's not already subsumed by
> existing functionality). Grouping "rlimit" things together in a single
> subsystem reduces flexibility, since you can't then mount them on
> separate hierarchies. (This is actually related to one of my regrets
> about the original implementation of cgroups - the cpuset subsystem
> should have been split into a "cpunode" subsystem and a "memnode"
> subsystem, since the two parts of cpusets had no requirement to be
> located together - they were only linked since before cgroups there
> was no way to mount them separately).
>
> A lot of the rlimit values are more for the benefit of the process (to
> prevent runaways) rather than for resource isolation - data segment
> size, file size, stack size, pending signals, virtual memory limits
> fall into that category, i think - they're all resource usage that
> falls under existing cgroup resource limits, such as
> memory.limit_in_bytes.
Yeah I all agree with you. To mimic rlmit inside a cgroup subsystem
would be a bad thing given we already have subsystems where some of
the rlimit options can fit and moreover your message made me read
again the part about hierarchies in cgroup documentation. I
eventually understood the idea/point of building parallel hierarchies with
different subsystems mounted in it, and thus eventually I understand
your point about the problem on flexibility implied by an everything-rlimit
subsystem.
> Task count is a little blurry in this regard - the main resources that
> you can consume with a fork bomb are CPU cycles and memory, both of
> which are already isolated by existing subsystems, so arguably there
> shouldn't be a need to control the number of tasks itself. But I'm
> prepared to believe that there are still bits of the kernel that have
> arbitrary machine-wide limits that can be hit simply by forking a
> massive number of processes, even if they're not using much memory or
> CPU cycles.
Yeah I've just asked Johannes Weiner about that and he told me
can't use memory limits for that as these don't handle kernel
memory.
> So for this case, I'd suggest that the best option is to have a
> numtasks subsystem with "count" and "limit" files. Future rlimit
> options can go in their own subsystems or be attached to existing
> subsystems if that makes sense.
Agreed about future rlimit options, but building a single purpose
numtask subsystem looks a bit strange. Just because it looks too much
single purpose. OTOH I can't figure out any other kind of future
limitation that should fit aside in a very similar topic, enough
that we wouldn't care about seperating both for flexibility.
So I guess I'm going to take that way.
Thanks!
On Tue, Jun 21, 2011 at 10:39:04AM -0700, Paul Menage wrote:
> On Sun, Jun 19, 2011 at 4:51 PM, Frederic Weisbecker <[email protected]> wrote:
> > Make the cgroup subsystem fork callback return a value
> > so that subsystems are able to accept or reject a fork
> > completion with a custom error value.
>
> This is unnecessary complexity in the cgroup subsystem (and seems to
> miss cleanup for subsystems that have previously had their fork()
> method return success).
No it seems only the freezer subsystem had this fork callback implemented.
But indeed it adds complexity because if a subsystem can cancel a fork,
then previous subsystems that called ->fork() would need to have a
kind cancel_fork() callback to call.
I haven't looked very deep but the freezer doesn't seem to need any
rollback. Future subsystems using the fork() callback may need it
though.
> If you want a subsystem to be able to reject a fork, I think it's
> better to have that subsystem be called explicitly from do_fork(), and
> keep that logic out of cgroups.
Ok I can do that.
On Tue, Jun 21, 2011 at 10:37:03AM -0700, Paul Menage wrote:
> On Sun, Jun 19, 2011 at 4:51 PM, Frederic Weisbecker <[email protected]> wrote:
> > +static void rlim_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
> > + struct cgroup *old_cgrp, struct task_struct *task)
> > +{
> > + rlim_remove_proc(old_cgrp);
> > +}
>
> Since this is used for both the exit callback and the attach callback,
> it should have a more generic name.
Right.
> > +static int rlim_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
> > +{
> > + ? ? ? struct rlim *rlim = cgroup_rlim(cgrp);
> > + ? ? ? struct res_counter *limit_fail_at;
> > +
> > + ? ? ? return res_counter_charge(&rlim->proc_counter, 1, &limit_fail_at);
> > +}
>
> Can't this fail spuriously in the presence of hierarchies?
>
> E.g. if cgroup A has children, and A is at its limit, then moving
> tasks around between A and its children, or between different children
> of A, seems like it would fail due to the temporary double counting.
Good point. Probably I should first uncharge the old cgroup and its parents.
On Thu, Jun 23, 2011 at 6:30 AM, Frederic Weisbecker <[email protected]> wrote:
> Agreed about future rlimit options, but building a single purpose
> numtask subsystem looks a bit strange. Just because it looks too much
> single purpose.
Admittedly most subsystems do have more knobs and features, but I
don't think it's unreasonable for a subsystem to provide just a single
feature. In theory the overhead is going to be very low.
Paul
On Thu, Jun 23, 2011 at 6:48 AM, Frederic Weisbecker <[email protected]> wrote:
>> Can't this fail spuriously in the presence of hierarchies?
>>
>> E.g. if cgroup A has children, and A is at its limit, then moving
>> tasks around between A and its children, or between different children
>> of A, seems like it would fail due to the temporary double counting.
>
> Good point. Probably I should first uncharge the old cgroup and its parents.
>
That can fail too - if a new task gets created between the uncharge
and the charge.
What we need is a res_counter_move_charge(A, B, amount) function which will:
- locate C, the nearest common ancestor of A and B
- lock up the chain from B up to but not including C, adding the new charge
- unlock up the chain from B to C
- uncharge along the chain from A up to but not including C (not sure
how much locking is needed there since there's no need for roll back).
Paul
Paul Menage <menage <at> google.com> writes:
> What we need is a res_counter_move_charge(A, B, amount) function which will:
>
> - locate C, the nearest common ancestor of A and B
> - lock up the chain from B up to but not including C, adding the new charge
> - unlock up the chain from B to C
> - uncharge along the chain from A up to but not including C (not sure
> how much locking is needed there since there's no need for roll back).
>
> Paul
>
Another alternative is to use the 'attach' callback in struct cgroup_subsys which
gets both the old cgroup and the new cgroup as parameters and do
rlim_remove_proc(old_cgrp) and res_counter_charge(new_cgrp) in this same
function under the protection of a spinlock.
It would be good to add a return value to the 'attach' callback too.
Frederic Weisbecker <fweisbec <at> gmail.com> writes:
> +struct rlim {
> + struct res_counter proc_counter;
> + struct cgroup_subsys_state css;
> +};
This can be used to enforce limits on FDs (RLIMIT_NOFILE), TCP/UDP ports,
locked memory, queued data on sockets, etc. How making proc_counter an
array (currently of size 1) ? Though with this patch there is just one, but it
will great to have ready support for easily adding more.
> +static struct cftype cft_max_proc = {
> + .name = "max_proc",
> + .read_u64 = max_proc_read_u64,
> + .write_u64 = max_proc_write_u64,
> +};
How about exporting USAGE, MAX_USAGE and FAILCNT too? These are useful
for resource estimation.