The current state of resource limitation for the number of open
processes (as well as the number of open file descriptors) requires you
to use setrlimit(2), which means that you are limited to resource
limiting process trees rather than resource limiting cgroups (which is
the point of cgroups).
There was a patch to implement this in 2011[1], but that was rejected
because it implemented a general-purpose rlimit subsystem -- which meant
that you couldn't control distinct resource limits in different
heirarchies. This patch implements a resource controller *specifically*
for the number of processes in a cgroup, overcoming this issue.
There has been a similar attempt to implement a resource controller for
the number of open file descriptors[2], which has not been merged
becasue the reasons were dubious. Merely from a "sane interface"
perspective, it should be possible to utilise cgroups to do such
rudimentary resource management (which currently only exists for process
trees).
Aleksa Sarai (2):
cgroups: allow a cgroup subsystem to reject a fork
cgroups: add an nproc subsystem
include/linux/cgroup.h | 9 ++-
include/linux/cgroup_subsys.h | 4 +
init/Kconfig | 10 +++
kernel/Makefile | 1 +
kernel/cgroup.c | 13 ++-
kernel/cgroup_freezer.c | 6 +-
kernel/cgroup_nproc.c | 181 ++++++++++++++++++++++++++++++++++++++++++
kernel/fork.c | 4 +-
kernel/sched/core.c | 3 +-
9 files changed, 221 insertions(+), 10 deletions(-)
create mode 100644 kernel/cgroup_nproc.c
[1]: https://lkml.org/lkml/2011/6/19/170
[2]: https://lkml.org/lkml/2014/7/2/640
--
2.3.0
NOTE: I'm not sure if I'm doing enough cleanup inside copy_process(),
because a bunch of stuff happens between the last valid goto to the
bad_fork_free_pid label and cgroup_post_fork().
What is the correct way of doing cleanup this late inside
copy_process()?
8<----------------------------------------------------------------------
Make the cgroup subsystem post fork callback return an error code so
that subsystems can accept or reject a fork from completing with a
custom error value.
This is in preparation for implementing the numtasks cgroup scope.
Signed-off-by: Aleksa Sarai <[email protected]>
---
include/linux/cgroup.h | 9 ++++++---
kernel/cgroup.c | 13 ++++++++++---
kernel/cgroup_freezer.c | 6 ++++--
kernel/fork.c | 4 +++-
kernel/sched/core.c | 3 ++-
5 files changed, 25 insertions(+), 10 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index da0dae0..91718ff 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -32,7 +32,7 @@ struct cgroup;
extern int cgroup_init_early(void);
extern int cgroup_init(void);
extern void cgroup_fork(struct task_struct *p);
-extern void cgroup_post_fork(struct task_struct *p);
+extern int cgroup_post_fork(struct task_struct *p);
extern void cgroup_exit(struct task_struct *p);
extern int cgroupstats_build(struct cgroupstats *stats,
struct dentry *dentry);
@@ -649,7 +649,7 @@ struct cgroup_subsys {
struct cgroup_taskset *tset);
void (*attach)(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset);
- void (*fork)(struct task_struct *task);
+ int (*fork)(struct task_struct *task);
void (*exit)(struct cgroup_subsys_state *css,
struct cgroup_subsys_state *old_css,
struct task_struct *task);
@@ -946,7 +946,10 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
static inline int cgroup_init_early(void) { return 0; }
static inline int cgroup_init(void) { return 0; }
static inline void cgroup_fork(struct task_struct *p) {}
-static inline void cgroup_post_fork(struct task_struct *p) {}
+static inline int cgroup_post_fork(struct task_struct *p)
+{
+ return 0;
+}
static inline void cgroup_exit(struct task_struct *p) {}
static inline int cgroupstats_build(struct cgroupstats *stats,
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 04cfe8a..82ecb6f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5191,7 +5191,7 @@ void cgroup_fork(struct task_struct *child)
* cgroup_task_iter_start() - to guarantee that the new task ends up on its
* list.
*/
-void cgroup_post_fork(struct task_struct *child)
+int cgroup_post_fork(struct task_struct *child)
{
struct cgroup_subsys *ss;
int i;
@@ -5236,10 +5236,17 @@ void cgroup_post_fork(struct task_struct *child)
* and addition to css_set.
*/
if (need_forkexit_callback) {
+ int ret;
+
for_each_subsys(ss, i)
- if (ss->fork)
- ss->fork(child);
+ if (ss->fork) {
+ ret = ss->fork(child);
+ if (ret)
+ return ret;
+ }
}
+
+ return 0;
}
/**
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 92b98cc..f5906b7 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -203,7 +203,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
* to do anything as freezer_attach() will put @task into the appropriate
* state.
*/
-static void freezer_fork(struct task_struct *task)
+static int freezer_fork(struct task_struct *task)
{
struct freezer *freezer;
@@ -215,7 +215,7 @@ static void freezer_fork(struct task_struct *task)
* right thing to do.
*/
if (task_css_is_root(task, freezer_cgrp_id))
- return;
+ return 0;
mutex_lock(&freezer_mutex);
rcu_read_lock();
@@ -226,6 +226,8 @@ static void freezer_fork(struct task_struct *task)
rcu_read_unlock();
mutex_unlock(&freezer_mutex);
+
+ return 0;
}
/**
diff --git a/kernel/fork.c b/kernel/fork.c
index 4dc2dda..ff12e23 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1541,7 +1541,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
- cgroup_post_fork(p);
+ retval = cgroup_post_fork(p);
+ if (retval)
+ goto bad_fork_free_pid;
if (clone_flags & CLONE_THREAD)
threadgroup_change_end(current);
perf_event_fork(p);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5eab11d..9b9f970 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8010,9 +8010,10 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
sched_offline_group(tg);
}
-static void cpu_cgroup_fork(struct task_struct *task)
+static int cpu_cgroup_fork(struct task_struct *task)
{
sched_move_task(task);
+ return 0;
}
static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
--
2.3.0
Adds a new single-purpose nproc subsystem to limit the number of
tasks that can run inside a cgroup. Essentially this is an
implementation of RLIMIT_NPROC that will applies to a cgroup rather than
a process tree.
This is a step to being able to limit the global impact of a fork bomb
inside a cgroup, allowing for cgroups to perform fairly basic resource
limitation which it currently doesn't have the capability to do.
Signed-off-by: Aleksa Sarai <[email protected]>
---
include/linux/cgroup_subsys.h | 4 +
init/Kconfig | 10 +++
kernel/Makefile | 1 +
kernel/cgroup_nproc.c | 181 ++++++++++++++++++++++++++++++++++++++++++
4 files changed, 196 insertions(+)
create mode 100644 kernel/cgroup_nproc.c
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 98c4f9b..e83e0ac 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -47,6 +47,10 @@ SUBSYS(net_prio)
SUBSYS(hugetlb)
#endif
+#if IS_ENABLED(CONFIG_CGROUP_NPROC)
+SUBSYS(nproc)
+#endif
+
/*
* The following subsystems are not supported on the default hierarchy.
*/
diff --git a/init/Kconfig b/init/Kconfig
index 9afb971..d6315fe 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1047,6 +1047,16 @@ config CGROUP_HUGETLB
control group is tracked in the third page lru pointer. This means
that we cannot use the controller with huge page less than 3 pages.
+config CGROUP_NPROC
+ bool "Process number limiting on cgroups"
+ depends on PAGE_COUNTER
+ help
+ This options enables the setting of process number limits in the scope
+ of a cgroup. Any attempt to fork more processes than is allowed in the
+ cgroup will fail. This allows for more basic resource limitation that
+ applies to a cgroup, similar to RLIMIT_NPROC (except that instead of
+ applying to a process tree it applies to a cgroup).
+
config CGROUP_PERF
bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
depends on PERF_EVENTS && CGROUPS
diff --git a/kernel/Makefile b/kernel/Makefile
index a59481a..10c4b40 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -52,6 +52,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
+obj-$(CONFIG_CGROUP_NPROC) += cgroup_nproc.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
diff --git a/kernel/cgroup_nproc.c b/kernel/cgroup_nproc.c
new file mode 100644
index 0000000..414d8d5
--- /dev/null
+++ b/kernel/cgroup_nproc.c
@@ -0,0 +1,181 @@
+/*
+ * Process number limiting subsys for cgroups.
+ *
+ * Copyright (C) 2015 Aleksa Sarai <[email protected]>
+ *
+ * Thanks to Frederic Weisbecker for creating the seminal patches which lead to
+ * this being written.
+ *
+ */
+
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/page_counter.h>
+
+struct nproc {
+ struct page_counter proc_counter;
+ struct cgroup_subsys_state css;
+};
+
+static inline struct nproc *css_nproc(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct nproc, css) : NULL;
+}
+
+static inline struct nproc *task_nproc(struct task_struct *task)
+{
+ return css_nproc(task_css(task, nproc_cgrp_id));
+}
+
+static struct nproc *parent_nproc(struct nproc *nproc)
+{
+ return css_nproc(nproc->css.parent);
+}
+
+static struct cgroup_subsys_state *nproc_css_alloc(struct cgroup_subsys_state *parent)
+{
+ struct nproc *nproc;
+
+ nproc = kzalloc(sizeof(struct nproc), GFP_KERNEL);
+ if (!nproc)
+ return ERR_PTR(-ENOMEM);
+
+ return &nproc->css;
+}
+
+static int nproc_css_online(struct cgroup_subsys_state *css)
+{
+ struct nproc *nproc = css_nproc(css);
+ struct nproc *parent = parent_nproc(nproc);
+
+ if (!parent) {
+ page_counter_init(&nproc->proc_counter, NULL);
+ return 0;
+ }
+
+ page_counter_init(&nproc->proc_counter, &parent->proc_counter);
+ return page_counter_limit(&nproc->proc_counter, parent->proc_counter.limit);
+}
+
+static void nproc_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_nproc(css));
+}
+
+static inline void nproc_remove_procs(struct nproc *nproc, int num_procs)
+{
+ page_counter_uncharge(&nproc->proc_counter, num_procs);
+}
+
+static inline int nproc_add_procs(struct nproc *nproc, int num_procs)
+{
+ struct page_counter *fail_at;
+ int errcode;
+
+ errcode = page_counter_try_charge(&nproc->proc_counter, num_procs, &fail_at);
+ if (errcode)
+ return -EAGAIN;
+
+ return 0;
+}
+
+static void nproc_cancel_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct nproc *nproc = css_nproc(css);
+ unsigned long num_tasks = 0;
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, tset)
+ num_tasks++;
+
+ nproc_remove_procs(nproc, num_tasks);
+}
+
+static int nproc_can_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct nproc *nproc = css_nproc(css);
+ unsigned long num_tasks = 0;
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, tset)
+ num_tasks++;
+
+ return nproc_add_procs(nproc, num_tasks);
+}
+
+static int nproc_fork(struct task_struct *task)
+{
+ struct nproc *nproc = task_nproc(task);
+
+ return nproc_add_procs(nproc, 1);
+}
+
+static void nproc_exit(struct cgroup_subsys_state *css,
+ struct cgroup_subsys_state *old_css,
+ struct task_struct *task)
+{
+ struct nproc *nproc = css_nproc(old_css);
+
+ nproc_remove_procs(nproc, 1);
+}
+
+static int nproc_write_limit(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
+{
+ struct nproc *nproc = css_nproc(css);
+
+ return page_counter_limit(&nproc->proc_counter, val);
+}
+
+static u64 nproc_read_limit(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct nproc *nproc = css_nproc(css);
+
+ return nproc->proc_counter.limit;
+}
+
+static u64 nproc_read_max_limit(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return PAGE_COUNTER_MAX;
+}
+
+static u64 nproc_read_usage(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct nproc *nproc = css_nproc(css);
+
+ return page_counter_read(&nproc->proc_counter);
+}
+
+static struct cftype files[] = {
+ {
+ .name = "limit",
+ .write_u64 = nproc_write_limit,
+ .read_u64 = nproc_read_limit,
+ },
+ {
+ .name = "max_limit",
+ .read_u64 = nproc_read_max_limit,
+ },
+ {
+ .name = "usage",
+ .read_u64 = nproc_read_usage,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys nproc_cgrp_subsys = {
+ .css_alloc = nproc_css_alloc,
+ .css_online = nproc_css_online,
+ .css_free = nproc_css_free,
+ .can_attach = nproc_can_attach,
+ .cancel_attach = nproc_cancel_attach,
+ .fork = nproc_fork,
+ .exit = nproc_exit,
+ .legacy_cftypes = files,
+ .early_init = 0,
+};
--
2.3.0
On Mon, Feb 23, 2015 at 02:08:10PM +1100, Aleksa Sarai wrote:
> NOTE: I'm not sure if I'm doing enough cleanup inside copy_process(),
> because a bunch of stuff happens between the last valid goto to the
> bad_fork_free_pid label and cgroup_post_fork().
>
> What is the correct way of doing cleanup this late inside
> copy_process()?
Its not; you're past the point of fail. You've already exposed the new
process.
If you want to allow fail, you'll have to do it earlier.
This is an updated version of the nproc patchset[1], in which the forking
cleanup issue has been resolved by adding can_fork and cancel_fork
callbacks to cgroup subsystems. The can_fork callback is run early
enough that it doesn't get called after the "point of no return" where
the process is exposed (which is when fork) is called, and cancel_fork
is run during the cleanup of copy_process if the fork fails due to other
reasons.
[1]: https://lkml.org/lkml/2015/2/22/204
Aleksa Sarai (2):
cgroups: allow a cgroup subsystem to reject a fork
cgroups: add an nproc subsystem
include/linux/cgroup.h | 9 ++
include/linux/cgroup_subsys.h | 4 +
init/Kconfig | 10 +++
kernel/Makefile | 1 +
kernel/cgroup.c | 80 +++++++++++++----
kernel/cgroup_nproc.c | 198 ++++++++++++++++++++++++++++++++++++++++++
kernel/fork.c | 12 ++-
7 files changed, 296 insertions(+), 18 deletions(-)
create mode 100644 kernel/cgroup_nproc.c
--
2.3.1
Add a new cgroup subsystem callback can_fork that conditionally
states whether or not the fork is accepted or rejected with a cgroup
policy.
Make the cgroup subsystem can_fork callback return an error code so
that subsystems can accept or reject a fork from completing with a
custom error value, before the process is exposed.
In addition, add a cancel_fork callback so that if an error occurs later
in the forking process, any state modified by can_fork can be reverted.
In order for can_fork to deal with a task that has an accurate css_set,
move the css_set updating to cgroup_fork (where it belongs).
This is in preparation for implementing the nproc cgroup subsystem.
Signed-off-by: Aleksa Sarai <[email protected]>
---
include/linux/cgroup.h | 9 ++++++
kernel/cgroup.c | 80 +++++++++++++++++++++++++++++++++++++++-----------
kernel/fork.c | 12 +++++++-
3 files changed, 83 insertions(+), 18 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index da0dae0..9897533 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -32,6 +32,8 @@ struct cgroup;
extern int cgroup_init_early(void);
extern int cgroup_init(void);
extern void cgroup_fork(struct task_struct *p);
+extern int cgroup_can_fork(struct task_struct *p);
+extern void cgroup_cancel_fork(struct task_struct *p);
extern void cgroup_post_fork(struct task_struct *p);
extern void cgroup_exit(struct task_struct *p);
extern int cgroupstats_build(struct cgroupstats *stats,
@@ -649,6 +651,8 @@ struct cgroup_subsys {
struct cgroup_taskset *tset);
void (*attach)(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset);
+ int (*can_fork)(struct task_struct *task);
+ void (*cancel_fork)(struct task_struct *task);
void (*fork)(struct task_struct *task);
void (*exit)(struct cgroup_subsys_state *css,
struct cgroup_subsys_state *old_css,
@@ -946,6 +950,11 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
static inline int cgroup_init_early(void) { return 0; }
static inline int cgroup_init(void) { return 0; }
static inline void cgroup_fork(struct task_struct *p) {}
+static inline int cgroup_can_fork(struct task_struct *p)
+{
+ return 0;
+}
+static inline void cgroup_cancel_fork(struct task_struct *p) {}
static inline void cgroup_post_fork(struct task_struct *p) {}
static inline void cgroup_exit(struct task_struct *p) {}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 04cfe8a..f062350 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4928,7 +4928,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
* init_css_set is in the subsystem's root cgroup. */
init_css_set.subsys[ss->id] = css;
- need_forkexit_callback |= ss->fork || ss->exit;
+ need_forkexit_callback |= ss->can_fork || ss->cancel_fork || ss->fork || ss->exit;
/* At system boot, before all subsystems have been
* registered, no tasks have been forked, so we don't
@@ -5179,22 +5179,6 @@ void cgroup_fork(struct task_struct *child)
{
RCU_INIT_POINTER(child->cgroups, &init_css_set);
INIT_LIST_HEAD(&child->cg_list);
-}
-
-/**
- * cgroup_post_fork - called on a new task after adding it to the task list
- * @child: the task in question
- *
- * Adds the task to the list running through its css_set if necessary and
- * call the subsystem fork() callbacks. Has to be after the task is
- * visible on the task list in case we race with the first call to
- * cgroup_task_iter_start() - to guarantee that the new task ends up on its
- * list.
- */
-void cgroup_post_fork(struct task_struct *child)
-{
- struct cgroup_subsys *ss;
- int i;
/*
* This may race against cgroup_enable_task_cg_lists(). As that
@@ -5229,6 +5213,68 @@ void cgroup_post_fork(struct task_struct *child)
}
up_write(&css_set_rwsem);
}
+}
+
+/**
+ * cgroup_can_fork - called on a new task before the process is exposed.
+ * @child: the task in question.
+ *
+ * This calls the subsystem can_fork() callbacks. If the can_fork() callback
+ * returns an error, the fork aborts with that error code. This allows for
+ * a cgroup subsystem to conditionally allow or deny new forks.
+ */
+int cgroup_can_fork(struct task_struct *child)
+{
+ struct cgroup_subsys *ss;
+ int i;
+
+ if (need_forkexit_callback) {
+ int retval;
+
+ for_each_subsys(ss, i)
+ if (ss->can_fork) {
+ retval = ss->can_fork(child);
+ if (retval)
+ return retval;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
+ * @child: the task in question
+ *
+ * This calls the cancel_fork() callbacks if a fork failed *after*
+ * cgroup_can_fork() succeded.
+ */
+void cgroup_cancel_fork(struct task_struct *child)
+{
+ struct cgroup_subsys *ss;
+ int i;
+
+ if (need_forkexit_callback) {
+ for_each_subsys(ss, i)
+ if (ss->cancel_fork)
+ ss->cancel_fork(child);
+ }
+}
+
+/**
+ * cgroup_post_fork - called on a new task after adding it to the task list
+ * @child: the task in question
+ *
+ * Adds the task to the list running through its css_set if necessary and
+ * call the subsystem fork() callbacks. Has to be after the task is
+ * visible on the task list in case we race with the first call to
+ * cgroup_task_iter_start() - to guarantee that the new task ends up on its
+ * list.
+ */
+void cgroup_post_fork(struct task_struct *child)
+{
+ struct cgroup_subsys *ss;
+ int i;
/*
* Call ss->fork(). This must happen after @child is linked on
diff --git a/kernel/fork.c b/kernel/fork.c
index 4dc2dda..e84ce86 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1464,6 +1464,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->task_works = NULL;
/*
+ * Ensure that the cgroup subsystem policies allow the new process to be
+ * forked.
+ */
+ retval = cgroup_can_fork(p);
+ if (retval)
+ goto bad_fork_free_pid;
+
+ /*
* Make it visible to the rest of the system, but dont wake it up yet.
* Need tasklist lock for parent etc handling!
*/
@@ -1499,7 +1507,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
spin_unlock(¤t->sighand->siglock);
write_unlock_irq(&tasklist_lock);
retval = -ERESTARTNOINTR;
- goto bad_fork_free_pid;
+ goto bad_fork_cgroup_cancel;
}
if (likely(p->pid)) {
@@ -1551,6 +1559,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
return p;
+bad_fork_cgroup_cancel:
+ cgroup_cancel_fork(p);
bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
--
2.3.1
Adds a new single-purpose nproc subsystem to limit the number of
tasks that can run inside a cgroup. Essentially this is an
implementation of RLIMIT_NPROC that will applies to a cgroup rather than
a process tree.
This is a step to being able to limit the global impact of a fork bomb
inside a cgroup, allowing for cgroups to perform fairly basic resource
limitation which it currently doesn't have the capability to do.
Signed-off-by: Aleksa Sarai <[email protected]>
---
include/linux/cgroup_subsys.h | 4 +
init/Kconfig | 10 +++
kernel/Makefile | 1 +
kernel/cgroup_nproc.c | 198 ++++++++++++++++++++++++++++++++++++++++++
4 files changed, 213 insertions(+)
create mode 100644 kernel/cgroup_nproc.c
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 98c4f9b..e83e0ac 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -47,6 +47,10 @@ SUBSYS(net_prio)
SUBSYS(hugetlb)
#endif
+#if IS_ENABLED(CONFIG_CGROUP_NPROC)
+SUBSYS(nproc)
+#endif
+
/*
* The following subsystems are not supported on the default hierarchy.
*/
diff --git a/init/Kconfig b/init/Kconfig
index 9afb971..d6315fe 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1047,6 +1047,16 @@ config CGROUP_HUGETLB
control group is tracked in the third page lru pointer. This means
that we cannot use the controller with huge page less than 3 pages.
+config CGROUP_NPROC
+ bool "Process number limiting on cgroups"
+ depends on PAGE_COUNTER
+ help
+ This options enables the setting of process number limits in the scope
+ of a cgroup. Any attempt to fork more processes than is allowed in the
+ cgroup will fail. This allows for more basic resource limitation that
+ applies to a cgroup, similar to RLIMIT_NPROC (except that instead of
+ applying to a process tree it applies to a cgroup).
+
config CGROUP_PERF
bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
depends on PERF_EVENTS && CGROUPS
diff --git a/kernel/Makefile b/kernel/Makefile
index a59481a..10c4b40 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -52,6 +52,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
+obj-$(CONFIG_CGROUP_NPROC) += cgroup_nproc.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
diff --git a/kernel/cgroup_nproc.c b/kernel/cgroup_nproc.c
new file mode 100644
index 0000000..86de0fe
--- /dev/null
+++ b/kernel/cgroup_nproc.c
@@ -0,0 +1,198 @@
+/*
+ * Process number limiting subsys for cgroups.
+ *
+ * Copyright (C) 2015 Aleksa Sarai <[email protected]>
+ *
+ * Thanks to Frederic Weisbecker for creating the seminal patches which lead to
+ * this being written.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/page_counter.h>
+
+struct nproc {
+ struct page_counter proc_counter;
+ struct cgroup_subsys_state css;
+};
+
+static inline struct nproc *css_nproc(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct nproc, css) : NULL;
+}
+
+static inline struct nproc *task_nproc(struct task_struct *task)
+{
+ return css_nproc(task_css(task, nproc_cgrp_id));
+}
+
+static struct nproc *parent_nproc(struct nproc *nproc)
+{
+ return css_nproc(nproc->css.parent);
+}
+
+static struct cgroup_subsys_state *nproc_css_alloc(struct cgroup_subsys_state *parent)
+{
+ struct nproc *nproc;
+
+ nproc = kzalloc(sizeof(struct nproc), GFP_KERNEL);
+ if (!nproc)
+ return ERR_PTR(-ENOMEM);
+
+ return &nproc->css;
+}
+
+static int nproc_css_online(struct cgroup_subsys_state *css)
+{
+ struct nproc *nproc = css_nproc(css);
+ struct nproc *parent = parent_nproc(nproc);
+
+ if (!parent) {
+ page_counter_init(&nproc->proc_counter, NULL);
+ return 0;
+ }
+
+ page_counter_init(&nproc->proc_counter, &parent->proc_counter);
+ return page_counter_limit(&nproc->proc_counter, parent->proc_counter.limit);
+}
+
+static void nproc_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_nproc(css));
+}
+
+static inline void nproc_remove_procs(struct nproc *nproc, int num_procs)
+{
+ page_counter_uncharge(&nproc->proc_counter, num_procs);
+}
+
+static inline int nproc_add_procs(struct nproc *nproc, int num_procs)
+{
+ struct page_counter *fail_at;
+ int errcode;
+
+ errcode = page_counter_try_charge(&nproc->proc_counter, num_procs, &fail_at);
+ if (errcode)
+ return -EAGAIN;
+
+ return 0;
+}
+
+static int nproc_can_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct nproc *nproc = css_nproc(css);
+ unsigned long num_tasks = 0;
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, tset)
+ num_tasks++;
+
+ return nproc_add_procs(nproc, num_tasks);
+}
+
+static void nproc_cancel_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct nproc *nproc = css_nproc(css);
+ unsigned long num_tasks = 0;
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, tset)
+ num_tasks++;
+
+ nproc_remove_procs(nproc, num_tasks);
+}
+
+static int nproc_can_fork(struct task_struct *task)
+{
+ struct nproc *nproc = task_nproc(task);
+
+ return nproc_add_procs(nproc, 1);
+}
+
+static void nproc_cancel_fork(struct task_struct *task)
+{
+ struct nproc *nproc = task_nproc(task);
+
+ nproc_remove_procs(nproc, 1);
+}
+
+static void nproc_exit(struct cgroup_subsys_state *css,
+ struct cgroup_subsys_state *old_css,
+ struct task_struct *task)
+{
+ struct nproc *nproc = css_nproc(old_css);
+
+ /*
+ * cgroup_exit() gets called as part of the cleanup code when copy_process()
+ * fails. This should ignored, because the nproc_cancel_fork callback already
+ * deals with the cgroup failed fork case.
+ */
+ if (!(task->flags & PF_EXITING))
+ return;
+
+ nproc_remove_procs(nproc, 1);
+}
+
+static int nproc_write_limit(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
+{
+ struct nproc *nproc = css_nproc(css);
+
+ return page_counter_limit(&nproc->proc_counter, val);
+}
+
+static u64 nproc_read_limit(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct nproc *nproc = css_nproc(css);
+
+ return nproc->proc_counter.limit;
+}
+
+static u64 nproc_read_max_limit(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return PAGE_COUNTER_MAX;
+}
+
+static u64 nproc_read_usage(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct nproc *nproc = css_nproc(css);
+
+ return page_counter_read(&nproc->proc_counter);
+}
+
+static struct cftype files[] = {
+ {
+ .name = "limit",
+ .write_u64 = nproc_write_limit,
+ .read_u64 = nproc_read_limit,
+ },
+ {
+ .name = "max_limit",
+ .read_u64 = nproc_read_max_limit,
+ },
+ {
+ .name = "usage",
+ .read_u64 = nproc_read_usage,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys nproc_cgrp_subsys = {
+ .css_alloc = nproc_css_alloc,
+ .css_online = nproc_css_online,
+ .css_free = nproc_css_free,
+ .can_attach = nproc_can_attach,
+ .cancel_attach = nproc_cancel_attach,
+ .can_fork = nproc_can_fork,
+ .cancel_fork = nproc_cancel_fork,
+ .exit = nproc_exit,
+ .legacy_cftypes = files,
+ .early_init = 0,
+};
--
2.3.1
Hello,
On Mon, Feb 23, 2015 at 02:08:09PM +1100, Aleksa Sarai wrote:
> The current state of resource limitation for the number of open
> processes (as well as the number of open file descriptors) requires you
> to use setrlimit(2), which means that you are limited to resource
> limiting process trees rather than resource limiting cgroups (which is
> the point of cgroups).
>
> There was a patch to implement this in 2011[1], but that was rejected
> because it implemented a general-purpose rlimit subsystem -- which meant
> that you couldn't control distinct resource limits in different
> heirarchies. This patch implements a resource controller *specifically*
> for the number of processes in a cgroup, overcoming this issue.
>
> There has been a similar attempt to implement a resource controller for
> the number of open file descriptors[2], which has not been merged
> becasue the reasons were dubious. Merely from a "sane interface"
> perspective, it should be possible to utilise cgroups to do such
> rudimentary resource management (which currently only exists for process
> trees).
This isn't a proper resource to control. kmemcg just grew proper
reclaim support and will be useable to control kernel side of memory
consumption.
Thanks.
--
tejun
Tejun,
Am 27.02.2015 um 12:49 schrieb Tejun Heo:
> This isn't a proper resource to control. kmemcg just grew proper
> reclaim support and will be useable to control kernel side of memory
> consumption.
just to make sure that I understand the big picture.
The plan is to limit kernel memory per cgroup such that fork bombs and
stuff cannot harm other groups of processes?
Thanks,
//richard
Hello,
On Fri, Feb 27, 2015 at 02:46:13PM +0100, Richard Weinberger wrote:
> just to make sure that I understand the big picture.
> The plan is to limit kernel memory per cgroup such that fork bombs and
> stuff cannot harm other groups of processes?
Yes, the kmem part of memcg hasn't really been functional because the
reclaim part was broken and (partially conseqently) kmem config being
siloed from the rest but we're very close to solving that at this
point.
Thanks.
--
tejun
On 2015-02-27 06:49, Tejun Heo wrote:
> Hello,
>
> On Mon, Feb 23, 2015 at 02:08:09PM +1100, Aleksa Sarai wrote:
>> The current state of resource limitation for the number of open
>> processes (as well as the number of open file descriptors) requires you
>> to use setrlimit(2), which means that you are limited to resource
>> limiting process trees rather than resource limiting cgroups (which is
>> the point of cgroups).
>>
>> There was a patch to implement this in 2011[1], but that was rejected
>> because it implemented a general-purpose rlimit subsystem -- which meant
>> that you couldn't control distinct resource limits in different
>> heirarchies. This patch implements a resource controller *specifically*
>> for the number of processes in a cgroup, overcoming this issue.
>>
>> There has been a similar attempt to implement a resource controller for
>> the number of open file descriptors[2], which has not been merged
>> becasue the reasons were dubious. Merely from a "sane interface"
>> perspective, it should be possible to utilise cgroups to do such
>> rudimentary resource management (which currently only exists for process
>> trees).
>
> This isn't a proper resource to control. kmemcg just grew proper
> reclaim support and will be useable to control kernel side of memory
> consumption.
>
> Thanks.
>
Kernel memory consumption isn't the only valid reason to want to limit
the number of processes in a cgroup. Limiting the number of processes
is very useful to ensure that a program is working correctly (for
example, the NTP daemon should (usually) have an _exact_ number of
children if it is functioning correctly, and rpcbind shouldn't (AFAIK)
ever have _any_ children), to prevent PID number exhaustion, to head off
DoS attacks against forking network servers before they get to the point
of causing kmem exhaustion, and to limit the number of processes in a
cgroup that uses lots of kernel memory very infrequently.
Hello,
On Fri, Feb 27, 2015 at 11:42:10AM -0500, Austin S Hemmelgarn wrote:
> Kernel memory consumption isn't the only valid reason to want to limit the
> number of processes in a cgroup. Limiting the number of processes is very
> useful to ensure that a program is working correctly (for example, the NTP
> daemon should (usually) have an _exact_ number of children if it is
> functioning correctly, and rpcbind shouldn't (AFAIK) ever have _any_
> children), to prevent PID number exhaustion, to head off DoS attacks against
> forking network servers before they get to the point of causing kmem
> exhaustion, and to limit the number of processes in a cgroup that uses lots
> of kernel memory very infrequently.
All the use cases you're listing are extremely niche and can be
trivially achieved without introducing another cgroup controller. Not
only that, they're actually pretty silly. Let's say NTP daemon is
misbehaving (or its code changed w/o you knowing or there are corner
cases which trigger extremely infrequently). What do you exactly
achieve by rejecting its fork call? It's just adding another
variation to the misbehavior. It was misbehaving before and would now
be continuing to misbehave after a failed fork.
In general, I'm pretty strongly against adding controllers for things
which aren't fundamental resources in the system. What's next? Open
files? Pipe buffer? Number of flocks? Number of session leaders or
program groups?
If you want to prevent a certain class of jobs from exhausting a given
resource, protecting that resource is the obvious thing to do.
Thanks.
--
tejun
On Fri, Feb 27, 2015 at 8:42 AM, Austin S Hemmelgarn
<[email protected]> wrote:
> On 2015-02-27 06:49, Tejun Heo wrote:
>>
>> Hello,
>>
>> On Mon, Feb 23, 2015 at 02:08:09PM +1100, Aleksa Sarai wrote:
>>>
>>> The current state of resource limitation for the number of open
>>> processes (as well as the number of open file descriptors) requires you
>>> to use setrlimit(2), which means that you are limited to resource
>>> limiting process trees rather than resource limiting cgroups (which is
>>> the point of cgroups).
>>>
>>> There was a patch to implement this in 2011[1], but that was rejected
>>> because it implemented a general-purpose rlimit subsystem -- which meant
>>> that you couldn't control distinct resource limits in different
>>> heirarchies. This patch implements a resource controller *specifically*
>>> for the number of processes in a cgroup, overcoming this issue.
>>>
>>> There has been a similar attempt to implement a resource controller for
>>> the number of open file descriptors[2], which has not been merged
>>> becasue the reasons were dubious. Merely from a "sane interface"
>>> perspective, it should be possible to utilise cgroups to do such
>>> rudimentary resource management (which currently only exists for process
>>> trees).
>>
>>
>> This isn't a proper resource to control. kmemcg just grew proper
>> reclaim support and will be useable to control kernel side of memory
>> consumption.
I was told that the plan was to use kmemcg - but I was told that YEARS
AGO. In the mean time we all either do our own thing or we do nothing
and suffer.
Something like this is long overdue, IMO, and is still more
appropriate and obvious than kmemcg anyway.
>> Thanks.
>>
> Kernel memory consumption isn't the only valid reason to want to limit the
> number of processes in a cgroup. Limiting the number of processes is very
> useful to ensure that a program is working correctly (for example, the NTP
> daemon should (usually) have an _exact_ number of children if it is
> functioning correctly, and rpcbind shouldn't (AFAIK) ever have _any_
> children), to prevent PID number exhaustion, to head off DoS attacks against
> forking network servers before they get to the point of causing kmem
> exhaustion, and to limit the number of processes in a cgroup that uses lots
> of kernel memory very infrequently.
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
On Fri, Feb 27, 2015 at 09:12:45AM -0800, Tim Hockin wrote:
> I was told that the plan was to use kmemcg - but I was told that YEARS
> AGO. In the mean time we all either do our own thing or we do nothing
> and suffer.
Wasn't it like a year ago? Yeah, it's taking longer than everybody
hoped but seriously kmemcg reclaimer just got merged and also did the
new memcg interface which will tie kmemcg and memcg together.
> Something like this is long overdue, IMO, and is still more
> appropriate and obvious than kmemcg anyway.
Thanks for chiming in again but if you aren't bringing out anything
new to the table (I don't remember you doing that last time either),
I'm not sure why the decision would be different this time.
Thanks.
--
tejun
On Fri, Feb 27, 2015 at 9:06 AM, Tejun Heo <[email protected]> wrote:
> Hello,
>
> On Fri, Feb 27, 2015 at 11:42:10AM -0500, Austin S Hemmelgarn wrote:
>> Kernel memory consumption isn't the only valid reason to want to limit the
>> number of processes in a cgroup. Limiting the number of processes is very
>> useful to ensure that a program is working correctly (for example, the NTP
>> daemon should (usually) have an _exact_ number of children if it is
>> functioning correctly, and rpcbind shouldn't (AFAIK) ever have _any_
>> children), to prevent PID number exhaustion, to head off DoS attacks against
>> forking network servers before they get to the point of causing kmem
>> exhaustion, and to limit the number of processes in a cgroup that uses lots
>> of kernel memory very infrequently.
>
> All the use cases you're listing are extremely niche and can be
> trivially achieved without introducing another cgroup controller. Not
> only that, they're actually pretty silly. Let's say NTP daemon is
> misbehaving (or its code changed w/o you knowing or there are corner
> cases which trigger extremely infrequently). What do you exactly
> achieve by rejecting its fork call? It's just adding another
> variation to the misbehavior. It was misbehaving before and would now
> be continuing to misbehave after a failed fork.
>
> In general, I'm pretty strongly against adding controllers for things
> which aren't fundamental resources in the system. What's next? Open
> files? Pipe buffer? Number of flocks? Number of session leaders or
> program groups?
Yes to some or all of those. We do exactly this internally and it has
greatly added to the stability of our overall container management
system. and while you have been telling everyone to wait for kmemcg,
we have had an extra 3+ years of stability.
> If you want to prevent a certain class of jobs from exhausting a given
> resource, protecting that resource is the obvious thing to do.
I don't follow your argument - isn't this exactly what this patch set
is doing - protecting resources?
> Wasn't it like a year ago? Yeah, it's taking longer than everybody
> hoped but seriously kmemcg reclaimer just got merged and also did the
> new memcg interface which will tie kmemcg and memcg together.
By my email it was almost 2 years ago, and that was the second or
third incarnation of this patch.
>> Something like this is long overdue, IMO, and is still more
>> appropriate and obvious than kmemcg anyway.
>
> Thanks for chiming in again but if you aren't bringing out anything
> new to the table (I don't remember you doing that last time either),
> I'm not sure why the decision would be different this time.
I'm just vocalizing my support for this idea in defense of practical
solutions that work NOW instead of "engineering ideals" that never
actually arrive.
As containers take the server world by storm, stuff like this gets
more and more important.
Tim
On Fri, Feb 27, 2015 at 09:25:10AM -0800, Tim Hockin wrote:
> > In general, I'm pretty strongly against adding controllers for things
> > which aren't fundamental resources in the system. What's next? Open
> > files? Pipe buffer? Number of flocks? Number of session leaders or
> > program groups?
>
> Yes to some or all of those. We do exactly this internally and it has
> greatly added to the stability of our overall container management
> system. and while you have been telling everyone to wait for kmemcg,
> we have had an extra 3+ years of stability.
Yeah, good job. I totally get why kernel part of memory consumption
needs protection. I'm not arguing against that at all.
> > If you want to prevent a certain class of jobs from exhausting a given
> > resource, protecting that resource is the obvious thing to do.
>
> I don't follow your argument - isn't this exactly what this patch set
> is doing - protecting resources?
If you have proper protection over kernel memory consumption, this is
completely covered because memory is the fundamental resource here.
Controlling distribution of those fundamental resources is what
cgroups are primarily about.
> > Wasn't it like a year ago? Yeah, it's taking longer than everybody
> > hoped but seriously kmemcg reclaimer just got merged and also did the
> > new memcg interface which will tie kmemcg and memcg together.
>
> By my email it was almost 2 years ago, and that was the second or
> third incarnation of this patch.
Again, I agree this is taking a while. Memory people had to retool
the whole reclamation path to make this work, which is the pattern
being repeated across the different controllers - we're refactoring a
lot of infrastructure code so that resource control can integrate with
the regular operation of the kernel, which BTW is what we should have
been doing from the beginning.
If your complaint is that this is taking too long, I hear you, and
there's a certain amount of validity in arguing that upstreaming a
temporary measure is the better trade-off, but the rationale for nproc
(or nfds, or virtual memory, whatever) has been pretty weak otherwise.
And as for the different incarnations of this patchset. Reposting the
same stuff repeatedly doesn't really change anything. Why would it?
> >> Something like this is long overdue, IMO, and is still more
> >> appropriate and obvious than kmemcg anyway.
> >
> > Thanks for chiming in again but if you aren't bringing out anything
> > new to the table (I don't remember you doing that last time either),
> > I'm not sure why the decision would be different this time.
>
> I'm just vocalizing my support for this idea in defense of practical
> solutions that work NOW instead of "engineering ideals" that never
> actually arrive.
>
> As containers take the server world by storm, stuff like this gets
> more and more important.
Again, protection of kernel side memory consumption is important.
There's no question about that. As for the never-arriving part, well,
it is arriving. If you still can't believe, just take a look at the
code.
Thanks.
--
tejun
On Fri, Feb 27, 2015 at 12:45:03PM -0500, Tejun Heo wrote:
> If your complaint is that this is taking too long, I hear you, and
> there's a certain amount of validity in arguing that upstreaming a
> temporary measure is the better trade-off, but the rationale for nproc
> (or nfds, or virtual memory, whatever) has been pretty weak otherwise.
Also, note that this is subset of a larger problem. e.g. there's a
patchset trying to implement writeback IO control from the filesystem
layer. cgroup control of writeback has been a thorny issue for over
three years now and the rationale for implementing this reversed
controlling scheme is about the same - doing it properly is too
difficult, let's bolt something on the top as a practical measure.
I think it'd be seriously short-sighted to give in and merge all
those. These sorts of shortcuts are crippling in the long term.
Again, similarly, proper cgroup writeback support is literally right
around the corner.
The situation sure can be frustrating if you need something now but we
can't make decisions solely on that. This is an a lot longer term
project and we better, for once, get things right.
Thanks.
--
tejun
On 2015-02-27 12:06, Tejun Heo wrote:
> Hello,
>
> On Fri, Feb 27, 2015 at 11:42:10AM -0500, Austin S Hemmelgarn wrote:
>> Kernel memory consumption isn't the only valid reason to want to limit the
>> number of processes in a cgroup. Limiting the number of processes is very
>> useful to ensure that a program is working correctly (for example, the NTP
>> daemon should (usually) have an _exact_ number of children if it is
>> functioning correctly, and rpcbind shouldn't (AFAIK) ever have _any_
>> children), to prevent PID number exhaustion, to head off DoS attacks against
>> forking network servers before they get to the point of causing kmem
>> exhaustion, and to limit the number of processes in a cgroup that uses lots
>> of kernel memory very infrequently.
>
> All the use cases you're listing are extremely niche and can be
> trivially achieved without introducing another cgroup controller. Not
> only that, they're actually pretty silly. Let's say NTP daemon is
> misbehaving (or its code changed w/o you knowing or there are corner
> cases which trigger extremely infrequently). What do you exactly
> achieve by rejecting its fork call? It's just adding another
> variation to the misbehavior. It was misbehaving before and would now
> be continuing to misbehave after a failed fork.
>
I wouldn't think that preventing PID exhaustion would be all that much
of a niche case, it's fully possible for it to happen without using
excessive amounts of kernel memory (think about BIG server systems with
terabytes of memory running (arguably poorly written) forking servers
that handle tens of thousands of client requests per second, each
lasting multiple tens of seconds), and not necessarily as trivial as you
might think to handle sanely (especially if you want callbacks when the
limits get hit).
As far as being trivial to achieve, I'm assuming you are referring to
rlimit and PAM's limits module, both of which have their own issues.
Using pam_limits.so to limit processes isn't trivial because it requires
calling through PAM to begin with, which almost no software that isn't
login related does, and rlimits are tricky to set up properly with the
granularity that having a cgroup would provide.
> In general, I'm pretty strongly against adding controllers for things
> which aren't fundamental resources in the system. What's next? Open
> files? Pipe buffer? Number of flocks? Number of session leaders or
> program groups?
>
PID's are a fundamental resource, you run out and it's an only
marginally better situation than OOM, namely, if you don't already have
a shell open which has kill builtin (because you can't fork), or have
some other reliable way to terminate processes without forking, you are
stuck either waiting for the problem to resolve itself, or have to reset
the system.
> If you want to prevent a certain class of jobs from exhausting a given
> resource, protecting that resource is the obvious thing to do.
>
Which is why I'm advocating something that provides a more robust method
of preventing the system from exhausting PID numbers.
> Thanks.
>
Hello, Austin.
On Fri, Feb 27, 2015 at 01:49:53PM -0500, Austin S Hemmelgarn wrote:
> As far as being trivial to achieve, I'm assuming you are referring to rlimit
> and PAM's limits module, both of which have their own issues. Using
> pam_limits.so to limit processes isn't trivial because it requires calling
> through PAM to begin with, which almost no software that isn't login related
> does, and rlimits are tricky to set up properly with the granularity that
> having a cgroup would provide.
...
> PID's are a fundamental resource, you run out and it's an only marginally
> better situation than OOM, namely, if you don't already have a shell open
> which has kill builtin (because you can't fork), or have some other reliable
> way to terminate processes without forking, you are stuck either waiting for
> the problem to resolve itself, or have to reset the system.
Right, this is an a lot more valid argument. Currently, we're capping
max pid at 4M which translates to some tens of gigs of memory which
isn't a crazy amount on modern machines. The hard(er) barrier would
be around 2^30 (2^29 from futex side, apparently) which would also be
reacheable on configurations w/ terabytes of memory.
I'll think more about it and get back.
Thanks a lot.
--
tejun
On Fri, Feb 27, 2015 at 9:45 AM, Tejun Heo <[email protected]> wrote:
> On Fri, Feb 27, 2015 at 09:25:10AM -0800, Tim Hockin wrote:
>> > In general, I'm pretty strongly against adding controllers for things
>> > which aren't fundamental resources in the system. What's next? Open
>> > files? Pipe buffer? Number of flocks? Number of session leaders or
>> > program groups?
>>
>> Yes to some or all of those. We do exactly this internally and it has
>> greatly added to the stability of our overall container management
>> system. and while you have been telling everyone to wait for kmemcg,
>> we have had an extra 3+ years of stability.
>
> Yeah, good job. I totally get why kernel part of memory consumption
> needs protection. I'm not arguing against that at all.
You keep shifting the focus to be about memory, but that's not what
people are asking for. You're letting the desire for a perfect
solution (which is years late) block good solutions that exist NOW.
>> > If you want to prevent a certain class of jobs from exhausting a given
>> > resource, protecting that resource is the obvious thing to do.
>>
>> I don't follow your argument - isn't this exactly what this patch set
>> is doing - protecting resources?
>
> If you have proper protection over kernel memory consumption, this is
> completely covered because memory is the fundamental resource here.
> Controlling distribution of those fundamental resources is what
> cgroups are primarily about.
You say that's what cgroups are about, but it's not at all obvious
that you are right. What users, admins, systems people want is
building blocks that are usable and make sense. Limiting kernel
memory is NOT the logical building block, here. It's not something
people can reason about or quantify easily. if you need to implement
the interfaces in terms of memory, go nuts, but making users think
liek that is just not right.
>> > Wasn't it like a year ago? Yeah, it's taking longer than everybody
>> > hoped but seriously kmemcg reclaimer just got merged and also did the
>> > new memcg interface which will tie kmemcg and memcg together.
>>
>> By my email it was almost 2 years ago, and that was the second or
>> third incarnation of this patch.
>
> Again, I agree this is taking a while. Memory people had to retool
> the whole reclamation path to make this work, which is the pattern
> being repeated across the different controllers - we're refactoring a
> lot of infrastructure code so that resource control can integrate with
> the regular operation of the kernel, which BTW is what we should have
> been doing from the beginning.
>
> If your complaint is that this is taking too long, I hear you, and
> there's a certain amount of validity in arguing that upstreaming a
> temporary measure is the better trade-off, but the rationale for nproc
> (or nfds, or virtual memory, whatever) has been pretty weak otherwise.
At least 3 or 4 people have INDEPENDENTLY decided this is what is
causing them pain and tried to fix it and invested the time to send a
patch says that it is actually a thing. There exists a problem that
you are disallowing to be fixed. Do you recognize that users are
experiencing pain? Why do you hate your users? :)
> And as for the different incarnations of this patchset. Reposting the
> same stuff repeatedly doesn't really change anything. Why would it?
Because reasonable people might survey the ecosystem and say "humm,
things have changed over the years - isolation has become a pretty
serious topic". or maybe they hope that you'll finally agree that
fixing the problem NOW is worthwhile, even if the solution is
imperfect, and that a more perfect solution will arrive.
>> >> Something like this is long overdue, IMO, and is still more
>> >> appropriate and obvious than kmemcg anyway.
>> >
>> > Thanks for chiming in again but if you aren't bringing out anything
>> > new to the table (I don't remember you doing that last time either),
>> > I'm not sure why the decision would be different this time.
>>
>> I'm just vocalizing my support for this idea in defense of practical
>> solutions that work NOW instead of "engineering ideals" that never
>> actually arrive.
>>
>> As containers take the server world by storm, stuff like this gets
>> more and more important.
>
> Again, protection of kernel side memory consumption is important.
> There's no question about that. As for the never-arriving part, well,
> it is arriving. If you still can't believe, just take a look at the
> code.
Are you willing to put a drop-dead date on it? If we don't have
kmemcg working well enough to _actually_ bound PID usage and FD usage
by, say, June 1st, will you then accept a patch to this effect? If
the answer is no, then I have zero faith that it's coming any time
soon - I heard this 2 years ago. I believed you then.
I see further downthread that you said you'll think about it. Thank
you. Just because our use cases are not normal does not mean we're
not valid :)
Tim
On Fri, Feb 27, 2015 at 01:45:09PM -0800, Tim Hockin wrote:
> Are you willing to put a drop-dead date on it? If we don't have
> kmemcg working well enough to _actually_ bound PID usage and FD usage
> by, say, June 1st, will you then accept a patch to this effect? If
> the answer is no, then I have zero faith that it's coming any time
> soon - I heard this 2 years ago. I believed you then.
Tim, cut this bullshit. That's not how kernel development works.
Contribute to techincal discussion or shut it. I'm really getting
tired of your whining without any useful substance.
> I see further downthread that you said you'll think about it. Thank
> you. Just because our use cases are not normal does not mean we're
> not valid :)
And can you even see why that made progress?
--
tejun
> I wouldn't think that preventing PID exhaustion would be all that much of a
> niche case, it's fully possible for it to happen without using excessive
> amounts of kernel memory (think about BIG server systems with terabytes of
> memory running (arguably poorly written) forking servers that handle tens of
> thousands of client requests per second, each lasting multiple tens of
> seconds), and not necessarily as trivial as you might think to handle sanely
> (especially if you want callbacks when the limits get hit).
> As far as being trivial to achieve, I'm assuming you are referring to rlimit
> and PAM's limits module, both of which have their own issues. Using
> pam_limits.so to limit processes isn't trivial because it requires calling
> through PAM to begin with, which almost no software that isn't login related
> does, and rlimits are tricky to set up properly with the granularity that
> having a cgroup would provide.
I just want to quickly echo my support for this statement. Process IDs
aren't limited by kernel memory, they're a hard-set limit. Thus they are
a resource like other global resources (open files, etc). Now, while you
can argue that it is possible to limit the amount of *effective*
processes you can use in a cgroup through kmemcg (by limiting the amount
of memory spent in storing task_struct data) -- that isn't limiting the
usage of the *actual* resource (the fact you're limiting the number of
PIDs is little more than a by-product).
Also, If it wasn't an actual resource then why is RLIMIT_NPROC a thing?
To me, that indicates that PID limiting not an esoteric usecase and it
should be possible to use the Linux kernel's home-grown accounting
system to limit the number of PIDs in a cgroup. Otherwise you're stuck
in a weird world where you *can* limit the number of processes in a
process tree but *not* the number of processes in a cgroup.
>> In general, I'm pretty strongly against adding controllers for things
>> which aren't fundamental resources in the system. What's next? Open
>> files? Pipe buffer? Number of flocks? Number of session leaders or
>> program groups?
>>
> PID's are a fundamental resource, you run out and it's an only marginally
> better situation than OOM, namely, if you don't already have a shell open
> which has kill builtin (because you can't fork), or have some other reliable
> way to terminate processes without forking, you are stuck either waiting for
> the problem to resolve itself, or have to reset the system.
I couldn't agree more. PIDs are a fundamental resource because there is
a hard limit on the amount of PIDs you can have in any one system. Once
you've exhausted that limit, there's not much you can do apart from
doing the SYSRQ dance.
--
Aleksa Sarai (cyphar)
http://www.cyphar.com
Hello, Aleksa.
On Sat, Feb 28, 2015 at 08:26:34PM +1100, Aleksa Sarai wrote:
> I just want to quickly echo my support for this statement. Process IDs
> aren't limited by kernel memory, they're a hard-set limit. Thus they are
Process IDs become a hard global resource because we didn't switch to
long during 64bit transition and put an artifical global limit on it,
which allows it to affect system-wide operation while its memory
consumption is staying within practical range.
> a resource like other global resources (open files, etc). Now, while you
Unlike open files.
> can argue that it is possible to limit the amount of *effective*
> processes you can use in a cgroup through kmemcg (by limiting the amount
> of memory spent in storing task_struct data) -- that isn't limiting the
> usage of the *actual* resource (the fact you're limiting the number of
> PIDs is little more than a by-product).
No, the problem is not that. The problem is that pid_t is, as a
resource, is decoupled from its backing resource - memory - by the
extra artificial and difficult-to-overcome limit put on it. You are
saying something which is completely different from what Austin was
arguing.
> Also, If it wasn't an actual resource then why is RLIMIT_NPROC a thing?
One strong reason would be because we didn't have a way to account for
and limit the fundamental resources. If you can fully contain and
control the consumption via rationing the underlying resource, there
isn't much point in controlling the upper layer constructs.
> To me, that indicates that PID limiting not an esoteric usecase and it
> should be possible to use the Linux kernel's home-grown accounting
> system to limit the number of PIDs in a cgroup. Otherwise you're stuck
Again, I think it's a lot more indicative of the fact that we didn't
have any way to control kernel side memory consumption and pids and
open files were one of the things which are relatively easy to
implement policy-wise.
> in a weird world where you *can* limit the number of processes in a
> process tree but *not* the number of processes in a cgroup.
I'm not sold on the idea of replicating the features of ulimit in
cgroups. ulimit is a mixed bag of relatively easily implementable
resource limits and their behaviors are a combination of resource
limits, per-user usage policies, and per-process behavior safetynets.
The only part translatable to cgroups is actual resource related part
and even among those we should identify what are actual resources
which can't be mapped to consumption of other fundamental resources.
> >> In general, I'm pretty strongly against adding controllers for things
> >> which aren't fundamental resources in the system. What's next? Open
> >> files? Pipe buffer? Number of flocks? Number of session leaders or
> >> program groups?
> >>
> > PID's are a fundamental resource, you run out and it's an only marginally
> > better situation than OOM, namely, if you don't already have a shell open
> > which has kill builtin (because you can't fork), or have some other reliable
> > way to terminate processes without forking, you are stuck either waiting for
> > the problem to resolve itself, or have to reset the system.
>
> I couldn't agree more. PIDs are a fundamental resource because there is
> a hard limit on the amount of PIDs you can have in any one system. Once
> you've exhausted that limit, there's not much you can do apart from
> doing the SYSRQ dance.
The reason why this holds is because we can hit the global limit way
earlier than a practically sized kmem consumption limits can kick in.
Thanks.
--
tejun
Hello, Tim.
On Sat, Feb 28, 2015 at 08:38:07AM -0800, Tim Hockin wrote:
> I know there is not much concern for legacy-system problems, but it is
> worth adding this case - there are systems that limit PIDs for other
> reasons, eg broken infrastructure that assumes PIDs fit in a short int,
> hypothetically. Given such a system, PIDs become precious and limiting
> them per job is important.
>
> My main point being that there are less obvious considerations in play than
> just memory usage.
Sure, there are those cases but it'd be unwise to hinge long term
decisions on them. It's hard to even argue 16bit pid in legacy code
as a significant contributing factor at this point. At any rate, it
seems that pid is a global resource which needs to be provisioned for
reasonable isolation which is a good reason to consider controlling it
via cgroups.
Thanks.
--
tejun
On Sat, Feb 28, 2015 at 08:48:12AM -0800, Tim Hockin wrote:
> I am sorry that real-user problems are not perceived as substantial. This
> was/is a real issue for us. Being in limbo for years on end might not be a
> technical point, but I do think it matters, and that was my point.
It's a problem which is localized to you and caused by the specific
problems of your setup. This isn't a wide-spread problem at all and
the world doesn't revolve around you. If your setup is so messed up
as to require sticking to 16bit pids, handle that locally. If
something at larger scale eases that handling, you get lucky. If not,
it's *your* predicament to deal with. The rest of the world doesn't
exist to wipe your ass.
--
tejun
On Sat, Feb 28, 2015 at 8:57 AM, Tejun Heo <[email protected]> wrote:
>
> On Sat, Feb 28, 2015 at 08:48:12AM -0800, Tim Hockin wrote:
> > I am sorry that real-user problems are not perceived as substantial. This
> > was/is a real issue for us. Being in limbo for years on end might not be a
> > technical point, but I do think it matters, and that was my point.
>
> It's a problem which is localized to you and caused by the specific
> problems of your setup. This isn't a wide-spread problem at all and
> the world doesn't revolve around you. If your setup is so messed up
> as to require sticking to 16bit pids, handle that locally. If
> something at larger scale eases that handling, you get lucky. If not,
> it's *your* predicament to deal with. The rest of the world doesn't
> exist to wipe your ass.
Wow, so much anger. I'm not even sure how to respond, so I'll just
say this and sign off. All I want is a better, friendlier, more
useful system overall. We clearly have different ways of looking at
the problem.
No antagonism intended
Tim
On Sat, Feb 28, 2015 at 02:26:58PM -0800, Tim Hockin wrote:
> Wow, so much anger. I'm not even sure how to respond, so I'll just
> say this and sign off. All I want is a better, friendlier, more
> useful system overall. We clearly have different ways of looking at
> the problem.
Can you communicate anything w/o passive aggression? If you have a
technical point, just state that. Can you at least agree that we
shouldn't be making design decisions based on 16bit pid_t?
--
tejun
On Sat, Feb 28, 2015 at 02:26:58PM -0800, Tim Hockin wrote:
> On Sat, Feb 28, 2015 at 8:57 AM, Tejun Heo <[email protected]> wrote:
> >
> > On Sat, Feb 28, 2015 at 08:48:12AM -0800, Tim Hockin wrote:
> > > I am sorry that real-user problems are not perceived as substantial. This
> > > was/is a real issue for us. Being in limbo for years on end might not be a
> > > technical point, but I do think it matters, and that was my point.
> >
> > It's a problem which is localized to you and caused by the specific
> > problems of your setup. This isn't a wide-spread problem at all and
> > the world doesn't revolve around you. If your setup is so messed up
> > as to require sticking to 16bit pids, handle that locally. If
> > something at larger scale eases that handling, you get lucky. If not,
> > it's *your* predicament to deal with. The rest of the world doesn't
> > exist to wipe your ass.
>
> Wow, so much anger.
Yeah, quite surprising after such an intellectually honest discussion:
: On Fri, Feb 27, 2015 at 01:45:09PM -0800, Tim Hockin wrote:
: > At least 3 or 4 people have INDEPENDENTLY decided this is what is
: > causing them pain and tried to fix it and invested the time to send a
: > patch says that it is actually a thing. There exists a problem that
: > you are disallowing to be fixed. Do you recognize that users are
: > experiencing pain? Why do you hate your users? :)
[...]
: > Are you willing to put a drop-dead date on it? If we don't have
: > kmemcg working well enough to _actually_ bound PID usage and FD usage
: > by, say, June 1st, will you then accept a patch to this effect? If
: > the answer is no, then I have zero faith that it's coming any time
: > soon - I heard this 2 years ago. I believed you then.
> I'm not even sure how to respond, so I'll just say this and sign
> off. All I want is a better, friendlier, more useful system
> overall. We clearly have different ways of looking at the problem.
Overlapping features and inconsistent userspace interfaces are only
better for the people that pick the hacks. They are the opposite of
friendly and useful. They are also horrible to maintain, which could
be a reason why you constantly disagree with the people that cleaned
up this unholy mess and are now trying to keep a balance between your
short term interests and the long-term health of the Linux kernel.
On Feb 28, 2015 2:50 PM, "Tejun Heo" <[email protected]> wrote:
>
> On Sat, Feb 28, 2015 at 02:26:58PM -0800, Tim Hockin wrote:
> > Wow, so much anger. I'm not even sure how to respond, so I'll just
> > say this and sign off. All I want is a better, friendlier, more
> > useful system overall. We clearly have different ways of looking at
> > the problem.
>
> Can you communicate anything w/o passive aggression? If you have a
> technical point, just state that. Can you at least agree that we
> shouldn't be making design decisions based on 16bit pid_t?
Hmm, I have screwed this thread up, I think. I've made some remarks
that did not come through with the proper tongue-in-cheek slant. I'm
not being passive aggressive - we DO look at this problem differently.
OF COURSE we should not make decisions based on ancient artifacts of
history. My point was that there are secondary considerations here -
PIDs are more than just the memory that backs them. They _ARE_ a
constrained resource, and you shouldn't assume the constraint is just
physical memory. It is a piece of policy that is outside the control
of the kernel proper - we handed those keys to userspace along time
ago.
Given that, I believe and have believed that the solution should model
the problem as the user perceives it - limiting PIDs - rather than
attaching to a solution-by-proxy.
Yes a solution here partially overlaps with kmemcg, but I don't think
that is a significant problem. They are different policies governing
behavior that may result in the same condition, but for very different
reasons. I do not think that is particularly bad for overall
comprehension, and I think the fact that this popped up yet again
indicates the existence of some nugget of user experience that is
worth paying consideration to.
I appreciate your promised consideration through a slightly refocused
lens. I will go back to my cave and do something I hope is more
productive and less antagonistic. I did not mean to bring out so much
vitriol.
Tim