This is an updated version of v9 of the pids patchset[1], with a few
small fixes:
* Fixed an off-by-one error in the SUBSYS_TAG macro that affects
intermediate __unused_* values (causing issues when merging tags into
the subsys enum)
* Updated the SUBSYS_TAG macro to be simpler (removed UNUSED_IDENT()
magic) and not use __COUNTER__.
* Removed the SUBSYS_TAG_COUNT macro and replaced it with a hard-coded
CGROUP_PREFORK_COUNT macro.
[1]: https://lkml.org/lkml/2015/4/11/237
Aleksa Sarai (4):
cgroups: use bitmask to filter for_each_subsys
cgroups: replace explicit ss_mask checking with for_each_subsys_which
cgroups: allow a cgroup subsystem to reject a fork
cgroups: implement the PIDs subsystem
include/linux/cgroup.h | 47 ++++--
include/linux/cgroup_subsys.h | 32 ++++
init/Kconfig | 16 ++
kernel/Makefile | 1 +
kernel/cgroup.c | 183 +++++++++++++++------
kernel/cgroup_freezer.c | 2 +-
kernel/cgroup_pids.c | 368 ++++++++++++++++++++++++++++++++++++++++++
kernel/fork.c | 19 ++-
kernel/sched/core.c | 2 +-
9 files changed, 606 insertions(+), 64 deletions(-)
create mode 100644 kernel/cgroup_pids.c
--
2.3.5
Add a new macro for_each_subsys_which that allows all enabled cgroup
subsystems to be filtered by a bitmask, such that mask & (1 << ssid)
determines if the subsystem is to be processed in the loop body (where
ssid is the unique id of the subsystem).
Also replace the need_forkexit_callback with two separate bitmasks for
each callback to make (ss->{fork,exit}) checks unnecessary.
Signed-off-by: Aleksa Sarai <[email protected]>
---
kernel/cgroup.c | 43 ++++++++++++++++++++++++++-----------------
1 file changed, 26 insertions(+), 17 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 29a7b2c..1d69a085 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -175,12 +175,14 @@ static DEFINE_IDR(cgroup_hierarchy_idr);
*/
static u64 css_serial_nr_next = 1;
-/* This flag indicates whether tasks in the fork and exit paths should
+/*
+ * These bitmask flags indicate whether tasks in the fork and exit paths should
* check for fork/exit handlers to call. This avoids us having to do
* extra work in the fork/exit path if none of the subsystems need to
* be called.
*/
-static int need_forkexit_callback __read_mostly;
+static int need_fork_callback __read_mostly;
+static int need_exit_callback __read_mostly;
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
@@ -409,6 +411,19 @@ static int notify_on_release(const struct cgroup *cgrp)
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
(((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
+/**
+ * for_each_subsys_which - filter for_each_subsys with a bitmask
+ * @ss_mask: the bitmask
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ *
+ * The block will only run for cases where the ssid-th bit (1 << ssid) of
+ * mask is set to 1.
+ */
+#define for_each_subsys_which(ss_mask, ss, ssid) \
+ for_each_subsys((ss), (ssid)) \
+ if ((ss_mask) & (1 << (ssid)))
+
/* iterate across the hierarchies */
#define for_each_root(root) \
list_for_each_entry((root), &cgroup_roots, root_list)
@@ -4932,7 +4947,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
* init_css_set is in the subsystem's root cgroup. */
init_css_set.subsys[ss->id] = css;
- need_forkexit_callback |= ss->fork || ss->exit;
+ need_fork_callback |= (bool) ss->fork << ss->id;
+ need_exit_callback |= (bool) ss->exit << ss->id;
/* At system boot, before all subsystems have been
* registered, no tasks have been forked, so we don't
@@ -5239,11 +5255,8 @@ void cgroup_post_fork(struct task_struct *child)
* css_set; otherwise, @child might change state between ->fork()
* and addition to css_set.
*/
- if (need_forkexit_callback) {
- for_each_subsys(ss, i)
- if (ss->fork)
- ss->fork(child);
- }
+ for_each_subsys_which(need_fork_callback, ss, i)
+ ss->fork(child);
}
/**
@@ -5287,16 +5300,12 @@ void cgroup_exit(struct task_struct *tsk)
cset = task_css_set(tsk);
RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
- if (need_forkexit_callback) {
- /* see cgroup_post_fork() for details */
- for_each_subsys(ss, i) {
- if (ss->exit) {
- struct cgroup_subsys_state *old_css = cset->subsys[i];
- struct cgroup_subsys_state *css = task_css(tsk, i);
+ /* see cgroup_post_fork() for details */
+ for_each_subsys_which(need_exit_callback, ss, i) {
+ struct cgroup_subsys_state *old_css = cset->subsys[i];
+ struct cgroup_subsys_state *css = task_css(tsk, i);
- ss->exit(css, old_css, tsk);
- }
- }
+ ss->exit(css, old_css, tsk);
}
if (put_cset)
--
2.3.5
Replace the explicit checking against ss_masks inside a for_each_subsys
block with for_each_subsys_which(ss_mask, ...), to take advantage of the
more readable macro.
Signed-off-by: Aleksa Sarai <[email protected]>
---
kernel/cgroup.c | 42 ++++++++++++++----------------------------
1 file changed, 14 insertions(+), 28 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1d69a085..abd491f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1099,9 +1099,8 @@ static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
while (true) {
unsigned int new_ss_mask = cur_ss_mask;
- for_each_subsys(ss, ssid)
- if (cur_ss_mask & (1 << ssid))
- new_ss_mask |= ss->depends_on;
+ for_each_subsys_which(cur_ss_mask, ss, ssid)
+ new_ss_mask |= ss->depends_on;
/*
* Mask out subsystems which aren't available. This can
@@ -1238,10 +1237,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
lockdep_assert_held(&cgroup_mutex);
- for_each_subsys(ss, ssid) {
- if (!(ss_mask & (1 << ssid)))
- continue;
-
+ for_each_subsys_which(ss_mask, ss, ssid) {
/* if @ss has non-root csses attached to it, can't move */
if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
return -EBUSY;
@@ -1278,18 +1274,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
* Nothing can fail from this point on. Remove files for the
* removed subsystems and rebind each subsystem.
*/
- for_each_subsys(ss, ssid)
- if (ss_mask & (1 << ssid))
- cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
+ for_each_subsys_which(ss_mask, ss, ssid)
+ cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
- for_each_subsys(ss, ssid) {
+ for_each_subsys_which(ss_mask, ss, ssid) {
struct cgroup_root *src_root;
struct cgroup_subsys_state *css;
struct css_set *cset;
- if (!(ss_mask & (1 << ssid)))
- continue;
-
src_root = ss->root;
css = cgroup_css(&src_root->cgrp, ss);
@@ -2563,13 +2555,11 @@ static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
bool printed = false;
int ssid;
- for_each_subsys(ss, ssid) {
- if (ss_mask & (1 << ssid)) {
- if (printed)
- seq_putc(seq, ' ');
- seq_printf(seq, "%s", ss->name);
- printed = true;
- }
+ for_each_subsys_which(ss_mask, ss, ssid) {
+ if (printed)
+ seq_putc(seq, ' ');
+ seq_printf(seq, "%s", ss->name);
+ printed = true;
}
if (printed)
seq_putc(seq, '\n');
@@ -2719,9 +2709,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
while ((tok = strsep(&buf, " "))) {
if (tok[0] == '\0')
continue;
- for_each_subsys(ss, ssid) {
- if (ss->disabled || strcmp(tok + 1, ss->name) ||
- ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
+ for_each_subsys_which(~cgrp_dfl_root_inhibit_ss_mask, ss, ssid) {
+ if (ss->disabled || strcmp(tok + 1, ss->name))
continue;
if (*tok == '+') {
@@ -2808,10 +2797,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
* still around. In such cases, wait till it's gone using
* offline_waitq.
*/
- for_each_subsys(ss, ssid) {
- if (!(css_enable & (1 << ssid)))
- continue;
-
+ for_each_subsys_which(css_enable, ss, ssid) {
cgroup_for_each_live_child(child, cgrp) {
DEFINE_WAIT(wait);
--
2.3.5
Add a new cgroup subsystem callback can_fork that conditionally
states whether or not the fork is accepted or rejected by a cgroup
policy. In addition, add a cancel_fork callback so that if an error
occurs later in the forking process, any state modified by can_fork can
be reverted.
Allow for a private opaque pointer to be passed from the cgroup_can_fork
to cgroup_post_fork, allowing for the fork state to be stored by each
subsystem separately.
In order for a subsystem to know that a task associated with a cgroup
hierarchy is being migrated to another hierarchy, add a detach callback
to the subsystem which is run after the migration has been confirmed but
before the old_cset's refcount is dropped. This is necessary in order
for a subsystem to be able to keep a proper count of how many tasks are
associated with that subsystem.
Also add a tagging system for cgroup_subsys.h to allow for CGROUP_<TAG>
enumerations to be be defined and used (as well as CGROUP_<TAG>_COUNT).
This is in preparation for implementing the pids cgroup subsystem.
Signed-off-by: Aleksa Sarai <[email protected]>
---
include/linux/cgroup.h | 47 +++++++++++++-----
include/linux/cgroup_subsys.h | 27 +++++++++++
kernel/cgroup.c | 108 ++++++++++++++++++++++++++++++++++++++----
kernel/cgroup_freezer.c | 2 +-
kernel/fork.c | 19 +++++++-
kernel/sched/core.c | 2 +-
6 files changed, 181 insertions(+), 24 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b9cb94c..fbdbe80 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -25,6 +25,19 @@
#ifdef CONFIG_CGROUPS
+/* define the enumeration of all cgroup subsystems */
+enum cgroup_subsys_id {
+#define SUBSYS(_x) _x ## _cgrp_id,
+#define SUBSYS_TAG(_t) CGROUP_ ## _t, \
+ __unused_tag_ ## _t = CGROUP_ ## _t - 1,
+#include <linux/cgroup_subsys.h>
+#undef SUBSYS_TAG
+#undef SUBSYS
+ CGROUP_SUBSYS_COUNT,
+};
+
+#define CGROUP_PREFORK_COUNT (CGROUP_PREFORK_END - CGROUP_PREFORK_START)
+
struct cgroup_root;
struct cgroup_subsys;
struct cgroup;
@@ -32,7 +45,12 @@ struct cgroup;
extern int cgroup_init_early(void);
extern int cgroup_init(void);
extern void cgroup_fork(struct task_struct *p);
-extern void cgroup_post_fork(struct task_struct *p);
+extern int cgroup_can_fork(struct task_struct *p,
+ void *ss_state[CGROUP_PREFORK_COUNT]);
+extern void cgroup_cancel_fork(struct task_struct *p,
+ void *ss_state[CGROUP_PREFORK_COUNT]);
+extern void cgroup_post_fork(struct task_struct *p,
+ void *old_ss_state[CGROUP_PREFORK_COUNT]);
extern void cgroup_exit(struct task_struct *p);
extern int cgroupstats_build(struct cgroupstats *stats,
struct dentry *dentry);
@@ -40,14 +58,6 @@ extern int cgroupstats_build(struct cgroupstats *stats,
extern int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *tsk);
-/* define the enumeration of all cgroup subsystems */
-#define SUBSYS(_x) _x ## _cgrp_id,
-enum cgroup_subsys_id {
-#include <linux/cgroup_subsys.h>
- CGROUP_SUBSYS_COUNT,
-};
-#undef SUBSYS
-
/*
* Per-subsystem/per-cgroup state maintained by the system. This is the
* fundamental structural building block that controllers deal with.
@@ -649,7 +659,11 @@ struct cgroup_subsys {
struct cgroup_taskset *tset);
void (*attach)(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset);
- void (*fork)(struct task_struct *task);
+ void (*detach)(struct cgroup_subsys_state *old_css,
+ struct task_struct *task);
+ int (*can_fork)(struct task_struct *task, void **private);
+ void (*cancel_fork)(struct task_struct *task, void *private);
+ void (*fork)(struct task_struct *task, void *private);
void (*exit)(struct cgroup_subsys_state *css,
struct cgroup_subsys_state *old_css,
struct task_struct *task);
@@ -945,10 +959,21 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
struct cgroup_subsys_state;
+#define CGROUP_PREFORK_COUNT 0
+
static inline int cgroup_init_early(void) { return 0; }
static inline int cgroup_init(void) { return 0; }
static inline void cgroup_fork(struct task_struct *p) {}
-static inline void cgroup_post_fork(struct task_struct *p) {}
+static inline int cgroup_can_fork(struct task_struct *p,
+ void *s[CGROUP_PREFORK_COUNT])
+{
+ return 0;
+}
+static inline void cgroup_cancel_fork(struct task_struct *p,
+ void *s[CGROUP_PREFORK_COUNT]) {}
+static inline void cgroup_post_fork(struct task_struct *p,
+ void *s[CGROUP_PREFORK_COUNT]) {}
+
static inline void cgroup_exit(struct task_struct *p) {}
static inline int cgroupstats_build(struct cgroupstats *stats,
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index e4a96fb..fdd3551 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -3,6 +3,16 @@
*
* DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
*/
+#ifndef SUBSYS
+# define __TMP_SUBSYS
+# define SUBSYS(_x)
+#endif
+
+#ifndef SUBSYS_TAG
+# define __TMP_SUBSYS_TAG
+# define SUBSYS_TAG(_t)
+#endif
+
#if IS_ENABLED(CONFIG_CPUSETS)
SUBSYS(cpuset)
#endif
@@ -48,11 +58,28 @@ SUBSYS(hugetlb)
#endif
/*
+ * Subsystems that implement the can_fork() family of callbacks.
+ */
+SUBSYS_TAG(PREFORK_START)
+SUBSYS_TAG(PREFORK_END)
+
+/*
* The following subsystems are not supported on the default hierarchy.
*/
#if IS_ENABLED(CONFIG_CGROUP_DEBUG)
SUBSYS(debug)
#endif
+
+#ifdef __TMP_SUBSYS
+# undef __TMP_SUBSYS
+# undef SUBSYS
+#endif
+
+#ifdef __TMP_SUBSYS_TAG
+# undef __TMP_SUBSYS_TAG
+# undef SUBSYS_TAG
+#endif
+
/*
* DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
*/
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index abd491f..122a823 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -176,14 +176,18 @@ static DEFINE_IDR(cgroup_hierarchy_idr);
static u64 css_serial_nr_next = 1;
/*
- * These bitmask flags indicate whether tasks in the fork and exit paths should
- * check for fork/exit handlers to call. This avoids us having to do
- * extra work in the fork/exit path if none of the subsystems need to
- * be called.
+ * These bitmask flags indicate whether tasks in the fork and exit paths
+ * should check for fork/exit handlers to call. This avoids us having to do
+ * extra work in the fork/exit path if a subsystems doesn't need to be
+ * called.
*/
static int need_fork_callback __read_mostly;
static int need_exit_callback __read_mostly;
+/* Ditto for the can_fork/cancel_fork/reapply_fork callbacks. */
+static int need_canfork_callback __read_mostly;
+static int need_cancelfork_callback __read_mostly;
+
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
@@ -412,7 +416,7 @@ static int notify_on_release(const struct cgroup *cgrp)
(((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
/**
- * for_each_subsys_which - filter for_each_subsys with a bitmask
+ * for_each_subsys_which - filter for_each_subsys with a subsys bitmask
* @ss_mask: the bitmask
* @ss: the iteration cursor
* @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
@@ -2054,6 +2058,8 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
struct css_set *new_cset)
{
struct css_set *old_cset;
+ struct cgroup_subsys_state *css;
+ int i;
lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_rwsem);
@@ -2078,6 +2084,18 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
/*
+ * We detach from the old_cset subsystems here. We must do this
+ * before we drop the refcount for old_cset, in order to make sure
+ * that nobody frees it underneath us.
+ */
+ for_each_e_css(css, i, old_cgrp) {
+ struct cgroup_subsys_state *old_css = old_cset->subsys[i];
+
+ if (old_css->ss->detach)
+ old_css->ss->detach(old_css, tsk);
+ }
+
+ /*
* We just gained a reference on old_cset by taking it from the
* task. As trading it for new_cset is protected by cgroup_mutex,
* we're safe to drop it here; it will be freed under RCU.
@@ -2321,9 +2339,10 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
*/
tset.csets = &tset.dst_csets;
- for_each_e_css(css, i, cgrp)
+ for_each_e_css(css, i, cgrp) {
if (css->ss->attach)
css->ss->attach(css, &tset);
+ }
ret = 0;
goto out_release_tset;
@@ -4935,6 +4954,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
need_fork_callback |= (bool) ss->fork << ss->id;
need_exit_callback |= (bool) ss->exit << ss->id;
+ need_canfork_callback |= (bool) ss->can_fork << ss->id;
+ need_cancelfork_callback |= (bool) ss->cancel_fork << ss->id;
/* At system boot, before all subsystems have been
* registered, no tasks have been forked, so we don't
@@ -5188,6 +5209,68 @@ void cgroup_fork(struct task_struct *child)
}
/**
+ * cgroup_can_fork - called on a new task before the process is exposed.
+ * @child: the task in question.
+ *
+ * This calls the subsystem can_fork() callbacks. If the can_fork() callback
+ * returns an error, the fork aborts with that error code. This allows for
+ * a cgroup subsystem to conditionally allow or deny new forks.
+ */
+int cgroup_can_fork(struct task_struct *child,
+ void *ss_state[CGROUP_PREFORK_COUNT])
+{
+ struct cgroup_subsys *ss;
+ int i, j, retval;
+
+ for_each_subsys_which(need_canfork_callback, ss, i) {
+ retval = ss->can_fork(child,
+ &ss_state[i - CGROUP_PREFORK_START]);
+ if (retval)
+ goto out_revert;
+ }
+
+ return 0;
+
+out_revert:
+ for_each_subsys_which(need_cancelfork_callback, ss, j) {
+ void *state = NULL;
+
+ if (j >= i)
+ break;
+
+ if (CGROUP_PREFORK_START <= j && j < CGROUP_PREFORK_END)
+ state = ss_state[j - CGROUP_PREFORK_START];
+
+ ss->cancel_fork(child, state);
+ }
+
+ return retval;
+}
+
+/**
+ * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
+ * @child: the task in question
+ *
+ * This calls the cancel_fork() callbacks if a fork failed *after*
+ * cgroup_can_fork() succeded.
+ */
+void cgroup_cancel_fork(struct task_struct *child,
+ void *ss_state[CGROUP_PREFORK_COUNT])
+{
+ struct cgroup_subsys *ss;
+ int i;
+
+ for_each_subsys_which(need_cancelfork_callback, ss, i) {
+ void *state = NULL;
+
+ if (CGROUP_PREFORK_START <= i && i < CGROUP_PREFORK_END)
+ state = ss_state[i - CGROUP_PREFORK_START];
+
+ ss->cancel_fork(child, state);
+ }
+}
+
+/**
* cgroup_post_fork - called on a new task after adding it to the task list
* @child: the task in question
*
@@ -5197,7 +5280,8 @@ void cgroup_fork(struct task_struct *child)
* cgroup_task_iter_start() - to guarantee that the new task ends up on its
* list.
*/
-void cgroup_post_fork(struct task_struct *child)
+void cgroup_post_fork(struct task_struct *child,
+ void *old_ss_state[CGROUP_PREFORK_COUNT])
{
struct cgroup_subsys *ss;
int i;
@@ -5241,8 +5325,14 @@ void cgroup_post_fork(struct task_struct *child)
* css_set; otherwise, @child might change state between ->fork()
* and addition to css_set.
*/
- for_each_subsys_which(need_fork_callback, ss, i)
- ss->fork(child);
+ for_each_subsys_which(need_fork_callback, ss, i) {
+ void *state = NULL;
+
+ if (CGROUP_PREFORK_START <= i && i < CGROUP_PREFORK_END)
+ state = old_ss_state[i - CGROUP_PREFORK_START];
+
+ ss->fork(child, state);
+ }
}
/**
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 92b98cc..f1b30ad 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -203,7 +203,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
* to do anything as freezer_attach() will put @task into the appropriate
* state.
*/
-static void freezer_fork(struct task_struct *task)
+static void freezer_fork(struct task_struct *task, void *private)
{
struct freezer *freezer;
diff --git a/kernel/fork.c b/kernel/fork.c
index cf65139..8281370 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1196,6 +1196,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
{
int retval;
struct task_struct *p;
+ void *ss_state[CGROUP_PREFORK_COUNT] = {};
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -1468,6 +1469,18 @@ static struct task_struct *copy_process(unsigned long clone_flags,
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
+
+ /*
+ * Ensure that the cgroup subsystem policies allow the new process to be
+ * forked. If this fork is happening in an organization operation, then
+ * this will not charge the correct css_set. This is fixed during
+ * cgroup_post_fork() (when the css_set has been updated) by undoing
+ * this operation and forcefully charging the correct css_set.
+ */
+ retval = cgroup_can_fork(p, ss_state);
+ if (retval)
+ goto bad_fork_free_pid;
+
/*
* Make it visible to the rest of the system, but dont wake it up yet.
* Need tasklist lock for parent etc handling!
@@ -1504,7 +1517,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
spin_unlock(¤t->sighand->siglock);
write_unlock_irq(&tasklist_lock);
retval = -ERESTARTNOINTR;
- goto bad_fork_free_pid;
+ goto bad_fork_cancel_cgroup;
}
if (likely(p->pid)) {
@@ -1546,7 +1559,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
- cgroup_post_fork(p);
+ cgroup_post_fork(p, ss_state);
if (clone_flags & CLONE_THREAD)
threadgroup_change_end(current);
perf_event_fork(p);
@@ -1556,6 +1569,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
return p;
+bad_fork_cancel_cgroup:
+ cgroup_cancel_fork(p, ss_state);
bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 62671f5..205b1cf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7988,7 +7988,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
sched_offline_group(tg);
}
-static void cpu_cgroup_fork(struct task_struct *task)
+static void cpu_cgroup_fork(struct task_struct *task, void *private)
{
sched_move_task(task);
}
--
2.3.5
Adds a new single-purpose PIDs subsystem to limit the number of
tasks that can be forked inside a cgroup. Essentially this is an
implementation of RLIMIT_NPROC that applies to a cgroup rather than a
process tree.
However, it should be noted that organisational operations (adding and
removing tasks from a PIDs hierarchy) will *not* be prevented. Rather,
the number of tasks in the hierarchy cannot exceed the limit through
forking. This is due to the fact that, in the unified hierarchy, attach
cannot fail (and it is not possible for a task to overcome its PIDs
cgroup policy limit by attaching to a child cgroup).
PIDs are fundamentally a global resource, and it is possible to reach
PID exhaustion inside a cgroup without hitting any reasonable kmemcg
policy. Once you've hit PID exhaustion, you're only in a marginally
better state than OOM. This subsystem allows PID exhaustion inside a
cgroup to be prevented.
Signed-off-by: Aleksa Sarai <[email protected]>
---
include/linux/cgroup_subsys.h | 5 +
init/Kconfig | 16 ++
kernel/Makefile | 1 +
kernel/cgroup_pids.c | 368 ++++++++++++++++++++++++++++++++++++++++++
4 files changed, 390 insertions(+)
create mode 100644 kernel/cgroup_pids.c
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index fdd3551..fc61bc6 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -61,6 +61,11 @@ SUBSYS(hugetlb)
* Subsystems that implement the can_fork() family of callbacks.
*/
SUBSYS_TAG(PREFORK_START)
+
+#if IS_ENABLED(CONFIG_CGROUP_PIDS)
+SUBSYS(pids)
+#endif
+
SUBSYS_TAG(PREFORK_END)
/*
diff --git a/init/Kconfig b/init/Kconfig
index f5dbc6d..1f135b7 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -952,6 +952,22 @@ config CGROUP_FREEZER
Provides a way to freeze and unfreeze all tasks in a
cgroup.
+config CGROUP_PIDS
+ bool "PIDs cgroup subsystem"
+ help
+ Provides enforcement of process number limits in the scope of a
+ cgroup. Any attempt to fork more processes than is allowed in the
+ cgroup will fail. PIDs are fundamentally a global resource because it
+ is fairly trivial to reach PID exhaustion before you reach even a
+ conservative kmemcg limit. As a result, it is possible to grind a
+ system to halt without being limited by other cgroup policies. The
+ PIDs cgroup subsystem is designed to stop this from happening.
+
+ It should be noted that organisational operations (such as attaching
+ to a cgroup hierarchy will *not* be blocked by the PIDs subsystem),
+ since the PIDs limit only affects a process's ability to fork, not to
+ attach to a cgroup.
+
config CGROUP_DEVICE
bool "Device controller for cgroups"
help
diff --git a/kernel/Makefile b/kernel/Makefile
index 1408b33..e823592 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -53,6 +53,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
+obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
new file mode 100644
index 0000000..c1c89f2
--- /dev/null
+++ b/kernel/cgroup_pids.c
@@ -0,0 +1,368 @@
+/*
+ * Process number limiting controller for cgroups.
+ *
+ * Used to allow a cgroup hierarchy to stop any new processes
+ * from fork()ing after a certain limit is reached.
+ *
+ * Since it is trivial to hit the task limit without hitting
+ * any kmemcg limits in place, PIDs are a fundamental resource.
+ * As such, PID exhaustion must be preventable in the scope of
+ * a cgroup hierarchy by allowing resource limiting of the
+ * number of tasks in a cgroup.
+ *
+ * In order to use the `pids` controller, set the maximum number
+ * of tasks in pids.max (this is not available in the root cgroup
+ * for obvious reasons). The number of processes currently
+ * in the cgroup is given by pids.current. Organisational operations
+ * are not blocked by cgroup policies, so it is possible to have
+ * pids.current > pids.max. However, fork()s will still not work.
+ *
+ * To set a cgroup to have no limit, set pids.max to "max". fork()
+ * will return -EBUSY if forking would cause a cgroup policy to be
+ * violated.
+ *
+ * pids.current tracks all child cgroup hierarchies, so
+ * parent/pids.current is a superset of parent/child/pids.current.
+ *
+ * Copyright (C) 2015 Aleksa Sarai <[email protected]>
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/threads.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+
+#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
+#define PIDS_MAX_STR "max"
+
+struct pids_cgroup {
+ struct cgroup_subsys_state css;
+
+ /*
+ * Use 64-bit types so that we can safely represent "max" as
+ * (PID_MAX_LIMIT + 1).
+ */
+ atomic64_t counter;
+ int64_t limit;
+};
+
+static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
+{
+ return container_of(css, struct pids_cgroup, css);
+}
+
+static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
+{
+ return css_pids(pids->css.parent);
+}
+
+static struct cgroup_subsys_state *
+pids_css_alloc(struct cgroup_subsys_state *parent)
+{
+ struct pids_cgroup *pids;
+
+ pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
+ if (!pids)
+ return ERR_PTR(-ENOMEM);
+
+ pids->limit = PIDS_MAX;
+ atomic64_set(&pids->counter, 0);
+ return &pids->css;
+}
+
+static void pids_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_pids(css));
+}
+
+/**
+ * pids_cancel - uncharge the local pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to cancel
+ *
+ * This function will WARN if the pid count goes under 0,
+ * because such a case is a bug in the pids controller proper.
+ */
+static void pids_cancel(struct pids_cgroup *pids, int num)
+{
+ /*
+ * A negative count (or overflow for that matter) is invalid,
+ * and indicates a bug in the pids controller proper.
+ */
+ WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
+}
+
+/**
+ * pids_uncharge - hierarchically uncharge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to uncharge
+ */
+static void pids_uncharge(struct pids_cgroup *pids, int num)
+{
+ struct pids_cgroup *p;
+
+ for (p = pids; p; p = parent_pids(p))
+ pids_cancel(p, num);
+}
+
+/**
+ * pids_charge - hierarchically charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * This function does *not* follow the pid limit set. It cannot
+ * fail and the new pid count may exceed the limit, because
+ * organisational operations cannot fail in the unified hierarchy.
+ */
+static void pids_charge(struct pids_cgroup *pids, int num)
+{
+ struct pids_cgroup *p;
+
+ for (p = pids; p; p = parent_pids(p))
+ atomic64_add(num, &p->counter);
+}
+
+/**
+ * pids_try_charge - hierarchically try to charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * This function follows the set limit. It will fail if the charge
+ * would cause the new value to exceed the hierarchical limit.
+ * Returns 0 if the charge succeded, otherwise -EAGAIN.
+ */
+static int pids_try_charge(struct pids_cgroup *pids, int num)
+{
+ struct pids_cgroup *p, *q;
+
+ for (p = pids; p; p = parent_pids(p)) {
+ int64_t new = atomic64_add_return(num, &p->counter);
+
+ /*
+ * Since new is capped to the maximum number of pid_t, if
+ * p->limit is %PIDS_MAX then we know that this test will never
+ * fail.
+ */
+ if (new > p->limit)
+ goto revert;
+ }
+
+ return 0;
+
+revert:
+ for (q = pids; q != p; q = parent_pids(q))
+ pids_cancel(q, num);
+ pids_cancel(p, num);
+
+ return -EAGAIN;
+}
+
+static int pids_can_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct pids_cgroup *pids = css_pids(css);
+ struct task_struct *task;
+ int64_t num = 0;
+
+ cgroup_taskset_for_each(task, tset)
+ num++;
+
+ /*
+ * Attaching to a cgroup is allowed to overcome the
+ * the PID limit, so that organisation operations aren't
+ * blocked by the `pids` cgroup controller.
+ */
+ pids_charge(pids, num);
+ return 0;
+}
+
+static void pids_cancel_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct pids_cgroup *pids = css_pids(css);
+ struct task_struct *task;
+ int64_t num = 0;
+
+ cgroup_taskset_for_each(task, tset)
+ num++;
+
+ pids_uncharge(pids, num);
+}
+
+static void pids_detach(struct cgroup_subsys_state *old_css,
+ struct task_struct *task)
+{
+ struct pids_cgroup *old_pids = css_pids(old_css);
+
+ pids_uncharge(old_pids, 1);
+}
+
+static int pids_can_fork(struct task_struct *task, void **private)
+{
+ struct cgroup_subsys_state *css;
+ struct pids_cgroup *pids;
+ int retval;
+
+ /*
+ * Use the "current" task_css for the pids subsystem as the tentative
+ * css. It is possible we will charge the wrong hierarchy, in which
+ * case we will forcefully revert/reapply the charge on the right
+ * hierarchy after it is committed to the task proper.
+ */
+ rcu_read_lock();
+ css = task_css(current, pids_cgrp_id);
+ if (!css_tryget_online(css)) {
+ retval = -EBUSY;
+ goto err_rcu_unlock;
+ }
+ rcu_read_unlock();
+ pids = css_pids(css);
+
+ retval = pids_try_charge(pids, 1);
+ if (retval)
+ goto err_css_put;
+
+ *private = css;
+ return 0;
+
+err_rcu_unlock:
+ rcu_read_unlock();
+err_css_put:
+ css_put(css);
+ return retval;
+}
+
+static void pids_cancel_fork(struct task_struct *task, void *private)
+{
+ struct cgroup_subsys_state *css = private;
+ struct pids_cgroup *pids = css_pids(css);
+
+ pids_uncharge(pids, 1);
+ css_put(css);
+}
+
+static void pids_fork(struct task_struct *task, void *private)
+{
+ struct cgroup_subsys_state *css;
+ struct cgroup_subsys_state *old_css = private;
+ struct pids_cgroup *pids;
+ struct pids_cgroup *old_pids = css_pids(old_css);
+
+ /*
+ * Get the current task css. Since the task has already been exposed to
+ * the system and had its cg_list updated, we know that we already have
+ * an implicit reference through task.
+ */
+ rcu_read_lock();
+ css = task_css(task, pids_cgrp_id);
+ css_get(css);
+ rcu_read_unlock();
+
+ pids = css_pids(css);
+
+ /*
+ * The association has changed, we have to revert and reapply the
+ * charge/uncharge on the wrong hierarchy to the current one. Since
+ * the association can only change due to an organisation event, its
+ * okay for us to ignore the limit in this case.
+ */
+ if (pids != old_pids) {
+ pids_uncharge(old_pids, 1);
+ pids_charge(pids, 1);
+ }
+
+ css_put(css);
+ css_put(old_css);
+}
+
+static void pids_exit(struct cgroup_subsys_state *css,
+ struct cgroup_subsys_state *old_css,
+ struct task_struct *task)
+{
+ struct pids_cgroup *pids = css_pids(old_css);
+
+ pids_uncharge(pids, 1);
+}
+
+static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct cgroup_subsys_state *css = of_css(of);
+ struct pids_cgroup *pids = css_pids(css);
+ int64_t limit;
+ int err;
+
+ buf = strstrip(buf);
+ if (!strcmp(buf, PIDS_MAX_STR)) {
+ limit = PIDS_MAX;
+ goto set_limit;
+ }
+
+ err = kstrtoll(buf, 0, &limit);
+ if (err)
+ return err;
+
+ /* We use INT_MAX as the maximum value of pid_t. */
+ if (limit < 0 || limit > INT_MAX)
+ return -EINVAL;
+
+set_limit:
+ /*
+ * Limit updates don't need to be mutex'd, since it isn't
+ * critical that any racing fork()s follow the new limit.
+ */
+ pids->limit = limit;
+ return nbytes;
+}
+
+static int pids_max_show(struct seq_file *sf, void *v)
+{
+ struct cgroup_subsys_state *css = seq_css(sf);
+ struct pids_cgroup *pids = css_pids(css);
+ int64_t limit = pids->limit;
+
+ if (limit == PIDS_MAX)
+ seq_printf(sf, "%s\n", PIDS_MAX_STR);
+ else
+ seq_printf(sf, "%lld\n", limit);
+
+ return 0;
+}
+
+static s64 pids_current_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct pids_cgroup *pids = css_pids(css);
+
+ return atomic64_read(&pids->counter);
+}
+
+static struct cftype files[] = {
+ {
+ .name = "max",
+ .write = pids_max_write,
+ .seq_show = pids_max_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .read_s64 = pids_current_read,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys pids_cgrp_subsys = {
+ .css_alloc = pids_css_alloc,
+ .css_free = pids_css_free,
+ .can_attach = pids_can_attach,
+ .cancel_attach = pids_cancel_attach,
+ .detach = pids_detach,
+ .can_fork = pids_can_fork,
+ .cancel_fork = pids_cancel_fork,
+ .fork = pids_fork,
+ .exit = pids_exit,
+ .legacy_cftypes = files,
+ .early_init = 0,
+};
--
2.3.5
Hello, Aleksa.
On Sun, Apr 19, 2015 at 10:22:31PM +1000, Aleksa Sarai wrote:
> -static int need_forkexit_callback __read_mostly;
> +static int need_fork_callback __read_mostly;
> +static int need_exit_callback __read_mostly;
These are bitmasks now, right? Let's make them unsigned int.
> static struct cftype cgroup_dfl_base_files[];
> +#define for_each_subsys_which(ss_mask, ss, ssid) \
> + for_each_subsys((ss), (ssid)) \
> + if ((ss_mask) & (1 << (ssid)))
Maybe using for_each_set_bit() is better?
#define for_each_subsys_which(ss_mask, ss, ssid) \
for_each_set_bit(ssid, &(ss_mask), CGROUP_SUBSYS_COUNT) \
if ((ss) = group_subsys[ssid] && false) \
; \
else
> @@ -4932,7 +4947,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
> * init_css_set is in the subsystem's root cgroup. */
> init_css_set.subsys[ss->id] = css;
>
> - need_forkexit_callback |= ss->fork || ss->exit;
> + need_fork_callback |= (bool) ss->fork << ss->id;
> + need_exit_callback |= (bool) ss->exit << ss->id;
^
please drop the space here
Other than the above minor points, looks good to me.
Thanks for the persistence.
--
tejun
Ooh, just one more thing.
On Sun, Apr 19, 2015 at 10:22:31PM +1000, Aleksa Sarai wrote:
> +/**
> + * for_each_subsys_which - filter for_each_subsys with a bitmask
> + * @ss_mask: the bitmask
> + * @ss: the iteration cursor
> + * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
> + *
> + * The block will only run for cases where the ssid-th bit (1 << ssid) of
> + * mask is set to 1.
> + */
> +#define for_each_subsys_which(ss_mask, ss, ssid) \
> + for_each_subsys((ss), (ssid)) \
> + if ((ss_mask) & (1 << (ssid)))
> +
This isn't completely consistent but we tend to put the cursors in
front of what's being iterated. e.g.
for_each_css(css, ssid, cgrp)
for_each_set_bit(bit, addr, size)
Following the pattern, for_each_subsys_which() prolly should do
for_each_subsys_which(ss, ssid, ss_mask)
rather than the other way around.
Thanks.
--
tejun
On Sun, Apr 19, 2015 at 10:22:32PM +1000, Aleksa Sarai wrote:
> Replace the explicit checking against ss_masks inside a for_each_subsys
> block with for_each_subsys_which(ss_mask, ...), to take advantage of the
> more readable macro.
>
> Signed-off-by: Aleksa Sarai <[email protected]>
Looks good to me.
Thanks.
--
tejun
On Wed, Apr 22, 2015 at 11:25:51AM -0400, Tejun Heo wrote:
> Hello, Aleksa.
>
> On Sun, Apr 19, 2015 at 10:22:31PM +1000, Aleksa Sarai wrote:
> > -static int need_forkexit_callback __read_mostly;
> > +static int need_fork_callback __read_mostly;
> > +static int need_exit_callback __read_mostly;
>
> These are bitmasks now, right? Let's make them unsigned int.
If, as per the below you want to use the bitmap ops; it needs be
unsigned long.
> > static struct cftype cgroup_dfl_base_files[];
> > +#define for_each_subsys_which(ss_mask, ss, ssid) \
> > + for_each_subsys((ss), (ssid)) \
> > + if ((ss_mask) & (1 << (ssid)))
>
> Maybe using for_each_set_bit() is better?
>
> #define for_each_subsys_which(ss_mask, ss, ssid) \
> for_each_set_bit(ssid, &(ss_mask), CGROUP_SUBSYS_COUNT) \
> if ((ss) = group_subsys[ssid] && false) \
> ; \
> else
Clever that ;-)
On Sun, Apr 19, 2015 at 10:22:33PM +1000, Aleksa Sarai wrote:
> @@ -25,6 +25,19 @@
>
> #ifdef CONFIG_CGROUPS
>
> +/* define the enumeration of all cgroup subsystems */
> +enum cgroup_subsys_id {
> +#define SUBSYS(_x) _x ## _cgrp_id,
> +#define SUBSYS_TAG(_t) CGROUP_ ## _t, \
> + __unused_tag_ ## _t = CGROUP_ ## _t - 1,
> +#include <linux/cgroup_subsys.h>
> +#undef SUBSYS_TAG
> +#undef SUBSYS
> + CGROUP_SUBSYS_COUNT,
> +};
It's generally a good idea to not combine code movement and
modification as it tends to obfuscate what's going on. Can you please
create a prep patch to move the chunk above?
> +#define CGROUP_PREFORK_COUNT (CGROUP_PREFORK_END - CGROUP_PREFORK_START)
Is it prefork or can_fork? Given post_fork, maybe pre_fork is a more
consistent name?
> +
> struct cgroup_root;
> struct cgroup_subsys;
> struct cgroup;
> @@ -32,7 +45,12 @@ struct cgroup;
> extern int cgroup_init_early(void);
> extern int cgroup_init(void);
> extern void cgroup_fork(struct task_struct *p);
> -extern void cgroup_post_fork(struct task_struct *p);
> +extern int cgroup_can_fork(struct task_struct *p,
> + void *ss_state[CGROUP_PREFORK_COUNT]);
> +extern void cgroup_cancel_fork(struct task_struct *p,
> + void *ss_state[CGROUP_PREFORK_COUNT]);
> +extern void cgroup_post_fork(struct task_struct *p,
> + void *old_ss_state[CGROUP_PREFORK_COUNT]);
Also, why are they named @ss_state when the param to the callbacks are
@private? Wouldn't ss_private[] be more consistent?
> @@ -945,10 +959,21 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
>
> struct cgroup_subsys_state;
>
> +#define CGROUP_PREFORK_COUNT 0
> +
> static inline int cgroup_init_early(void) { return 0; }
> static inline int cgroup_init(void) { return 0; }
> static inline void cgroup_fork(struct task_struct *p) {}
> -static inline void cgroup_post_fork(struct task_struct *p) {}
> +static inline int cgroup_can_fork(struct task_struct *p,
> + void *s[CGROUP_PREFORK_COUNT])
> +{
> + return 0;
> +}
Style consistency?
> +static inline void cgroup_cancel_fork(struct task_struct *p,
> + void *s[CGROUP_PREFORK_COUNT]) {}
> +static inline void cgroup_post_fork(struct task_struct *p,
> + void *s[CGROUP_PREFORK_COUNT]) {}
Why are the arguments named @s here?
> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
> index e4a96fb..fdd3551 100644
> --- a/include/linux/cgroup_subsys.h
> +++ b/include/linux/cgroup_subsys.h
> @@ -3,6 +3,16 @@
> *
> * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
> */
> +#ifndef SUBSYS
> +# define __TMP_SUBSYS
> +# define SUBSYS(_x)
> +#endif
Does it ever make sense to include cgroup_subsys.h w/o SUBSYS macro?
...
> + * These bitmask flags indicate whether tasks in the fork and exit paths
> + * should check for fork/exit handlers to call. This avoids us having to do
> + * extra work in the fork/exit path if a subsystems doesn't need to be
> + * called.
> */
> static int need_fork_callback __read_mostly;
> static int need_exit_callback __read_mostly;
This comment belongs to an earlier patch but need_ is a bit misnomer
at this point. Shouldn't it be more like have_fork_callback?
>
> +/* Ditto for the can_fork/cancel_fork/reapply_fork callbacks. */
> +static int need_canfork_callback __read_mostly;
> +static int need_cancelfork_callback __read_mostly;
And given that the reason we have these masks is avoiding iteration in
relatively hot paths. Does cancelfork mask make sense?
> @@ -412,7 +416,7 @@ static int notify_on_release(const struct cgroup *cgrp)
> (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
>
> /**
> - * for_each_subsys_which - filter for_each_subsys with a bitmask
> + * for_each_subsys_which - filter for_each_subsys with a subsys bitmask
Does this chunk belong to this patch?
> @@ -2078,6 +2084,18 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
> list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
>
> /*
> + * We detach from the old_cset subsystems here. We must do this
> + * before we drop the refcount for old_cset, in order to make sure
> + * that nobody frees it underneath us.
> + */
> + for_each_e_css(css, i, old_cgrp) {
> + struct cgroup_subsys_state *old_css = old_cset->subsys[i];
> +
> + if (old_css->ss->detach)
> + old_css->ss->detach(old_css, tsk);
> + }
I don't get this. What can ->detach do that ->can_attach cannot?
> +void cgroup_cancel_fork(struct task_struct *child,
> + void *ss_state[CGROUP_PREFORK_COUNT])
> +{
> + struct cgroup_subsys *ss;
> + int i;
> +
> + for_each_subsys_which(need_cancelfork_callback, ss, i) {
> + void *state = NULL;
> +
> + if (CGROUP_PREFORK_START <= i && i < CGROUP_PREFORK_END)
> + state = ss_state[i - CGROUP_PREFORK_START];
Maybe we want a helper callback which does
if (CGROUP_PREFORK_START <= ssid && ssid < CGROUP_PREFORK_END)
return &ss_state[ssid - CGROUP_PREFORK_START];
return NULL;
> +
> + ss->cancel_fork(child, state);
> + }
> +}
> @@ -1196,6 +1196,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
> {
> int retval;
> struct task_struct *p;
> + void *ss_state[CGROUP_PREFORK_COUNT] = {};
Please use a name which signifies that this is for cgroups.
> @@ -1468,6 +1469,18 @@ static struct task_struct *copy_process(unsigned long clone_flags,
> INIT_LIST_HEAD(&p->thread_group);
> p->task_works = NULL;
>
> +
> + /*
> + * Ensure that the cgroup subsystem policies allow the new process to be
> + * forked. If this fork is happening in an organization operation, then
> + * this will not charge the correct css_set. This is fixed during
> + * cgroup_post_fork() (when the css_set has been updated) by undoing
> + * this operation and forcefully charging the correct css_set.
> + */
I'm not sure the above description is appropriate for copy_process().
>From copy_process()'s perspective, it doesn't matter what goes on
internally and the behavior being described is pretty specific to the
way the pids controller is implemented.
Thanks.
--
tejun
Hey,
On Wed, Apr 22, 2015 at 05:42:12PM +0200, Peter Zijlstra wrote:
> If, as per the below you want to use the bitmap ops; it needs be
> unsigned long.
Ah, right.
> > > static struct cftype cgroup_dfl_base_files[];
> > > +#define for_each_subsys_which(ss_mask, ss, ssid) \
> > > + for_each_subsys((ss), (ssid)) \
> > > + if ((ss_mask) & (1 << (ssid)))
> >
> > Maybe using for_each_set_bit() is better?
> >
> > #define for_each_subsys_which(ss_mask, ss, ssid) \
> > for_each_set_bit(ssid, &(ss_mask), CGROUP_SUBSYS_COUNT) \
> > if ((ss) = group_subsys[ssid] && false) \
> > ; \
> > else
>
> Clever that ;-)
Thanks. It kinda bothers me that for_each_set_bit() doesn't collapse
to combo of ffs() + clearing bit off of a temp mask when size is const
and <= ulong, which would be quite a bit lighter. Right now it'd be
calling into generic find_first/next_bit() functions unconditionally.
Ah well, it can be optimized later.
--
tejun
> @@ -0,0 +1,368 @@
> +/*
> + * Process number limiting controller for cgroups.
> + *
> + * Used to allow a cgroup hierarchy to stop any new processes
> + * from fork()ing after a certain limit is reached.
> + *
> + * Since it is trivial to hit the task limit without hitting
> + * any kmemcg limits in place, PIDs are a fundamental resource.
> + * As such, PID exhaustion must be preventable in the scope of
> + * a cgroup hierarchy by allowing resource limiting of the
> + * number of tasks in a cgroup.
> + *
> + * In order to use the `pids` controller, set the maximum number
> + * of tasks in pids.max (this is not available in the root cgroup
> + * for obvious reasons). The number of processes currently
> + * in the cgroup is given by pids.current. Organisational operations
> + * are not blocked by cgroup policies, so it is possible to have
> + * pids.current > pids.max. However, fork()s will still not work.
> + *
> + * To set a cgroup to have no limit, set pids.max to "max". fork()
> + * will return -EBUSY if forking would cause a cgroup policy to be
> + * violated.
> + *
> + * pids.current tracks all child cgroup hierarchies, so
> + * parent/pids.current is a superset of parent/child/pids.current.
> + *
> + * Copyright (C) 2015 Aleksa Sarai <[email protected]>
The above text looks wrapped too narrow.
> +struct pids_cgroup {
> + struct cgroup_subsys_state css;
> +
> + /*
> + * Use 64-bit types so that we can safely represent "max" as
> + * (PID_MAX_LIMIT + 1).
^^^^^^^^^^^^^^^^^
...
> +static struct cgroup_subsys_state *
> +pids_css_alloc(struct cgroup_subsys_state *parent)
> +{
> + struct pids_cgroup *pids;
> +
> + pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
> + if (!pids)
> + return ERR_PTR(-ENOMEM);
> +
> + pids->limit = PIDS_MAX;
^^^^^^^^^
> + atomic64_set(&pids->counter, 0);
> + return &pids->css;
> +}
...
> +static void pids_detach(struct cgroup_subsys_state *old_css,
> + struct task_struct *task)
> +{
> + struct pids_cgroup *old_pids = css_pids(old_css);
> +
> + pids_uncharge(old_pids, 1);
> +}
You can do the above as a part of can/cancel.
> +static int pids_can_fork(struct task_struct *task, void **private)
Maybe @priv_p or something which signifies it's of different type from
others?
> +{
...
> + rcu_read_lock();
> + css = task_css(current, pids_cgrp_id);
> + if (!css_tryget_online(css)) {
> + retval = -EBUSY;
> + goto err_rcu_unlock;
> + }
> + rcu_read_unlock();
Hmmm... so, the above is guaranteed to succeed in finite amount of
time (the race window is actually very narrow) and it'd be silly to
fail fork because a task was being moved across cgroups.
I think it'd be a good idea to implement task_get_css() which loops
and returns the current css for the requested subsystem with reference
count bumped and it can use css_tryget() too. Holding a ref doesn't
prevent css from dying anyway, so it doesn't make any difference.
> +static void pids_fork(struct task_struct *task, void *private)
> +{
...
> + rcu_read_lock();
> + css = task_css(task, pids_cgrp_id);
> + css_get(css);
Why is this safe? What guarantees that css's ref isn't already zero
at this point?
> + rcu_read_unlock();
> +
> + pids = css_pids(css);
> +
> + /*
> + * The association has changed, we have to revert and reapply the
> + * charge/uncharge on the wrong hierarchy to the current one. Since
> + * the association can only change due to an organisation event, its
> + * okay for us to ignore the limit in this case.
> + */
> + if (pids != old_pids) {
> + pids_uncharge(old_pids, 1);
> + pids_charge(pids, 1);
> + }
> +
> + css_put(css);
> + css_put(old_css);
> +}
...
> +static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
> + size_t nbytes, loff_t off)
> +{
> + struct cgroup_subsys_state *css = of_css(of);
> + struct pids_cgroup *pids = css_pids(css);
> + int64_t limit;
> + int err;
> +
> + buf = strstrip(buf);
> + if (!strcmp(buf, PIDS_MAX_STR)) {
> + limit = PIDS_MAX;
> + goto set_limit;
> + }
> +
> + err = kstrtoll(buf, 0, &limit);
> + if (err)
> + return err;
> +
> + /* We use INT_MAX as the maximum value of pid_t. */
> + if (limit < 0 || limit > INT_MAX)
This is kinda weird if we're using PIDS_MAX for max as it may end up
showing "max" after some larger number is written to the file.
Thanks.
--
tejun
Hi Tejun,
>> + rcu_read_lock();
>> + css = task_css(current, pids_cgrp_id);
>> + if (!css_tryget_online(css)) {
>> + retval = -EBUSY;
>> + goto err_rcu_unlock;
>> + }
>> + rcu_read_unlock();
>
> Hmmm... so, the above is guaranteed to succeed in finite amount of
> time (the race window is actually very narrow) and it'd be silly to
> fail fork because a task was being moved across cgroups.
>
> I think it'd be a good idea to implement task_get_css() which loops
> and returns the current css for the requested subsystem with reference
> count bumped and it can use css_tryget() too. Holding a ref doesn't
> prevent css from dying anyway, so it doesn't make any difference.
Hmmm, okay. I'll work on this later.
>> + rcu_read_lock();
>> + css = task_css(task, pids_cgrp_id);
>> + css_get(css);
>
> Why is this safe? What guarantees that css's ref isn't already zero
> at this point?
Because it's already been exposed by pids_fork, so the current css_set
(which contains the current css)'s ref has been bumped. There isn't a
guarantee that there is a ref to css, but there is a guarantee the
css_set it is in has a ref. The problem with using tryget is that we
can't fail here.
>> +static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
>> + size_t nbytes, loff_t off)
>> +{
>> + struct cgroup_subsys_state *css = of_css(of);
>> + struct pids_cgroup *pids = css_pids(css);
>> + int64_t limit;
>> + int err;
>> +
>> + buf = strstrip(buf);
>> + if (!strcmp(buf, PIDS_MAX_STR)) {
>> + limit = PIDS_MAX;
>> + goto set_limit;
>> + }
>> +
>> + err = kstrtoll(buf, 0, &limit);
>> + if (err)
>> + return err;
>> +
>> + /* We use INT_MAX as the maximum value of pid_t. */
>> + if (limit < 0 || limit > INT_MAX)
>
> This is kinda weird if we're using PIDS_MAX for max as it may end up
> showing "max" after some larger number is written to the file.
The reason for this is because I believe you said "PIDS_MAX isn't
meant to be exposed to userspace" (one of the previous patchsets used
PIDS_MAX as the maximum valid value).
--
Aleksa Sarai (cyphar)
http://www.cyphar.com
Hey,
>> +#define CGROUP_PREFORK_COUNT 0
>> +
>> static inline int cgroup_init_early(void) { return 0; }
>> static inline int cgroup_init(void) { return 0; }
>> static inline void cgroup_fork(struct task_struct *p) {}
>> -static inline void cgroup_post_fork(struct task_struct *p) {}
>> +static inline int cgroup_can_fork(struct task_struct *p,
>> + void *s[CGROUP_PREFORK_COUNT])
>> +{
>> + return 0;
>> +}
>
> Style consistency?
It's because it wraps. I can move it to be something like:
static inline int cgroup_can_fork(struct task_struct *p,
void *s[CGROUP_PREFORK_COUNT])
{ return 0; }
If you like.
>> @@ -2078,6 +2084,18 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
>> list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
>>
>> /*
>> + * We detach from the old_cset subsystems here. We must do this
>> + * before we drop the refcount for old_cset, in order to make sure
>> + * that nobody frees it underneath us.
>> + */
>> + for_each_e_css(css, i, old_cgrp) {
>> + struct cgroup_subsys_state *old_css = old_cset->subsys[i];
>> +
>> + if (old_css->ss->detach)
>> + old_css->ss->detach(old_css, tsk);
>> + }
>
> I don't get this. What can ->detach do that ->can_attach cannot?
->detach signifies that a task is being migrated away from a cgroup.
On second thought, we could just use task_css() on each task in the
tset to figure out what the cgroup the task is being migrated away
from is and just uncharge that inside ->can_attach.
On the same point, are all the tasks in a tset passed to ->can_attach
guaranteed to have the same css? Or do I have to uncharge each one
individually?
--
Aleksa Sarai (cyphar)
http://www.cyphar.com
>>> + rcu_read_lock();
>>> + css = task_css(current, pids_cgrp_id);
>>> + if (!css_tryget_online(css)) {
>>> + retval = -EBUSY;
>>> + goto err_rcu_unlock;
>>> + }
>>> + rcu_read_unlock();
>>
>> Hmmm... so, the above is guaranteed to succeed in finite amount of
>> time (the race window is actually very narrow) and it'd be silly to
>> fail fork because a task was being moved across cgroups.
>>
>> I think it'd be a good idea to implement task_get_css() which loops
>> and returns the current css for the requested subsystem with reference
>> count bumped and it can use css_tryget() too. Holding a ref doesn't
>> prevent css from dying anyway, so it doesn't make any difference.
>
> Hmmm, okay. I'll work on this later.
Would something like this suffice?
struct cgroup_subsys_state *task_get_css(struct task_struct *task, int
subsys_id) {
bool have_ref = false;
struct cgroup_subsys_state *css;
while(!have_ref) {
rcu_read_lock();
css = task_css(task, subsys_id);
have_ref = !css_tryget(css);
rcu_read_unlock();
}
return css;
}
Also, as a side note (in the same vein I guess), does a ref on a
css_set give you an implicit ref on a css inside that css_set, or are
those two orthogonal operations?
--
Aleksa Sarai (cyphar)
http://www.cyphar.com
Also,
>> +struct pids_cgroup {
>> + struct cgroup_subsys_state css;
>> +
>> + /*
>> + * Use 64-bit types so that we can safely represent "max" as
>> + * (PID_MAX_LIMIT + 1).
> ^^^^^^^^^^^^^^^^^
> ...
>> +static struct cgroup_subsys_state *
>> +pids_css_alloc(struct cgroup_subsys_state *parent)
>> +{
>> + struct pids_cgroup *pids;
>> +
>> + pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
>> + if (!pids)
>> + return ERR_PTR(-ENOMEM);
>> +
>> + pids->limit = PIDS_MAX;
> ^^^^^^^^^
%PIDS_MAX = (%PID_MAX_LIMIT + 1). I can clarify this in the comments
if you want.
--
Aleksa Sarai (cyphar)
http://www.cyphar.com
Hello, Aleksa.
On Sat, Apr 25, 2015 at 12:07:34AM +1000, Aleksa Sarai wrote:
> Would something like this suffice?
>
> struct cgroup_subsys_state *task_get_css(struct task_struct *task, int
> subsys_id) {
> bool have_ref = false;
> struct cgroup_subsys_state *css;
>
> while(!have_ref) {
> rcu_read_lock();
> css = task_css(task, subsys_id);
> have_ref = !css_tryget(css);
> rcu_read_unlock();
> }
>
> return css;
> }
I was thinking why this felt so familiar and realized that I have the
patch pending.
http://lkml.kernel.org/g/[email protected]
Please feel free to include it in the patch series. I'll sort out the
merging later.
> Also, as a side note (in the same vein I guess), does a ref on a
> css_set give you an implicit ref on a css inside that css_set, or are
> those two orthogonal operations?
Yes, it does, but if you're gonna depend on that, please document that.
Thanks.
--
tejun
Hello,
On Thu, Apr 23, 2015 at 10:43:12AM +1000, Aleksa Sarai wrote:
> > Why is this safe? What guarantees that css's ref isn't already zero
> > at this point?
>
> Because it's already been exposed by pids_fork, so the current css_set
But what prevents against the task being migrated to a different
cgroup?
> (which contains the current css)'s ref has been bumped. There isn't a
> guarantee that there is a ref to css, but there is a guarantee the
> css_set it is in has a ref. The problem with using tryget is that we
> can't fail here.
The guarantee you have there is the css_set wouldn't go away until rcu
lock is dropped and you can deref csses from it. The way it's
currently implemented, you're guaranteed to have references to the
csses but that's sort of implementation detail. It can be implemented
in different ways.
A task, as long as it's alive, is guaranteed to have a css associated
with it all the time. What the tryget protects is races against the
task being migrated to a different cgroup, so retrying until success
is guaranteed to finish in a short amount of time.
> >> + /* We use INT_MAX as the maximum value of pid_t. */
> >> + if (limit < 0 || limit > INT_MAX)
> >
> > This is kinda weird if we're using PIDS_MAX for max as it may end up
> > showing "max" after some larger number is written to the file.
>
> The reason for this is because I believe you said "PIDS_MAX isn't
> meant to be exposed to userspace" (one of the previous patchsets used
> PIDS_MAX as the maximum valid value).
Yeah, but wouldn't it be weird to allow the userland to input PIDS_MAX
(whatever value that may be) and reads back max? It can be whatever
maximum input value + 1, no?
Thanks.
--
tejun
On Fri, Apr 24, 2015 at 11:59:31PM +1000, Aleksa Sarai wrote:
> Hey,
>
> >> +#define CGROUP_PREFORK_COUNT 0
> >> +
> >> static inline int cgroup_init_early(void) { return 0; }
> >> static inline int cgroup_init(void) { return 0; }
> >> static inline void cgroup_fork(struct task_struct *p) {}
> >> -static inline void cgroup_post_fork(struct task_struct *p) {}
> >> +static inline int cgroup_can_fork(struct task_struct *p,
> >> + void *s[CGROUP_PREFORK_COUNT])
> >> +{
> >> + return 0;
> >> +}
> >
> > Style consistency?
>
> It's because it wraps. I can move it to be something like:
>
> static inline int cgroup_can_fork(struct task_struct *p,
> void *s[CGROUP_PREFORK_COUNT])
> { return 0; }
static inline int cgroup_can_fork(struct task_struct *p,
void *s[CGROUP_PREFORK_COUNT])
{ return 0; }
> >> @@ -2078,6 +2084,18 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
> >> list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
> >>
> >> /*
> >> + * We detach from the old_cset subsystems here. We must do this
> >> + * before we drop the refcount for old_cset, in order to make sure
> >> + * that nobody frees it underneath us.
> >> + */
> >> + for_each_e_css(css, i, old_cgrp) {
> >> + struct cgroup_subsys_state *old_css = old_cset->subsys[i];
> >> +
> >> + if (old_css->ss->detach)
> >> + old_css->ss->detach(old_css, tsk);
> >> + }
> >
> > I don't get this. What can ->detach do that ->can_attach cannot?
>
> ->detach signifies that a task is being migrated away from a cgroup.
> On second thought, we could just use task_css() on each task in the
> tset to figure out what the cgroup the task is being migrated away
> from is and just uncharge that inside ->can_attach.
Yeah, that's how it's supposed to be used. You have access to both
pre and post csses before migration and only to the new ones once by
the time ->attach() is running.
> On the same point, are all the tasks in a tset passed to ->can_attach
> guaranteed to have the same css? Or do I have to uncharge each one
> individually?
They're the same.
Thanks.
--
tejun
Hey,
>> > > static struct cftype cgroup_dfl_base_files[];
>> > > +#define for_each_subsys_which(ss_mask, ss, ssid) \
>> > > + for_each_subsys((ss), (ssid)) \
>> > > + if ((ss_mask) & (1 << (ssid)))
>> >
>> > Maybe using for_each_set_bit() is better?
>> >
>> > #define for_each_subsys_which(ss_mask, ss, ssid) \
>> > for_each_set_bit(ssid, &(ss_mask), CGROUP_SUBSYS_COUNT) \
>> > if ((ss) = group_subsys[ssid] && false) \
>> > ; \
>> > else
>>
>> Clever that ;-)
>
> Thanks. It kinda bothers me that for_each_set_bit() doesn't collapse
> to combo of ffs() + clearing bit off of a temp mask when size is const
> and <= ulong, which would be quite a bit lighter. Right now it'd be
> calling into generic find_first/next_bit() functions unconditionally.
> Ah well, it can be optimized later.
This macro causes issues with lines like:
for_each_subsys_which(ss, ssid, ~cgrp_dfl_root_inhibit_ss_mask)
In addition, there are a bunch of cgroup_* functions that use unsigned
ints for bitops (cgroup_calc_child_subsys_mask, rebind_subsystems,
cgroup_print_ss_mask). Is there a better solution to this problem, or
should I just switch back to my naive solution?
--
Aleksa Sarai (cyphar)
http://www.cyphar.com
Hello,
On Mon, Apr 27, 2015 at 02:05:47AM +1000, Aleksa Sarai wrote:
> In addition, there are a bunch of cgroup_* functions that use unsigned
> ints for bitops (cgroup_calc_child_subsys_mask, rebind_subsystems,
> cgroup_print_ss_mask). Is there a better solution to this problem, or
> should I just switch back to my naive solution?
Hmmmm... You can either convert all masks to ulong (which is fine) or
do something like the following.
#define for_each_subsys_which(ss_mask, ss, ssid) \
unsigned long __tmp_mask = (ss_mask); \
for_each_set_bit(ssid, &__tmp_mask, CGROUP_SUBSYS_COUNT)\
if ((ss) = group_subsys[ssid] && false) \
; \
else
Thanks.
--
tejun
Hi Tejun,
> Hmmmm... You can either convert all masks to ulong (which is fine) or
> do something like the following.
>
> #define for_each_subsys_which(ss_mask, ss, ssid) \
> unsigned long __tmp_mask = (ss_mask); \
> for_each_set_bit(ssid, &__tmp_mask, CGROUP_SUBSYS_COUNT)\
> if ((ss) = group_subsys[ssid] && false) \
> ; \
> else
I think I'll just do casting and manually make new variables where
required. The above (and similar) doesn't work very well, generates
warnings like crazy and breaks stuff like:
if (...)
for_each_subsys_which(...)
--
Aleksa Sarai (cyphar)
http://www.cyphar.com
Hello, Aleksa.
On Wed, May 13, 2015 at 03:44:51PM +1000, Aleksa Sarai wrote:
> I think I'll just do casting and manually make new variables where
> required. The above (and similar) doesn't work very well, generates
> warnings like crazy and breaks stuff like:
>
> if (...)
> for_each_subsys_which(...)
Ah, right, and we can't wrap the declaration in a new block. I'm okay
with converting ss_masks to unsigned longs if necessary. Just do a
wholesale conversion.
Thanks.
--
tejun
Hi Tejun
>> >> + /* We use INT_MAX as the maximum value of pid_t. */
>> >> + if (limit < 0 || limit > INT_MAX)
>> >
>> > This is kinda weird if we're using PIDS_MAX for max as it may end up
>> > showing "max" after some larger number is written to the file.
>>
>> The reason for this is because I believe you said "PIDS_MAX isn't
>> meant to be exposed to userspace" (one of the previous patchsets used
>> PIDS_MAX as the maximum valid value).
>
> Yeah, but wouldn't it be weird to allow the userland to input PIDS_MAX
> (whatever value that may be) and reads back max? It can be whatever
> maximum input value + 1, no?
Would you be okay with this?
if (limit < 0 || limit >= PIDS_MAX)
I'd prefer if we used PIDS_MAX as the maximum input value as well as
being the internal representation of the maximum, rather than
switching to something like INT_MAX.
--
Aleksa Sarai (cyphar)
http://www.cyphar.com
Hello,
On Thu, May 14, 2015 at 03:04:52AM +1000, Aleksa Sarai wrote:
> Would you be okay with this?
>
> if (limit < 0 || limit >= PIDS_MAX)
>
> I'd prefer if we used PIDS_MAX as the maximum input value as well as
> being the internal representation of the maximum, rather than
> switching to something like INT_MAX.
Yeah, that sounds okay to me but I forgot why we went for INT_MAX in
the first place. Do you remember why we tried INT_MAX at all?
Thanks.
--
tejun
>> Would you be okay with this?
>>
>> if (limit < 0 || limit >= PIDS_MAX)
>>
>> I'd prefer if we used PIDS_MAX as the maximum input value as well as
>> being the internal representation of the maximum, rather than
>> switching to something like INT_MAX.
>
> Yeah, that sounds okay to me but I forgot why we went for INT_MAX in
> the first place. Do you remember why we tried INT_MAX at all?
>
> Thanks.
>
> --
> tejun
I think it's because we didn't want to expose PIDS_MAX to userspace.
But we're not *really* exposing it, we're just enforcing the input
limit for "max".
--
Aleksa Sarai (cyphar)
http://www.cyphar.com
On Thu, May 14, 2015 at 03:44:24AM +1000, Aleksa Sarai wrote:
> I think it's because we didn't want to expose PIDS_MAX to userspace.
> But we're not *really* exposing it, we're just enforcing the input
> limit for "max".
Ah, PIDS_MAX is fine then.
Thanks.
--
tejun
Hi Tejun,
>> +/* Ditto for the can_fork/cancel_fork/reapply_fork callbacks. */
>> +static int need_canfork_callback __read_mostly;
>> +static int need_cancelfork_callback __read_mostly;
>
> And given that the reason we have these masks is avoiding iteration in
> relatively hot paths. Does cancelfork mask make sense?
Do you still want me to remove it? I only added it because it made the
callback code more consistent for cancel_fork and can_fork.
>> +void cgroup_cancel_fork(struct task_struct *child,
>> + void *ss_state[CGROUP_PREFORK_COUNT])
>> +{
>> + struct cgroup_subsys *ss;
>> + int i;
>> +
>> + for_each_subsys_which(need_cancelfork_callback, ss, i) {
>> + void *state = NULL;
>> +
>> + if (CGROUP_PREFORK_START <= i && i < CGROUP_PREFORK_END)
>> + state = ss_state[i - CGROUP_PREFORK_START];
>
> Maybe we want a helper callback which does
>
> if (CGROUP_PREFORK_START <= ssid && ssid < CGROUP_PREFORK_END)
> return &ss_state[ssid - CGROUP_PREFORK_START];
> return NULL;
What would be a nice name for it? I can't think of anything better
than __get_ss_private() and __get_ss_privatep().
--
Aleksa Sarai (cyphar)
http://www.cyphar.com
Hello, Aleksa.
On Thu, May 14, 2015 at 08:57:49PM +1000, Aleksa Sarai wrote:
> >> +/* Ditto for the can_fork/cancel_fork/reapply_fork callbacks. */
> >> +static int need_canfork_callback __read_mostly;
> >> +static int need_cancelfork_callback __read_mostly;
> >
> > And given that the reason we have these masks is avoiding iteration in
> > relatively hot paths. Does cancelfork mask make sense?
>
> Do you still want me to remove it? I only added it because it made the
> callback code more consistent for cancel_fork and can_fork.
This doesn't make much sense to me. Why don't we have masks for other
callbacks then? This way, we're breaking the consistency regarding
why the mask is used in the first place.
> > Maybe we want a helper callback which does
> >
> > if (CGROUP_PREFORK_START <= ssid && ssid < CGROUP_PREFORK_END)
> > return &ss_state[ssid - CGROUP_PREFORK_START];
> > return NULL;
>
> What would be a nice name for it? I can't think of anything better
> than __get_ss_private() and __get_ss_privatep().
Do we need the double underscores? subsys_prefork_priv()?
Thanks.
--
tejun
Hi Tejun,
One question RE: defaults for .config. What is the kernel policy for
deciding if a particular subsystem should be made enabled-by-default?
--
Aleksa Sarai (cyphar)
http://www.cyphar.com
On Sat, May 16, 2015 at 01:59:09PM +1000, Aleksa Sarai wrote:
> One question RE: defaults for .config. What is the kernel policy for
> deciding if a particular subsystem should be made enabled-by-default?
Just default to N.
Thanks.
--
tejun