This is a small update to v12 of the pids patchset[1] (most of which was
merged into Tejun's tree, but was later reverted due to some bogus GCC
warnings). The main changes are:
* Fix up include/linux/cgroup.h changes to take into account Tejun's
refactoring of cgroup.h into cgroup-defs.h.
* Move ->can_attach() and ->cancel_attach() charge/revert into
->attach(), since we can't fail and it's simpler to just charge there.
* Remove tset_get_css(), because the premise of its existence is flawed,
and switch the ->attach() code to use task_get_css() directly.
* Remove a patch that was rendered unnecessary by the cgroup.h refactor.
* Fix up the for_each_subsys_which patchset so that it doesn't trigger
a GCC warning when CGROUP_SUBSYS_COUNT = 0, but cgroups are enabled.
The warning is bogus anyway, since the whole block becomes a noop in
that case.
[1]: https://lkml.org/lkml/2015/5/18/439
Aleksa Sarai (4):
cgroup: use bitmask to filter for_each_subsys
cgroup: replace explicit ss_mask checking with for_each_subsys_which
cgroup: allow a cgroup subsystem to reject a fork
cgroup: implement the PIDs subsystem
Tejun Heo (1):
cgroup, block: implement task_get_css()
CREDITS | 5 +
include/linux/cgroup-defs.h | 12 +-
include/linux/cgroup.h | 41 ++++-
include/linux/cgroup_subsys.h | 28 ++++
init/Kconfig | 16 ++
kernel/Makefile | 1 +
kernel/cgroup.c | 171 +++++++++++++------
kernel/cgroup_freezer.c | 2 +-
kernel/cgroup_pids.c | 377 ++++++++++++++++++++++++++++++++++++++++++
kernel/fork.c | 17 +-
kernel/sched/core.c | 2 +-
11 files changed, 616 insertions(+), 56 deletions(-)
create mode 100644 kernel/cgroup_pids.c
--
2.4.2
Add a new macro for_each_subsys_which that allows all enabled cgroup
subsystems to be filtered by a bitmask, such that mask & (1 << ssid)
determines if the subsystem is to be processed in the loop body (where
ssid is the unique id of the subsystem).
Also replace need_forkexit_callback with two separate bitmasks, one for
each callback, so that explicit ss->{fork,exit} checks are unnecessary.
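For example, with the bitmasks added below, the fork path in
cgroup_post_fork() reduces to the following (taken from the diff below):

	for_each_subsys_which(ss, i, &have_fork_callback)
		ss->fork(child);

so the loop body only runs for subsystems that actually registered a
->fork() callback, without testing ss->fork on every iteration.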
Signed-off-by: Aleksa Sarai <[email protected]>
---
include/linux/cgroup-defs.h | 2 ++
kernel/cgroup.c | 54 ++++++++++++++++++++++++++++-----------------
2 files changed, 36 insertions(+), 20 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 26d1cea..c5588c4 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -490,6 +490,8 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
#else /* CONFIG_CGROUPS */
+#define CGROUP_SUBSYS_COUNT 0
+
static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {}
static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0fd5227..5734717 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -178,12 +178,13 @@ static DEFINE_IDR(cgroup_hierarchy_idr);
*/
static u64 css_serial_nr_next = 1;
-/* This flag indicates whether tasks in the fork and exit paths should
- * check for fork/exit handlers to call. This avoids us having to do
- * extra work in the fork/exit path if none of the subsystems need to
- * be called.
+/*
+ * These bitmask flags indicate whether tasks in the fork and exit paths have
+ * fork/exit handlers to call. This avoids us having to do extra work in the
+ * fork/exit path to check which subsystems have fork/exit callbacks.
*/
-static int need_forkexit_callback __read_mostly;
+static unsigned long have_fork_callback __read_mostly;
+static unsigned long have_exit_callback __read_mostly;
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
@@ -412,6 +413,25 @@ static int notify_on_release(const struct cgroup *cgrp)
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
(((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
+
+/**
+ * for_each_subsys_which - filter for_each_subsys with a bitmask
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ * @ss_maskp: a pointer to the bitmask
+ *
+ * The block will only run for cases where the ssid-th bit (1 << ssid) of
+ * mask is set to 1.
+ */
+#define for_each_subsys_which(ss, ssid, ss_maskp) \
+ if (!CGROUP_SUBSYS_COUNT) \
+ ; \
+ else \
+ for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT) \
+ if (((ss) = cgroup_subsys[ssid]) && false) \
+ break; \
+ else
+
/* iterate across the hierarchies */
#define for_each_root(root) \
list_for_each_entry((root), &cgroup_roots, root_list)
@@ -4914,7 +4934,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
* init_css_set is in the subsystem's root cgroup. */
init_css_set.subsys[ss->id] = css;
- need_forkexit_callback |= ss->fork || ss->exit;
+ have_fork_callback |= (bool)ss->fork << ss->id;
+ have_exit_callback |= (bool)ss->exit << ss->id;
/* At system boot, before all subsystems have been
* registered, no tasks have been forked, so we don't
@@ -5225,11 +5246,8 @@ void cgroup_post_fork(struct task_struct *child)
* css_set; otherwise, @child might change state between ->fork()
* and addition to css_set.
*/
- if (need_forkexit_callback) {
- for_each_subsys(ss, i)
- if (ss->fork)
- ss->fork(child);
- }
+ for_each_subsys_which(ss, i, &have_fork_callback)
+ ss->fork(child);
}
/**
@@ -5273,16 +5291,12 @@ void cgroup_exit(struct task_struct *tsk)
cset = task_css_set(tsk);
RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
- if (need_forkexit_callback) {
- /* see cgroup_post_fork() for details */
- for_each_subsys(ss, i) {
- if (ss->exit) {
- struct cgroup_subsys_state *old_css = cset->subsys[i];
- struct cgroup_subsys_state *css = task_css(tsk, i);
+ /* see cgroup_post_fork() for details */
+ for_each_subsys_which(ss, i, &have_exit_callback) {
+ struct cgroup_subsys_state *old_css = cset->subsys[i];
+ struct cgroup_subsys_state *css = task_css(tsk, i);
- ss->exit(css, old_css, tsk);
- }
- }
+ ss->exit(css, old_css, tsk);
}
if (put_cset)
--
2.4.2
Replace the explicit checking against ss_masks inside a for_each_subsys
block with for_each_subsys_which(..., ss_mask), to take advantage of the
more readable (and more efficient) macro.
Signed-off-by: Aleksa Sarai <[email protected]>
---
kernel/cgroup.c | 44 ++++++++++++++++----------------------------
1 file changed, 16 insertions(+), 28 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 5734717..c5400d20 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1107,9 +1107,8 @@ static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
while (true) {
unsigned long new_ss_mask = cur_ss_mask;
- for_each_subsys(ss, ssid)
- if (cur_ss_mask & (1 << ssid))
- new_ss_mask |= ss->depends_on;
+ for_each_subsys_which(ss, ssid, &cur_ss_mask)
+ new_ss_mask |= ss->depends_on;
/*
* Mask out subsystems which aren't available. This can
@@ -1247,10 +1246,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
lockdep_assert_held(&cgroup_mutex);
- for_each_subsys(ss, ssid) {
- if (!(ss_mask & (1 << ssid)))
- continue;
-
+ for_each_subsys_which(ss, ssid, &ss_mask) {
/* if @ss has non-root csses attached to it, can't move */
if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
return -EBUSY;
@@ -1287,18 +1283,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
* Nothing can fail from this point on. Remove files for the
* removed subsystems and rebind each subsystem.
*/
- for_each_subsys(ss, ssid)
- if (ss_mask & (1 << ssid))
- cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
+ for_each_subsys_which(ss, ssid, &ss_mask)
+ cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
- for_each_subsys(ss, ssid) {
+ for_each_subsys_which(ss, ssid, &ss_mask) {
struct cgroup_root *src_root;
struct cgroup_subsys_state *css;
struct css_set *cset;
- if (!(ss_mask & (1 << ssid)))
- continue;
-
src_root = ss->root;
css = cgroup_css(&src_root->cgrp, ss);
@@ -2557,13 +2549,11 @@ static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask)
bool printed = false;
int ssid;
- for_each_subsys(ss, ssid) {
- if (ss_mask & (1 << ssid)) {
- if (printed)
- seq_putc(seq, ' ');
- seq_printf(seq, "%s", ss->name);
- printed = true;
- }
+ for_each_subsys_which(ss, ssid, &ss_mask) {
+ if (printed)
+ seq_putc(seq, ' ');
+ seq_printf(seq, "%s", ss->name);
+ printed = true;
}
if (printed)
seq_putc(seq, '\n');
@@ -2705,11 +2695,12 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
*/
buf = strstrip(buf);
while ((tok = strsep(&buf, " "))) {
+ unsigned long tmp_ss_mask = ~cgrp_dfl_root_inhibit_ss_mask;
+
if (tok[0] == '\0')
continue;
- for_each_subsys(ss, ssid) {
- if (ss->disabled || strcmp(tok + 1, ss->name) ||
- ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
+ for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
+ if (ss->disabled || strcmp(tok + 1, ss->name))
continue;
if (*tok == '+') {
@@ -2796,10 +2787,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
* still around. In such cases, wait till it's gone using
* offline_waitq.
*/
- for_each_subsys(ss, ssid) {
- if (!(css_enable & (1 << ssid)))
- continue;
-
+ for_each_subsys_which(ss, ssid, &css_enable) {
cgroup_for_each_live_child(child, cgrp) {
DEFINE_WAIT(wait);
--
2.4.2
From: Tejun Heo <[email protected]>
Implement task_get_css() which finds and pins the css for the specified
task and subsys. As a task is always associated with an online css
for every subsystem except while the css_set update is propagating,
task_get_css() retries till css_tryget_online() succeeds.
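As a usage sketch (not part of this patch): the returned css is pinned, so
every call must eventually be paired with a css_put(). The subsystem id used
below, pids_cgrp_id, only exists once the pids controller later in this
series is enabled; any id generated from cgroup_subsys.h works the same way.

	struct cgroup_subsys_state *css;

	css = task_get_css(current, pids_cgrp_id);
	/* css is guaranteed to be online and pinned here */
	css_put(css);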
Signed-off-by: Tejun Heo <[email protected]>
Cc: Li Zefan <[email protected]>
---
include/linux/cgroup.h | 25 +++++++++++++++++++++++++
1 file changed, 25 insertions(+)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 82319fb..a593e29 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -354,6 +354,31 @@ static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
}
/**
+ * task_get_css - find and get the css for (task, subsys)
+ * @task: the target task
+ * @subsys_id: the target subsystem ID
+ *
+ * Find the css for the (@task, @subsys_id) combination, increment a
+ * reference on and return it. This function is guaranteed to return a
+ * valid css.
+ */
+static inline struct cgroup_subsys_state *
+task_get_css(struct task_struct *task, int subsys_id)
+{
+ struct cgroup_subsys_state *css;
+
+ rcu_read_lock();
+ while (true) {
+ css = task_css(task, subsys_id);
+ if (likely(css_tryget_online(css)))
+ break;
+ cpu_relax();
+ }
+ rcu_read_unlock();
+ return css;
+}
+
+/**
* task_css_is_root - test whether a task belongs to the root css
* @task: the target task
* @subsys_id: the target subsystem ID
--
2.4.2
Add a new cgroup subsystem callback can_fork that conditionally
states whether or not the fork is accepted or rejected by a cgroup
policy. In addition, add a cancel_fork callback so that if an error
occurs later in the forking process, any state modified by can_fork can
be reverted.
Allow for a private opaque pointer to be passed from cgroup_can_fork to
cgroup_post_fork, allowing for the fork state to be stored by each
subsystem separately.
Also add a tagging system for cgroup_subsys.h to allow for CGROUP_<TAG>
enumerations to be defined and used. In addition, explicitly add a
CGROUP_CANFORK_COUNT macro to make arrays easier to define.
This is in preparation for implementing the pids cgroup subsystem.
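As a rough sketch of how a controller is expected to wire these callbacks
(the foo_* names and helpers below are hypothetical; the real user is the
pids controller added later in this series): charge in ->can_fork(), stash
any per-fork state through @priv_p, and undo the charge in ->cancel_fork()
if the fork fails later on.

	static int foo_can_fork(struct task_struct *task, void **priv_p)
	{
		struct foo_cgroup *foo = get_current_foo();	/* hypothetical helper */
		int err;

		err = foo_try_charge(foo, 1);			/* hypothetical helper */
		if (err)
			return err;

		*priv_p = foo;	/* handed back to ->cancel_fork()/->fork() */
		return 0;
	}

	static void foo_cancel_fork(struct task_struct *task, void *priv)
	{
		foo_uncharge(priv, 1);				/* hypothetical helper */
	}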
Signed-off-by: Aleksa Sarai <[email protected]>
---
include/linux/cgroup-defs.h | 10 +++++-
include/linux/cgroup.h | 16 +++++++--
include/linux/cgroup_subsys.h | 23 +++++++++++++
kernel/cgroup.c | 75 +++++++++++++++++++++++++++++++++++++++++--
kernel/cgroup_freezer.c | 2 +-
kernel/fork.c | 17 ++++++++--
kernel/sched/core.c | 2 +-
7 files changed, 136 insertions(+), 9 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index c5588c4..2009ceb 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -34,12 +34,17 @@ struct seq_file;
/* define the enumeration of all cgroup subsystems */
#define SUBSYS(_x) _x ## _cgrp_id,
+#define SUBSYS_TAG(_t) CGROUP_ ## _t, \
+ __unused_tag_ ## _t = CGROUP_ ## _t - 1,
enum cgroup_subsys_id {
#include <linux/cgroup_subsys.h>
CGROUP_SUBSYS_COUNT,
};
+#undef SUBSYS_TAG
#undef SUBSYS
+#define CGROUP_CANFORK_COUNT (CGROUP_CANFORK_END - CGROUP_CANFORK_START)
+
/* bits in struct cgroup_subsys_state flags field */
enum {
CSS_NO_REF = (1 << 0), /* no reference counting for this css */
@@ -405,7 +410,9 @@ struct cgroup_subsys {
struct cgroup_taskset *tset);
void (*attach)(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset);
- void (*fork)(struct task_struct *task);
+ int (*can_fork)(struct task_struct *task, void **priv_p);
+ void (*cancel_fork)(struct task_struct *task, void *priv);
+ void (*fork)(struct task_struct *task, void *priv);
void (*exit)(struct cgroup_subsys_state *css,
struct cgroup_subsys_state *old_css,
struct task_struct *task);
@@ -490,6 +497,7 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
#else /* CONFIG_CGROUPS */
+#define CGROUP_CANFORK_COUNT 0
#define CGROUP_SUBSYS_COUNT 0
static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {}
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index a593e29..17d0046 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -62,9 +62,15 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *tsk);
void cgroup_fork(struct task_struct *p);
-void cgroup_post_fork(struct task_struct *p);
+extern int cgroup_can_fork(struct task_struct *p,
+ void *ss_priv[CGROUP_CANFORK_COUNT]);
+extern void cgroup_cancel_fork(struct task_struct *p,
+ void *ss_priv[CGROUP_CANFORK_COUNT]);
+extern void cgroup_post_fork(struct task_struct *p,
+ void *old_ss_priv[CGROUP_CANFORK_COUNT]);
void cgroup_exit(struct task_struct *p);
+
int cgroup_init_early(void);
int cgroup_init(void);
@@ -524,7 +530,13 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
struct dentry *dentry) { return -EINVAL; }
static inline void cgroup_fork(struct task_struct *p) {}
-static inline void cgroup_post_fork(struct task_struct *p) {}
+static inline int cgroup_can_fork(struct task_struct *p,
+ void *ss_priv[CGROUP_CANFORK_COUNT])
+{ return 0; }
+static inline void cgroup_cancel_fork(struct task_struct *p,
+ void *ss_priv[CGROUP_CANFORK_COUNT]) {}
+static inline void cgroup_post_fork(struct task_struct *p,
+ void *ss_priv[CGROUP_CANFORK_COUNT]) {}
static inline void cgroup_exit(struct task_struct *p) {}
static inline int cgroup_init_early(void) { return 0; }
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index e4a96fb..ec43bce 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -3,6 +3,17 @@
*
* DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
*/
+
+/*
+ * This file *must* be included with SUBSYS() defined.
+ * SUBSYS_TAG() is a noop if undefined.
+ */
+
+#ifndef SUBSYS_TAG
+#define __TMP_SUBSYS_TAG
+#define SUBSYS_TAG(_x)
+#endif
+
#if IS_ENABLED(CONFIG_CPUSETS)
SUBSYS(cpuset)
#endif
@@ -48,11 +59,23 @@ SUBSYS(hugetlb)
#endif
/*
+ * Subsystems that implement the can_fork() family of callbacks.
+ */
+SUBSYS_TAG(CANFORK_START)
+SUBSYS_TAG(CANFORK_END)
+
+/*
* The following subsystems are not supported on the default hierarchy.
*/
#if IS_ENABLED(CONFIG_CGROUP_DEBUG)
SUBSYS(debug)
#endif
+
+#ifdef __TMP_SUBSYS_TAG
+#undef __TMP_SUBSYS_TAG
+#undef SUBSYS_TAG
+#endif
+
/*
* DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
*/
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c5400d20..a24a335 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -186,6 +186,9 @@ static u64 css_serial_nr_next = 1;
static unsigned long have_fork_callback __read_mostly;
static unsigned long have_exit_callback __read_mostly;
+/* Ditto for the can_fork callback. */
+static unsigned long have_canfork_callback __read_mostly;
+
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
@@ -4924,6 +4927,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
have_fork_callback |= (bool)ss->fork << ss->id;
have_exit_callback |= (bool)ss->exit << ss->id;
+ have_canfork_callback |= (bool)ss->can_fork << ss->id;
/* At system boot, before all subsystems have been
* registered, no tasks have been forked, so we don't
@@ -5166,6 +5170,21 @@ static const struct file_operations proc_cgroupstats_operations = {
.release = single_release,
};
+static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
+{
+ if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
+ return &ss_priv[i - CGROUP_CANFORK_START];
+ return NULL;
+}
+
+static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
+{
+ void **private;
+ if ((private = subsys_canfork_priv_p(ss_priv, i)) != NULL)
+ return *private;
+ return NULL;
+}
+
/**
* cgroup_fork - initialize cgroup related fields during copy_process()
* @child: pointer to task_struct of forking parent process.
@@ -5181,6 +5200,57 @@ void cgroup_fork(struct task_struct *child)
}
/**
+ * cgroup_can_fork - called on a new task before the process is exposed
+ * @child: the task in question.
+ *
+ * This calls the subsystem can_fork() callbacks. If the can_fork() callback
+ * returns an error, the fork aborts with that error code. This allows for
+ * a cgroup subsystem to conditionally allow or deny new forks.
+ */
+int cgroup_can_fork(struct task_struct *child,
+ void *ss_priv[CGROUP_CANFORK_COUNT])
+{
+ struct cgroup_subsys *ss;
+ int i, j, ret;
+
+ for_each_subsys_which(ss, i, &have_canfork_callback) {
+ ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i));
+ if (ret)
+ goto out_revert;
+ }
+
+ return 0;
+
+out_revert:
+ for_each_subsys(ss, j) {
+ if (j >= i)
+ break;
+ if (ss->cancel_fork)
+ ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j));
+ }
+
+ return ret;
+}
+
+/**
+ * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
+ * @child: the task in question
+ *
+ * This calls the cancel_fork() callbacks if a fork failed *after*
+ * cgroup_can_fork() succeeded.
+ */
+void cgroup_cancel_fork(struct task_struct *child,
+ void *ss_priv[CGROUP_CANFORK_COUNT])
+{
+ struct cgroup_subsys *ss;
+ int i;
+
+ for_each_subsys(ss, i)
+ if(ss->cancel_fork)
+ ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i));
+}
+
+/**
* cgroup_post_fork - called on a new task after adding it to the task list
* @child: the task in question
*
@@ -5190,7 +5260,8 @@ void cgroup_fork(struct task_struct *child)
* cgroup_task_iter_start() - to guarantee that the new task ends up on its
* list.
*/
-void cgroup_post_fork(struct task_struct *child)
+void cgroup_post_fork(struct task_struct *child,
+ void *old_ss_priv[CGROUP_CANFORK_COUNT])
{
struct cgroup_subsys *ss;
int i;
@@ -5235,7 +5306,7 @@ void cgroup_post_fork(struct task_struct *child)
* and addition to css_set.
*/
for_each_subsys_which(ss, i, &have_fork_callback)
- ss->fork(child);
+ ss->fork(child, subsys_canfork_priv(old_ss_priv, i));
}
/**
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 92b98cc..f1b30ad 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -203,7 +203,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
* to do anything as freezer_attach() will put @task into the appropriate
* state.
*/
-static void freezer_fork(struct task_struct *task)
+static void freezer_fork(struct task_struct *task, void *private)
{
struct freezer *freezer;
diff --git a/kernel/fork.c b/kernel/fork.c
index 9531275..de7ff51 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1241,6 +1241,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
{
int retval;
struct task_struct *p;
+ void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -1512,6 +1513,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->task_works = NULL;
/*
+ * Ensure that the cgroup subsystem policies allow the new process to be
+ * forked. It should be noted that the new process's css_set can be changed
+ * between here and cgroup_post_fork() if an organisation operation is in
+ * progress.
+ */
+ retval = cgroup_can_fork(p, cgrp_ss_priv);
+ if (retval)
+ goto bad_fork_free_pid;
+
+ /*
* Make it visible to the rest of the system, but dont wake it up yet.
* Need tasklist lock for parent etc handling!
*/
@@ -1547,7 +1558,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
retval = -ERESTARTNOINTR;
- goto bad_fork_free_pid;
+ goto bad_fork_cancel_cgroup;
}
if (likely(p->pid)) {
@@ -1589,7 +1600,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
- cgroup_post_fork(p);
+ cgroup_post_fork(p, cgrp_ss_priv);
if (clone_flags & CLONE_THREAD)
threadgroup_change_end(current);
perf_event_fork(p);
@@ -1599,6 +1610,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
return p;
+bad_fork_cancel_cgroup:
+ cgroup_cancel_fork(p, cgrp_ss_priv);
bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f9123a8..050936e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8007,7 +8007,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
sched_offline_group(tg);
}
-static void cpu_cgroup_fork(struct task_struct *task)
+static void cpu_cgroup_fork(struct task_struct *task, void *private)
{
sched_move_task(task);
}
--
2.4.2
Adds a new single-purpose PIDs subsystem to limit the number of
tasks that can be forked inside a cgroup. Essentially this is an
implementation of RLIMIT_NPROC that applies to a cgroup rather than a
process tree.
However, it should be noted that organisational operations (adding and
removing tasks from a PIDs hierarchy) will *not* be prevented. Rather,
the number of tasks in the hierarchy cannot exceed the limit through
forking. This is due to the fact that, in the unified hierarchy, attach
cannot fail (and it is not possible for a task to overcome its PIDs
cgroup policy limit by attaching to a child cgroup -- even if migrating
mid-fork it must be able to fork in the parent first).
PIDs are fundamentally a global resource, and it is possible to reach
PID exhaustion inside a cgroup without hitting any reasonable kmemcg
policy. Once you've hit PID exhaustion, you're only in a marginally
better state than OOM. This subsystem allows PID exhaustion inside a
cgroup to be prevented.
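For completeness, a sketch of the intended userspace (legacy hierarchy)
usage; the mount point and cgroup name below are arbitrary, and only the
pids.max/pids.current files and the "max" keyword come from this patch:

	# mount -t cgroup -o pids none /sys/fs/cgroup/pids
	# mkdir /sys/fs/cgroup/pids/sandbox
	# echo 5 > /sys/fs/cgroup/pids/sandbox/pids.max
	# echo $$ > /sys/fs/cgroup/pids/sandbox/tasks
	# cat /sys/fs/cgroup/pids/sandbox/pids.current
	# echo max > /sys/fs/cgroup/pids/sandbox/pids.max

Once pids.current would exceed pids.max anywhere in the hierarchy, fork() in
that cgroup fails with EAGAIN; attaching tasks is still allowed even if it
pushes pids.current above pids.max.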
Signed-off-by: Aleksa Sarai <[email protected]>
---
CREDITS | 5 +
include/linux/cgroup_subsys.h | 5 +
init/Kconfig | 16 ++
kernel/Makefile | 1 +
kernel/cgroup_pids.c | 377 ++++++++++++++++++++++++++++++++++++++++++
5 files changed, 404 insertions(+)
create mode 100644 kernel/cgroup_pids.c
diff --git a/CREDITS b/CREDITS
index 40cc4bf..0727426 100644
--- a/CREDITS
+++ b/CREDITS
@@ -3215,6 +3215,11 @@ S: 69 rue Dunois
S: 75013 Paris
S: France
+N: Aleksa Sarai
+E: [email protected]
+W: https://www.cyphar.com/
+D: `pids` cgroup subsystem
+
N: Dipankar Sarma
E: [email protected]
D: RCU
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index ec43bce..1f36945 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -62,6 +62,11 @@ SUBSYS(hugetlb)
* Subsystems that implement the can_fork() family of callbacks.
*/
SUBSYS_TAG(CANFORK_START)
+
+#if IS_ENABLED(CONFIG_CGROUP_PIDS)
+SUBSYS(pids)
+#endif
+
SUBSYS_TAG(CANFORK_END)
/*
diff --git a/init/Kconfig b/init/Kconfig
index b9b824b..f4e4918 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -968,6 +968,22 @@ config CGROUP_FREEZER
Provides a way to freeze and unfreeze all tasks in a
cgroup.
+config CGROUP_PIDS
+ bool "PIDs cgroup subsystem"
+ help
+ Provides enforcement of process number limits in the scope of a
+ cgroup. Any attempt to fork more processes than is allowed in the
+ cgroup will fail. PIDs are fundamentally a global resource because it
+ is fairly trivial to reach PID exhaustion before you reach even a
+ conservative kmemcg limit. As a result, it is possible to grind a
+ system to a halt without being limited by other cgroup policies. The
+ PIDs cgroup subsystem is designed to stop this from happening.
+
+ It should be noted that organisational operations (such as attaching
+ to a cgroup hierarchy) will *not* be blocked by the PIDs subsystem,
+ since the PIDs limit only affects a process's ability to fork, not to
+ attach to a cgroup.
+
config CGROUP_DEVICE
bool "Device controller for cgroups"
help
diff --git a/kernel/Makefile b/kernel/Makefile
index 0f8f8b0..df5406c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
+obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
new file mode 100644
index 0000000..7965af2
--- /dev/null
+++ b/kernel/cgroup_pids.c
@@ -0,0 +1,377 @@
+/*
+ * Process number limiting controller for cgroups.
+ *
+ * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
+ * after a certain limit is reached.
+ *
+ * Since it is trivial to hit the task limit without hitting any kmemcg limits
+ * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
+ * preventable in the scope of a cgroup hierarchy by allowing resource limiting
+ * of the number of tasks in a cgroup.
+ *
+ * In order to use the `pids` controller, set the maximum number of tasks in
+ * pids.max (this is not available in the root cgroup for obvious reasons). The
+ * number of processes currently in the cgroup is given by pids.current.
+ * Organisational operations are not blocked by cgroup policies, so it is
+ * possible to have pids.current > pids.max. However, it is not possible to
+ * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
+ * would cause a cgroup policy to be violated.
+ *
+ * To set a cgroup to have no limit, set pids.max to "max". This is the default
+ * for all new cgroups (N.B. that PID limits are hierarchical, so the most
+ * stringent limit in the hierarchy is followed).
+ *
+ * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
+ * a superset of parent/child/pids.current.
+ *
+ * Copyright (C) 2015 Aleksa Sarai <[email protected]>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/threads.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+
+#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
+#define PIDS_MAX_STR "max"
+
+struct pids_cgroup {
+ struct cgroup_subsys_state css;
+
+ /*
+ * Use 64-bit types so that we can safely represent "max" as
+ * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
+ */
+ atomic64_t counter;
+ int64_t limit;
+};
+
+static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
+{
+ return container_of(css, struct pids_cgroup, css);
+}
+
+static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
+{
+ return css_pids(pids->css.parent);
+}
+
+static struct cgroup_subsys_state *
+pids_css_alloc(struct cgroup_subsys_state *parent)
+{
+ struct pids_cgroup *pids;
+
+ pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
+ if (!pids)
+ return ERR_PTR(-ENOMEM);
+
+ pids->limit = PIDS_MAX;
+ atomic64_set(&pids->counter, 0);
+ return &pids->css;
+}
+
+static void pids_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_pids(css));
+}
+
+/*
+ * Lockless hierarchical accounting (with enforceable limits) derived from
+ * mm/page_counter.c. Original copyright notice from page_counter code:
+ *
+ * -------------------------------------------------
+ * Lockless hierarchical page accounting & limiting
+ * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
+ * -------------------------------------------------
+ */
+
+/**
+ * pids_cancel - uncharge the local pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to cancel
+ *
+ * This function will WARN if the pid count goes under 0, because such a case is
+ * a bug in the pids controller proper.
+ */
+static void pids_cancel(struct pids_cgroup *pids, int num)
+{
+ /*
+ * A negative count (or overflow for that matter) is invalid,
+ * and indicates a bug in the `pids` controller proper.
+ */
+ WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
+}
+
+/**
+ * pids_uncharge - hierarchically uncharge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to uncharge
+ */
+static void pids_uncharge(struct pids_cgroup *pids, int num)
+{
+ struct pids_cgroup *p;
+
+ for (p = pids; p; p = parent_pids(p))
+ pids_cancel(p, num);
+}
+
+/**
+ * pids_charge - hierarchically charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * This function does *not* follow the pid limit set. It cannot fail and the new
+ * pid count may exceed the limit. This is only used for reverting failed
+ * attaches, where there is no other way out than violating the limit.
+ */
+static void pids_charge(struct pids_cgroup *pids, int num)
+{
+ struct pids_cgroup *p;
+
+ for (p = pids; p; p = parent_pids(p))
+ atomic64_add(num, &p->counter);
+}
+
+/**
+ * pids_try_charge - hierarchically try to charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * This function follows the set limit. It will fail if the charge would cause
+ * the new value to exceed the hierarchical limit. Returns 0 if the charge
+ * succeeded, otherwise -EAGAIN.
+ */
+static int pids_try_charge(struct pids_cgroup *pids, int num)
+{
+ struct pids_cgroup *p, *q;
+
+ for (p = pids; p; p = parent_pids(p)) {
+ int64_t new = atomic64_add_return(num, &p->counter);
+
+ /*
+ * Since new is capped to the maximum number of pid_t, if
+ * p->limit is %PIDS_MAX then we know that this test will never
+ * fail.
+ */
+ if (new > p->limit)
+ goto revert;
+ }
+
+ return 0;
+
+revert:
+ for (q = pids; q != p; q = parent_pids(q))
+ pids_cancel(q, num);
+ pids_cancel(p, num);
+
+ return -EAGAIN;
+}
+
+static int pids_can_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct pids_cgroup *pids = css_pids(css);
+ struct task_struct *task, *fail_task;
+ int err;
+
+ cgroup_taskset_for_each(task, tset) {
+ struct cgroup_subsys_state *old_css;
+ struct pids_cgroup *old_pids;
+
+ /*
+ * Grab a ref to each task's css. We don't drop the ref until
+ * we either fail and hit ->cancel_attach() or succeed and hit
+ * ->attach().
+ */
+ old_css = task_get_css(task, pids_cgrp_id);
+ old_pids = css_pids(old_css);
+
+ pids_charge(pids, 1);
+ pids_uncharge(old_pids, 1);
+ }
+
+ return 0;
+}
+
+static void pids_cancel_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct pids_cgroup *pids = css_pids(css);
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, tset) {
+ struct cgroup_subsys_state *old_css;
+ struct pids_cgroup *old_pids;
+
+ old_css = task_css(task, pids_cgrp_id);
+ old_pids = css_pids(old_css);
+
+ pids_charge(old_pids, 1);
+ pids_uncharge(pids, 1);
+ css_put(old_css);
+ }
+}
+
+static void pids_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, tset)
+ css_put(task_css(task, pids_cgrp_id));
+}
+
+static int pids_can_fork(struct task_struct *task, void **priv_p)
+{
+ struct cgroup_subsys_state *css;
+ struct pids_cgroup *pids;
+ int err;
+
+ /*
+ * Use the "current" task_css for the pids subsystem as the tentative
+ * css. It is possible we will charge the wrong hierarchy, in which
+ * case we will forcefully revert/reapply the charge on the right
+ * hierarchy after it is committed to the task proper.
+ */
+ css = task_get_css(current, pids_cgrp_id);
+ pids = css_pids(css);
+
+ err = pids_try_charge(pids, 1);
+ if (err)
+ goto err_css_put;
+
+ *priv_p = css;
+ return 0;
+
+err_css_put:
+ css_put(css);
+ return err;
+}
+
+static void pids_cancel_fork(struct task_struct *task, void *priv)
+{
+ struct cgroup_subsys_state *css = priv;
+ struct pids_cgroup *pids = css_pids(css);
+
+ pids_uncharge(pids, 1);
+ css_put(css);
+}
+
+static void pids_fork(struct task_struct *task, void *priv)
+{
+ struct cgroup_subsys_state *css;
+ struct cgroup_subsys_state *old_css = priv;
+ struct pids_cgroup *pids;
+ struct pids_cgroup *old_pids = css_pids(old_css);
+
+ css = task_get_css(task, pids_cgrp_id);
+ pids = css_pids(css);
+
+ /*
+ * If the association has changed, we have to revert and reapply the
+ * charge/uncharge on the wrong hierarchy to the current one. Since
+ * the association can only change due to an organisation event, it's
+ * okay for us to ignore the limit in this case.
+ */
+ if (pids != old_pids) {
+ pids_uncharge(old_pids, 1);
+ pids_charge(pids, 1);
+ }
+
+ css_put(css);
+ css_put(old_css);
+}
+
+static void pids_exit(struct cgroup_subsys_state *css,
+ struct cgroup_subsys_state *old_css,
+ struct task_struct *task)
+{
+ struct pids_cgroup *pids = css_pids(old_css);
+
+ pids_uncharge(pids, 1);
+}
+
+static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct cgroup_subsys_state *css = of_css(of);
+ struct pids_cgroup *pids = css_pids(css);
+ int64_t limit;
+ int err;
+
+ buf = strstrip(buf);
+ if (!strcmp(buf, PIDS_MAX_STR)) {
+ limit = PIDS_MAX;
+ goto set_limit;
+ }
+
+ err = kstrtoll(buf, 0, &limit);
+ if (err)
+ return err;
+
+ if (limit < 0 || limit >= PIDS_MAX)
+ return -EINVAL;
+
+set_limit:
+ /*
+ * Limit updates don't need to be mutex'd, since it isn't
+ * critical that any racing fork()s follow the new limit.
+ */
+ pids->limit = limit;
+ return nbytes;
+}
+
+static int pids_max_show(struct seq_file *sf, void *v)
+{
+ struct cgroup_subsys_state *css = seq_css(sf);
+ struct pids_cgroup *pids = css_pids(css);
+ int64_t limit = pids->limit;
+
+ if (limit >= PIDS_MAX)
+ seq_printf(sf, "%s\n", PIDS_MAX_STR);
+ else
+ seq_printf(sf, "%lld\n", limit);
+
+ return 0;
+}
+
+static s64 pids_current_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct pids_cgroup *pids = css_pids(css);
+
+ return atomic64_read(&pids->counter);
+}
+
+static struct cftype files[] = {
+ {
+ .name = "max",
+ .write = pids_max_write,
+ .seq_show = pids_max_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .read_s64 = pids_current_read,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys pids_cgrp_subsys = {
+ .css_alloc = pids_css_alloc,
+ .css_free = pids_css_free,
+ .attach = pids_attach,
+ .can_attach = pids_can_attach,
+ .cancel_attach = pids_cancel_attach,
+ .can_fork = pids_can_fork,
+ .cancel_fork = pids_cancel_fork,
+ .fork = pids_fork,
+ .exit = pids_exit,
+ .legacy_cftypes = files,
+ .early_init = 0,
+};
--
2.4.2
On Sat, Jun 06, 2015 at 10:02:15AM +1000, Aleksa Sarai wrote:
> Replace the explicit checking against ss_masks inside a for_each_subsys
> block with for_each_subsys_which(..., ss_mask), to take advantage of the
> more readable (and more efficient) macro.
>
> Signed-off-by: Aleksa Sarai <[email protected]>
Applied 1-2 to cgroup/for-4.2.
Thanks.
--
tejun
On Sat, Jun 06, 2015 at 10:02:16AM +1000, Aleksa Sarai wrote:
> From: Tejun Heo <[email protected]>
>
> Implement task_get_css() which finds and pins the css for the specified
> task and subsys. As a task is always associated with an online css
> for every subsystem except while the css_set update is propagating,
> task_get_css() retries till css_tryget_online() succeeds.
>
> Signed-off-by: Tejun Heo <[email protected]>
> Cc: Li Zefan <[email protected]>
Another form of this patch was already in block/for-4.2/writeback. I
cherry picked that commit into cgroup/for-4.2 to avoid pulling the
whole thing into cgroup tree.
Thanks.
--
tejun
Hello, Aleksa.
Looks pretty good to me in general. Some minor comments below.
On Sat, Jun 06, 2015 at 10:02:17AM +1000, Aleksa Sarai wrote:
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index a593e29..17d0046 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -62,9 +62,15 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
> struct pid *pid, struct task_struct *tsk);
> void cgroup_fork(struct task_struct *p);
> -void cgroup_post_fork(struct task_struct *p);
> +extern int cgroup_can_fork(struct task_struct *p,
> + void *ss_priv[CGROUP_CANFORK_COUNT]);
> +extern void cgroup_cancel_fork(struct task_struct *p,
> + void *ss_priv[CGROUP_CANFORK_COUNT]);
> +extern void cgroup_post_fork(struct task_struct *p,
> + void *old_ss_priv[CGROUP_CANFORK_COUNT]);
> void cgroup_exit(struct task_struct *p);
>
> +
Is this blank line intentional?
> int cgroup_init_early(void);
> int cgroup_init(void);
...
> @@ -4924,6 +4927,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
>
> have_fork_callback |= (bool)ss->fork << ss->id;
> have_exit_callback |= (bool)ss->exit << ss->id;
> + have_canfork_callback |= (bool)ss->can_fork << ss->id;
Hmmm.... do we still need this mask? We're already restricting
iteration pretty heavily. I'd even suggest dropping both
have_fork_callback and have_exit_callback too and just putting them inside
CGROUP_FORK_EXIT_START / STOP, although that doesn't belong in this
patchset.
...
> +static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
> +{
> + void **private;
> + if ((private = subsys_canfork_priv_p(ss_priv, i)) != NULL)
> + return *private;
> + return NULL;
> +}
void **private = subsys_canfork...;
if (private)
return *private;
return NULL;
or even just
return private ? *private : NULL;
We conventionally don't put assignments in if conditionals.
> +void cgroup_cancel_fork(struct task_struct *child,
> + void *ss_priv[CGROUP_CANFORK_COUNT])
> +{
> + struct cgroup_subsys *ss;
> + int i;
> +
> + for_each_subsys(ss, i)
> + if(ss->cancel_fork)
^
space
> + ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i));
> +}
> +
> +/**
> * cgroup_post_fork - called on a new task after adding it to the task list
> * @child: the task in question
> *
Thanks.
--
tejun
Hello, Aleksa.
On Sat, Jun 06, 2015 at 10:02:18AM +1000, Aleksa Sarai wrote:
...
> +/*
> + * Lockless hierarchical accounting (with enforceable limits) derived from
> + * mm/page_counter.c. Original copyright notice from page_counter code:
> + *
> + * -------------------------------------------------
> + * Lockless hierarchical page accounting & limiting
> + * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
> + * -------------------------------------------------
> + */
I don't think we need the above copyright notice.
...
> +struct cgroup_subsys pids_cgrp_subsys = {
> + .css_alloc = pids_css_alloc,
> + .css_free = pids_css_free,
> + .attach = pids_attach,
> + .can_attach = pids_can_attach,
> + .cancel_attach = pids_cancel_attach,
> + .can_fork = pids_can_fork,
> + .cancel_fork = pids_cancel_fork,
> + .fork = pids_fork,
> + .exit = pids_exit,
> + .legacy_cftypes = files,
Please also init dfl_cftypes the same way. Also, can you please
rename files to pids_files?
> + .early_init = 0,
And the above can be dropped.
Other than the above minor points, everything looks good to me but I
think we're a bit too close to the merge window to put this in now.
Once you update patches 4 and 5 according to the review, I'll apply them
after the next rc1 drops.
Thanks.
--
tejun
>> @@ -4924,6 +4927,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
>>
>> have_fork_callback |= (bool)ss->fork << ss->id;
>> have_exit_callback |= (bool)ss->exit << ss->id;
>> + have_canfork_callback |= (bool)ss->can_fork << ss->id;
>
> Hmmm.... do we still need this mask? We're already restricting
> iteration pretty heavily.
CGROUP_CANFORK_{START,END,COUNT} aren't used to restrict the
iteration. They're used for restricting the size of the @ss_priv
array. If you want, I can use CANFORK_{START,END} to restrict the
iteration -- I just prefer using the for_each_subsys_which API for
iterating over active cgroups. :/
--
Aleksa Sarai (cyphar)
http://www.cyphar.com
Hello,
On Tue, Jun 9, 2015 at 4:20 PM, Aleksa Sarai <[email protected]> wrote:
> CGROUP_CANFORK_{START,END,COUNT} aren't used to restrict the
> iteration. They're used for restricting the size of the @ss_priv
> array. If you want, I can use CANFORK_{START,END} to restrict the
> iteration -- I just prefer using the for_each_subsys_which API for
> iterating over active cgroups. :/
Ah, I see. Hmm... yeah, let's keep the code for now, but I think it'd
be cheaper / cleaner to use those macro tags to restrict iteration and
drop all the masks in the future.
Thanks.
--
tejun