2009-01-21 11:11:16

by Nikanth Karthikesan

Subject: [RFC] [PATCH] Cgroup based OOM killer controller

As Alan Cox suggested/wondered in this thread,
http://lkml.org/lkml/2009/1/12/235 , this is a container group based approach
to override the oom killer selection without losing all the benefits of the
current oom killer heuristics and oom_adj interface.

It adds a tunable oom.victim to the oom cgroup. The oom killer will kill the
process using the usual badness value but only within the cgroup with the
maximum value for oom.victim before killing any process from a cgroup with a
lesser oom.victim number. Oom killing could be disabled by setting
oom.victim=0.
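
In effect (a condensed sketch of the selection rule the patch below
implements, not a literal excerpt), the chosen task is the one whose
(oom.victim, badness) pair is the largest, compared lexicographically:

/*
 * Sketch only: a task whose cgroup has oom.victim == 0 is never chosen;
 * otherwise the larger oom.victim wins, and badness() breaks ties among
 * tasks at the same oom.victim level.
 */
static int prefer_a_as_victim(u64 victim_a, unsigned long points_a,
                              u64 victim_b, unsigned long points_b)
{
        if (victim_a == 0)
                return 0;       /* oom killing disabled for a's cgroup */
        if (victim_a != victim_b)
                return victim_a > victim_b;
        return points_a > points_b;     /* same level: fall back to badness */
}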

Signed-off-by: Nikanth Karthikesan <[email protected]>

---

diff --git a/Documentation/cgroups/oom.txt b/Documentation/cgroups/oom.txt
new file mode 100644
index 0000000..9102830
--- /dev/null
+++ b/Documentation/cgroups/oom.txt
@@ -0,0 +1,29 @@
+OOM Killer controller
+--- ------ ----------
+
+The OOM killer selects the process to kill based on a set of heuristics
+chosen so that a minimum amount of work is lost, a large amount of memory is
+recovered, and a minimum number of processes are killed.
+
+The user can adjust the score used to select the processes to be killed using
+/proc/<pid>/oom_adj. Giving it a high score will increase the likelihood of
+this process being killed by the oom-killer. Valid values are in the range
+-16 to +15, plus the special value -17, which disables oom-killing altogether
+for that process.
+
+But it is very difficult to suggest an order among tasks to be killed during
+an Out Of Memory situation. The OOM Killer controller aids in doing that.
+
+USAGE
+-----
+
+Mount the oom controller by passing 'oom' when mounting cgroups. Echo
+a value into the oom.victim file to change the order. The oom killer will
+kill all the processes in a cgroup with a higher oom.victim value before
+killing a process in a cgroup with a lower oom.victim value. Among tasks with
+the same oom.victim value, the usual badness heuristics are applied. The
+/proc/<pid>/oom_adj interface still adjusts the oom killer score. Setting
+oom.victim = 0 disables oom killing for the tasks in that cgroup.
+
+Note: If this is used without proper consideration, innocent processes may
+be killed unnecessarily.
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 9c8d31b..6944f99 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -59,4 +59,8 @@ SUBSYS(freezer)
SUBSYS(net_cls)
#endif

+#ifdef CONFIG_CGROUP_OOM
+SUBSYS(oom)
+#endif
+
/* */
diff --git a/include/linux/oomcontrol.h b/include/linux/oomcontrol.h
new file mode 100644
index 0000000..18146ab
--- /dev/null
+++ b/include/linux/oomcontrol.h
@@ -0,0 +1,14 @@
+#ifndef _LINUX_OOMCONTROL_H
+#define _LINUX_OOMCONTROL_H
+
+#ifdef CONFIG_CGROUP_OOM
+struct oom_cgroup {
+ struct cgroup_subsys_state css;
+ /*
+ * the order to be victimized for this group
+ */
+ u64 victim;
+};
+
+#endif
+#endif
diff --git a/init/Kconfig b/init/Kconfig
index 2af8382..99ed0de 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -354,6 +354,15 @@ config CGROUP_DEBUG

Say N if unsure.

+config CGROUP_OOM
+ bool "Oom cgroup subsystem"
+ depends on CGROUPS
+ help
+ This provides a cgroup subsystem which aids in controlling
+ the order in which tasks would be killed during
+ out of memory situations.
+
+
config CGROUP_NS
bool "Namespace cgroup subsystem"
depends on CGROUPS
diff --git a/mm/Makefile b/mm/Makefile
index 72255be..a5d7222 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,3 +33,4 @@ obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_CGROUP_OOM) += oomcontrol.o
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 40ba050..f697d50 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -26,6 +26,7 @@
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/memcontrol.h>
+#include <linux/oomcontrol.h>
#include <linux/security.h>

int sysctl_panic_on_oom;
@@ -205,6 +206,9 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
struct timespec uptime;
+#ifdef CONFIG_CGROUP_OOM
+ u64 chosenvictim = 1, taskvictim;
+#endif
*ppoints = 0;

do_posix_clock_monotonic_gettime(&uptime);
@@ -257,10 +261,26 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
continue;

points = badness(p, uptime.tv_sec);
+#ifdef CONFIG_CGROUP_OOM
+ taskvictim = (container_of(p->cgroups->subsys[oom_subsys_id],
+ struct oom_cgroup, css))->victim;
+
+ if (taskvictim > chosenvictim ||
+ (taskvictim == chosenvictim && points > *ppoints) ||
+ !chosen) {
+
+ chosen = p;
+ *ppoints = points;
+ chosenvictim = taskvictim;
+
+ }
+
+#else
if (points > *ppoints || !chosen) {
chosen = p;
*ppoints = points;
}
+#endif
} while_each_thread(g, p);

return chosen;
diff --git a/mm/oomcontrol.c b/mm/oomcontrol.c
new file mode 100644
index 0000000..2bfa325
--- /dev/null
+++ b/mm/oomcontrol.c
@@ -0,0 +1,95 @@
+/*
+ * kernel/cgroup_oom.c - oom handler cgroup.
+ */
+
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/oomcontrol.h>
+
+/*
+ * Helper to retrieve oom controller data from cgroup
+ */
+static struct oom_cgroup *oom_css_from_cgroup(struct cgroup *cgrp)
+{
+ return container_of(cgroup_subsys_state(cgrp,
+ oom_subsys_id), struct oom_cgroup,
+ css);
+}
+
+
+static struct cgroup_subsys_state *oom_create(struct cgroup_subsys *ss,
+ struct cgroup *cont)
+{
+ struct oom_cgroup *oom_css = kzalloc(sizeof(*oom_css), GFP_KERNEL);
+ struct oom_cgroup *parent;
+
+ if (!oom_css)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * if root last/only group to be victimized
+ * else inherit parents value
+ */
+ if (cont->parent == NULL)
+ oom_css->victim = 1;
+ else {
+ parent = oom_css_from_cgroup(cont->parent);
+ oom_css->victim = parent->victim;
+ }
+
+ return &oom_css->css;
+}
+
+static void oom_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ kfree(cont->subsys[oom_subsys_id]);
+}
+
+static int oom_victim_write(struct cgroup *cgrp, struct cftype *cft,
+ u64 val)
+{
+
+ cgroup_lock();
+
+ (oom_css_from_cgroup(cgrp))->victim = val;
+
+ cgroup_unlock();
+
+ return 0;
+}
+
+static u64 oom_victim_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ u64 victim = (oom_css_from_cgroup(cgrp))->victim;
+
+ return victim;
+}
+
+
+static struct cftype oom_cgroup_files[] = {
+ {
+ .name = "victim",
+ .read_u64 = oom_victim_read,
+ .write_u64 = oom_victim_write,
+ },
+};
+
+static int oom_populate(struct cgroup_subsys *ss,
+ struct cgroup *cont)
+{
+ int ret;
+
+ ret = cgroup_add_files(cont, ss, oom_cgroup_files,
+ ARRAY_SIZE(oom_cgroup_files));
+
+ return ret;
+}
+
+struct cgroup_subsys oom_subsys = {
+ .name = "oom",
+ .subsys_id = oom_subsys_id,
+ .create = oom_create,
+ .destroy = oom_destroy,
+ .populate = oom_populate,
+};


2009-01-21 13:17:50

by Evgeniy Polyakov

Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Wed, Jan 21, 2009 at 04:38:21PM +0530, Nikanth Karthikesan ([email protected]) wrote:
> As Alan Cox suggested/wondered in this thread,
> http://lkml.org/lkml/2009/1/12/235 , this is a container group based approach
> to override the oom killer selection without losing all the benefits of the
> current oom killer heuristics and oom_adj interface.
>
> It adds a tunable oom.victim to the oom cgroup. The oom killer will kill the
> process using the usual badness value but only within the cgroup with the
> maximum value for oom.victim before killing any process from a cgroup with a
> lesser oom.victim number. Oom killing could be disabled by setting
> oom.victim=0.

Looks good, except that in some conditions (everyone has zero for
example) victim=0 does not disable oom-killer because of !chosen
part of the condition.

--
Evgeniy Polyakov

2009-01-21 15:27:20

by Nikanth Karthikesan

Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Wednesday 21 January 2009 18:47:39 Evgeniy Polyakov wrote:
> On Wed, Jan 21, 2009 at 04:38:21PM +0530, Nikanth Karthikesan ([email protected]) wrote:
> > As Alan Cox suggested/wondered in this thread,
> > http://lkml.org/lkml/2009/1/12/235 , this is a container group based
> > approach to override the oom killer selection without losing all the
> > benefits of the current oom killer heuristics and oom_adj interface.
> >
> > It adds a tunable oom.victim to the oom cgroup. The oom killer will kill
> > the process using the usual badness value but only within the cgroup with
> > the maximum value for oom.victim before killing any process from a cgroup
> > with a lesser oom.victim number. Oom killing could be disabled by setting
> > oom.victim=0.
>
> Looks good, except that in some conditions (everyone has zero for
> example) victim=0 does not disable oom-killer because of !chosen
> part of the condition.

Thanks for reviewing. Yes, I missed that condition. Adding a check for
victim!=0 should fix it. Here is a revised patch with that fixed.

Thanks
Nikanth Karthikesan

From: Nikanth Karthikesan <[email protected]>

Cgroup based OOM killer controller

Signed-off-by: Nikanth Karthikesan <[email protected]>

---

This is a container group based approach to override the oom killer selection
without losing all the benefits of the current oom killer heuristics and
oom_adj interface.

It adds a tunable oom.victim to the oom cgroup. The oom killer will kill the
process using the usual badness value but only within the cgroup with the
maximum value for oom.victim before killing any process from a cgroup with a
lesser oom.victim number. Oom killing could be disabled by setting
oom.victim=0.

diff --git a/Documentation/cgroups/oom.txt b/Documentation/cgroups/oom.txt
new file mode 100644
index 0000000..9102830
--- /dev/null
+++ b/Documentation/cgroups/oom.txt
@@ -0,0 +1,29 @@
+OOM Killer controller
+--- ------ ----------
+
+The OOM killer selects the process to kill based on a set of heuristics
+chosen so that a minimum amount of work is lost, a large amount of memory is
+recovered, and a minimum number of processes are killed.
+
+The user can adjust the score used to select the processes to be killed using
+/proc/<pid>/oom_adj. Giving it a high score will increase the likelihood of
+this process being killed by the oom-killer. Valid values are in the range
+-16 to +15, plus the special value -17, which disables oom-killing altogether
+for that process.
+
+But it is very difficult to suggest an order among tasks to be killed during
+an Out Of Memory situation. The OOM Killer controller aids in doing that.
+
+USAGE
+-----
+
+Mount the oom controller by passing 'oom' when mounting cgroups. Echo
+a value into the oom.victim file to change the order. The oom killer will
+kill all the processes in a cgroup with a higher oom.victim value before
+killing a process in a cgroup with a lower oom.victim value. Among tasks with
+the same oom.victim value, the usual badness heuristics are applied. The
+/proc/<pid>/oom_adj interface still adjusts the oom killer score. Setting
+oom.victim = 0 disables oom killing for the tasks in that cgroup.
+
+Note: If this is used without proper consideration, innocent processes may
+be killed unnecessarily.
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 9c8d31b..6944f99 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -59,4 +59,8 @@ SUBSYS(freezer)
SUBSYS(net_cls)
#endif

+#ifdef CONFIG_CGROUP_OOM
+SUBSYS(oom)
+#endif
+
/* */
diff --git a/include/linux/oomcontrol.h b/include/linux/oomcontrol.h
new file mode 100644
index 0000000..18146ab
--- /dev/null
+++ b/include/linux/oomcontrol.h
@@ -0,0 +1,14 @@
+#ifndef _LINUX_OOMCONTROL_H
+#define _LINUX_OOMCONTROL_H
+
+#ifdef CONFIG_CGROUP_OOM
+struct oom_cgroup {
+ struct cgroup_subsys_state css;
+ /*
+ * the order to be victimized for this group
+ */
+ u64 victim;
+};
+
+#endif
+#endif
diff --git a/init/Kconfig b/init/Kconfig
index 2af8382..99ed0de 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -354,6 +354,15 @@ config CGROUP_DEBUG

Say N if unsure.

+config CGROUP_OOM
+ bool "Oom cgroup subsystem"
+ depends on CGROUPS
+ help
+ This provides a cgroup subsystem which aids in controlling
+ the order in which tasks would be killed during
+ out of memory situations.
+
+
config CGROUP_NS
bool "Namespace cgroup subsystem"
depends on CGROUPS
diff --git a/mm/Makefile b/mm/Makefile
index 72255be..a5d7222 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,3 +33,4 @@ obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_CGROUP_OOM) += oomcontrol.o
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 40ba050..64fa38e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -26,6 +26,7 @@
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/memcontrol.h>
+#include <linux/oomcontrol.h>
#include <linux/security.h>

int sysctl_panic_on_oom;
@@ -205,6 +206,9 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
struct timespec uptime;
+#ifdef CONFIG_CGROUP_OOM
+ u64 chosenvictim = 1, taskvictim;
+#endif
*ppoints = 0;

do_posix_clock_monotonic_gettime(&uptime);
@@ -257,10 +261,26 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
continue;

points = badness(p, uptime.tv_sec);
+#ifdef CONFIG_CGROUP_OOM
+ taskvictim = (container_of(p->cgroups->subsys[oom_subsys_id],
+ struct oom_cgroup, css))->victim;
+
+ if (taskvictim > chosenvictim ||
+ (taskvictim == chosenvictim && points > *ppoints) ||
+ (taskvictim && !chosen)) {
+
+ chosen = p;
+ *ppoints = points;
+ chosenvictim = taskvictim;
+
+ }
+
+#else
if (points > *ppoints || !chosen) {
chosen = p;
*ppoints = points;
}
+#endif
} while_each_thread(g, p);

return chosen;
diff --git a/mm/oomcontrol.c b/mm/oomcontrol.c
new file mode 100644
index 0000000..2bfa325
--- /dev/null
+++ b/mm/oomcontrol.c
@@ -0,0 +1,95 @@
+/*
+ * kernel/cgroup_oom.c - oom handler cgroup.
+ */
+
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/oomcontrol.h>
+
+/*
+ * Helper to retrieve oom controller data from cgroup
+ */
+static struct oom_cgroup *oom_css_from_cgroup(struct cgroup *cgrp)
+{
+ return container_of(cgroup_subsys_state(cgrp,
+ oom_subsys_id), struct oom_cgroup,
+ css);
+}
+
+
+static struct cgroup_subsys_state *oom_create(struct cgroup_subsys *ss,
+ struct cgroup *cont)
+{
+ struct oom_cgroup *oom_css = kzalloc(sizeof(*oom_css), GFP_KERNEL);
+ struct oom_cgroup *parent;
+
+ if (!oom_css)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * if root last/only group to be victimized
+ * else inherit parents value
+ */
+ if (cont->parent == NULL)
+ oom_css->victim = 1;
+ else {
+ parent = oom_css_from_cgroup(cont->parent);
+ oom_css->victim = parent->victim;
+ }
+
+ return &oom_css->css;
+}
+
+static void oom_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ kfree(cont->subsys[oom_subsys_id]);
+}
+
+static int oom_victim_write(struct cgroup *cgrp, struct cftype *cft,
+ u64 val)
+{
+
+ cgroup_lock();
+
+ (oom_css_from_cgroup(cgrp))->victim = val;
+
+ cgroup_unlock();
+
+ return 0;
+}
+
+static u64 oom_victim_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ u64 victim = (oom_css_from_cgroup(cgrp))->victim;
+
+ return victim;
+}
+
+
+static struct cftype oom_cgroup_files[] = {
+ {
+ .name = "victim",
+ .read_u64 = oom_victim_read,
+ .write_u64 = oom_victim_write,
+ },
+};
+
+static int oom_populate(struct cgroup_subsys *ss,
+ struct cgroup *cont)
+{
+ int ret;
+
+ ret = cgroup_add_files(cont, ss, oom_cgroup_files,
+ ARRAY_SIZE(oom_cgroup_files));
+
+ return ret;
+}
+
+struct cgroup_subsys oom_subsys = {
+ .name = "oom",
+ .subsys_id = oom_subsys_id,
+ .create = oom_create,
+ .destroy = oom_destroy,
+ .populate = oom_populate,
+};

2009-01-21 20:51:34

by David Rientjes

Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Wed, 21 Jan 2009, Nikanth Karthikesan wrote:

> This is a container group based approach to override the oom killer selection
> without losing all the benefits of the current oom killer heuristics and
> oom_adj interface.
>
> It adds a tunable oom.victim to the oom cgroup. The oom killer will kill the
> process using the usual badness value but only within the cgroup with the
> maximum value for oom.victim before killing any process from a cgroup with a
> lesser oom.victim number. Oom killing could be disabled by setting
> oom.victim=0.
>

This doesn't help in memcg or cpuset constrained oom conditions, which
still go through select_bad_process().

If the oom.victim value is high for a specific cgroup and a memory
controller oom occurs in a disjoint cgroup, for example, it's possible to
needlessly kill tasks. Obviously that is up to the administrator to
configure, but may not be his or her desire for system-wide oom
conditions.

It may be preferred to kill tasks in a specific cgroup first when the
entire system is out of memory or kill tasks within a cgroup attached to a
memory controller when it is oom.

The same scenario applies for cpuset-constrained ooms. Since oom.victim
is given higher preference than all tasks' oom_adj values, it is possible
to needlessly kill tasks that do not lead to future memory freeing for the
nodes attached to that cpuset.

It also requires that you synchronize the oom.victim values amongst your
cgroups.

2009-01-22 02:54:41

by Kamezawa Hiroyuki

Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Wed, 21 Jan 2009 12:49:50 -0800 (PST)
David Rientjes <[email protected]> wrote:

> On Wed, 21 Jan 2009, Nikanth Karthikesan wrote:
>
> > This is a container group based approach to override the oom killer selection
> > without losing all the benefits of the current oom killer heuristics and
> > oom_adj interface.
> >
> > It adds a tunable oom.victim to the oom cgroup. The oom killer will kill the
> > process using the usual badness value but only within the cgroup with the
> > maximum value for oom.victim before killing any process from a cgroup with a
> > lesser oom.victim number. Oom killing could be disabled by setting
> > oom.victim=0.
> >
>
> This doesn't help in memcg or cpuset constrained oom conditions, which
> still go through select_bad_process().
>
> If the oom.victim value is high for a specific cgroup and a memory
> controller oom occurs in a disjoint cgroup, for example, it's possible to
> needlessly kill tasks. Obviously that is up to the administrator to
> configure, but may not be his or her desire for system-wide oom
> conditions.
>
Hmm...after this patch, select_bad_process's filter to select process will be

==
1. ->mm is NULL ? => don't select this
2. is init task ? => don't select this
3. is under specified memcg ? => don't select this
4. marked as MEMDIE ? => return -1.
5. PF_EXITING? => select this.
6. OOM_DISABLE ? => don't select this
points = badness(p, uptime.tv_sec);
7. adjust point & select logic depends on OOM cgroup
==

Not looks good ;)

> It may be preferred to kill tasks in a specific cgroup first when the
> entire system is out of memory or kill tasks within a cgroup attached to a
> memory controller when it is oom.
>

I agree here.

Above filter logic should be
==
current_victim_level++;
1. p is under oom cgroup of victim_level > current_victim_level => don't select this.
2. ->mm is NULL ? => don't select this
3. is init task ? => don't select this
4. is under specified memcg ? => don't select this
5. marked as MEMDIE ? => return -1.
6. PF_EXITING? => select this.
7. OOM_DISABLE ? => don't select this
points = badness(p, uptime.tv_sec)
==
But this will be too slow.

I think do_each_thread() in select_bad_process() should be replaced with
a routine like this, finally.
==
for_each_oom_cgroup_in_victim_value_order() {
for_each_threads_in_oom_cgroup(oom) {
select one bad thread.
}
if (selected_one_is_enough_bad ?)
return selected_one;
}
==
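
A rough C rendering of that shape, purely for illustration (none of the
iterators below exist; they are hypothetical stand-ins for the per-cgroup
walk being described):

/*
 * Hypothetical sketch: visit oom cgroups from the highest to the lowest
 * oom.victim value and stop at the first level that yields a victim,
 * instead of scanning every thread in the system on each OOM.
 */
static struct task_struct *select_bad_process_by_level(unsigned long *ppoints)
{
        struct oom_cgroup *grp;
        struct task_struct *p, *chosen;

        for_each_oom_cgroup_in_victim_order(grp) {      /* hypothetical */
                chosen = NULL;
                *ppoints = 0;
                for_each_task_in_oom_cgroup(p, grp) {   /* hypothetical */
                        unsigned long points = badness(p, get_seconds());

                        if (points > *ppoints || !chosen) {
                                chosen = p;
                                *ppoints = points;
                        }
                }
                if (chosen)     /* no need to scan lower oom.victim levels */
                        return chosen;
        }
        return NULL;
}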

And this can be a help for the "speed up OOM killer" problem.

Thanks,
-Kame

2009-01-22 03:30:00

by Kamezawa Hiroyuki

Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Wed, 21 Jan 2009 16:38:21 +0530
Nikanth Karthikesan <[email protected]> wrote:

> As Alan Cox suggested/wondered in this thread,
> http://lkml.org/lkml/2009/1/12/235 , this is a container group based approach
> to override the oom killer selection without losing all the benefits of the
> current oom killer heuristics and oom_adj interface.
>
> It adds a tunable oom.victim to the oom cgroup. The oom killer will kill the
> process using the usual badness value but only within the cgroup with the
> maximum value for oom.victim before killing any process from a cgroup with a
> lesser oom.victim number. Oom killing could be disabled by setting
> oom.victim=0.
>
> Signed-off-by: Nikanth Karthikesan <[email protected]>
>

Assume following
- the user can tell "which process should be killed first"

What is the difference between oom_adj and this cgroup to users ?
If oom_adj is hard to use, making it simpler is a good way, I think,
rather than adding a new complication.

It seems both of oom_adj and this cgroup will be hard-to-use functions
for usual system administrators. But no better idea than using memcg
and committing memory usage.

Thanks,
-Kame

2009-01-22 05:15:20

by Nikanth Karthikesan

Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Thursday 22 January 2009 02:19:50 David Rientjes wrote:
> On Wed, 21 Jan 2009, Nikanth Karthikesan wrote:
> > This is a container group based approach to override the oom killer
> > selection without losing all the benefits of the current oom killer
> > heuristics and oom_adj interface.
> >
> > It adds a tunable oom.victim to the oom cgroup. The oom killer will kill
> > the process using the usual badness value but only within the cgroup with
> > the maximum value for oom.victim before killing any process from a cgroup
> > with a lesser oom.victim number. Oom killing could be disabled by setting
> > oom.victim=0.
>
> This doesn't help in memcg or cpuset constrained oom conditions, which
> still go through select_bad_process().
>
> If the oom.victim value is high for a specific cgroup and a memory
> controller oom occurs in a disjoint cgroup, for example, it's possible to
> needlessly kill tasks. Obviously that is up to the administrator to
> configure, but may not be his or her desire for system-wide oom
> conditions.
>
> It may be preferred to kill tasks in a specific cgroup first when the
> entire system is out of memory or kill tasks within a cgroup attached to a
> memory controller when it is oom.
>
> The same scenario applies for cpuset-constrained ooms. Since oom.victim
> is given higher preference than all tasks' oom_adj values, it is possible
> to needlessly kill tasks that do not lead to future memory freeing for the
> nodes attached to that cpuset.
>
> It also requires that you synchronize the oom.victim values amongst your
> cgroups.

No, this is not specific to memcg or cpuset cases alone. The same needless
kills will take place even without memcg or cpuset when an administrator
specifies a light memory consumer to be killed before a heavy memory user. But
it is up to the administrator to use it wisely. We also provide a panic_on_oom
option that an administrator could use, not just to kill a few more tasks but
all tasks in the system ;)

Thanks
Nikanth

2009-01-22 05:15:40

by Nikanth Karthikesan

Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Thursday 22 January 2009 08:23:24 KAMEZAWA Hiroyuki wrote:
> On Wed, 21 Jan 2009 12:49:50 -0800 (PST)
>
> David Rientjes <[email protected]> wrote:
> > On Wed, 21 Jan 2009, Nikanth Karthikesan wrote:
> > > This is a container group based approach to override the oom killer
> > > selection without losing all the benefits of the current oom killer
> > > heuristics and oom_adj interface.
> > >
> > > It adds a tunable oom.victim to the oom cgroup. The oom killer will
> > > kill the process using the usual badness value but only within the
> > > cgroup with the maximum value for oom.victim before killing any process
> > > from a cgroup with a lesser oom.victim number. Oom killing could be
> > > disabled by setting oom.victim=0.
> >
> > This doesn't help in memcg or cpuset constrained oom conditions, which
> > still go through select_bad_process().
> >
> > If the oom.victim value is high for a specific cgroup and a memory
> > controller oom occurs in a disjoint cgroup, for example, it's possible to
> > needlessly kill tasks. Obviously that is up to the administrator to
> > configure, but may not be his or her desire for system-wide oom
> > conditions.
>
> Hmm...after this patch, select_bad_process's filter to select process will
> be
>
> ==
> 1. ->mm is NULL ? => don't select this
> 2. is init task ? => don't select this
> 3. is under specified memcg ? => don't select this
> 4. marked as MEMDIE ? => return -1.
> 5. PF_EXITING? => select this.
> 6. OOM_DISABLE ? => don't select this
> points = badness(p, uptime.tv_sec);
> 7. adjust point & select logic depends on OOM cgroup
> ==
>
> Not looks good ;)
>

Yes, we do throw away a lot of work needlessly. But this is how we already
do it, and it is not a regression. This could, however, be used to improve the
OOM killer's speed.

> > It may be preferred to kill tasks in a specific cgroup first when the
> > entire system is out of memory or kill tasks within a cgroup attached to
> > a memory controller when it is oom.
>
> I agree here.
>
> Above filter logic should be
> ==
> current_victim_level++;
> 1. p is under oom cgroup of victim_level > current_victim_level => don't
> select this. 2. ->mm is NULL ? => don't select this
> 3. is init task ? => don't select this
> 4. is under specified memcg ? => don't select this
> 5. marked as MEMDIE ? => return -1.
> 6. PF_EXITING? => select this.
> 7. OOM_DISABLE ? => don't select this
> points = badness(p, uptime.tv_sec)
> ==
> But this will be too slow.
>
> I think do_each_thread() in select_bad_process() should be replaced with
> a routine like this, finally.
> ==
> for_each_oom_cgroup_in_victim_value_order() {
> for_each_threads_in_oom_cgroup(oom) {
> select one bad thread.
> }
> if (selected_one_is_enough_bad ?)
> return selected_one;
> }
> ==
>

Yes.

> And this can be a help for "spped up OOM killer" problem.
>

Yes.

Thanks
Nikanth

2009-01-22 05:15:59

by Nikanth Karthikesan

Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Thursday 22 January 2009 08:58:43 KAMEZAWA Hiroyuki wrote:
> On Wed, 21 Jan 2009 16:38:21 +0530
>
> Nikanth Karthikesan <[email protected]> wrote:
> > As Alan Cox suggested/wondered in this thread,
> > http://lkml.org/lkml/2009/1/12/235 , this is a container group based
> > approach to override the oom killer selection without losing all the
> > benefits of the current oom killer heuristics and oom_adj interface.
> >
> > It adds a tunable oom.victim to the oom cgroup. The oom killer will kill
> > the process using the usual badness value but only within the cgroup with
> > the maximum value for oom.victim before killing any process from a cgroup
> > with a lesser oom.victim number. Oom killing could be disabled by setting
> > oom.victim=0.
> >
> > Signed-off-by: Nikanth Karthikesan <[email protected]>
>
> Assume following
> - the usar can tell "which process should be killed at first"
>
> What is the difference between oom_adj and this cgroup to users ?

It is next to impossible to specify the order among say 10 memory hogging
tasks using oom_adj. Using this oom-controller users can specify the exact
order.

> If oom_adj is hard to use, making it simpler is a good way, I think.
> rather than adding new complication.
>
> It seems both of oom_adj and this cgroup will be hard-to-use functions
> for usual system administrators. But no better idea than using memcg
> and committing memory usage.
>

To use oom_adj effectively one would have to continuously monitor the
oom_score of all the processes, which is a complex moving target, and keep
adjusting the oom_adj of many tasks, which still cannot guarantee the order.
This controller is deterministic and hence easier to use.
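
The underlying reason is that oom_adj only scales the badness score rather
than overriding it. Roughly (paraphrasing the badness() adjustment of that
era from memory, so treat this as an approximation, not an exact excerpt):

/*
 * Approximate effect of oom_adj on a task's heuristic score: the score
 * is shifted left or right by oom_adj, so the relative order of two
 * tasks still depends on their base scores, which move as their memory
 * usage changes.
 */
static unsigned long apply_oom_adj(unsigned long points, int oom_adj)
{
        if (oom_adj > 0) {
                if (!points)
                        points = 1;
                points <<= oom_adj;
        } else if (oom_adj < 0) {
                points >>= -oom_adj;
        }
        return points;
}

Because the base score keeps moving, no fixed set of oom_adj values can pin
down a strict kill order, which is exactly what oom.victim provides.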

Thanks
Nikanth

2009-01-22 05:28:41

by Kamezawa Hiroyuki

Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Thu, 22 Jan 2009 10:43:12 +0530
Nikanth Karthikesan <[email protected]> wrote:

> On Thursday 22 January 2009 08:58:43 KAMEZAWA Hiroyuki wrote:
> > On Wed, 21 Jan 2009 16:38:21 +0530
> >
> > Nikanth Karthikesan <[email protected]> wrote:
> > > As Alan Cox suggested/wondered in this thread,
> > > http://lkml.org/lkml/2009/1/12/235 , this is a container group based
> > > approach to override the oom killer selection without losing all the
> > > benefits of the current oom killer heuristics and oom_adj interface.
> > >
> > > It adds a tunable oom.victim to the oom cgroup. The oom killer will kill
> > > the process using the usual badness value but only within the cgroup with
> > > the maximum value for oom.victim before killing any process from a cgroup
> > > with a lesser oom.victim number. Oom killing could be disabled by setting
> > > oom.victim=0.
> > >
> > > Signed-off-by: Nikanth Karthikesan <[email protected]>
> >
> > Assume following
> > - the user can tell "which process should be killed first"
> >
> > What is the difference between oom_adj and this cgroup to users ?
>
> It is next to impossible to specify the order among say 10 memory hogging
> tasks using oom_adj. Using this oom-controller users can specify the exact
> order.
>
> > If oom_adj is hard to use, making it simpler is a good way, I think.
> > rather than adding new complication.
> >
> > It seems both of oom_adj and this cgroup will be hard-to-use functions
> > for usual system administrators. But no better idea than using memcg
> > and committing memory usage.
> >
>
> To use oom_adj effectively one should continuously monitor oom_score of all
> the processes, which is a complex moving target and keep on adjusting the
> oom_adj of many tasks which still cannot guarantee the order. This controller
> is deterministic and hence easier to use.
>

Okay, thank you for explanation :)
I think it's better to explain "why this is much easier to use rather
than oom_adj and what is the benefit to users." in your patch description
and to improve your documentation.

+But it is very difficult to suggest an order among tasks to be killed during
+an Out Of Memory situation. The OOM Killer controller aids in doing that.

As.
Difference from oom_adj:
This allows users to specify "strict order" of oom-kill's select-bad-process
operation. While oom_adj just works as a hint for the kernel, OOM Killer
Controller gives users full control.

In general, it's very hard to specify oom-kill order of several applications
only by oom_adj because it's just affects "badness" calculation.

As my English skill is poor, you'll be able to write better text ;)

Regards,
-Kame

2009-01-22 05:39:58

by Arve Hjønnevåg

Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Wed, Jan 21, 2009 at 9:13 PM, Nikanth Karthikesan <[email protected]> wrote:
> To use oom_adj effectively one should continuously monitor oom_score of all
> the processes, which is a complex moving target and keep on adjusting the
> oom_adj of many tasks which still cannot guarantee the order. This controller
> is deterministic and hence easier to use.
>

Why not add an option to make oom_adj ensure strict ordering instead?

--
Arve Hjønnevåg

2009-01-22 06:14:40

by Nikanth Karthikesan

Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Thursday 22 January 2009 10:57:21 KAMEZAWA Hiroyuki wrote:
> On Thu, 22 Jan 2009 10:43:12 +0530
>
> Nikanth Karthikesan <[email protected]> wrote:
> > On Thursday 22 January 2009 08:58:43 KAMEZAWA Hiroyuki wrote:
> > > On Wed, 21 Jan 2009 16:38:21 +0530
> > >
> > > Nikanth Karthikesan <[email protected]> wrote:
> > > > As Alan Cox suggested/wondered in this thread,
> > > > http://lkml.org/lkml/2009/1/12/235 , this is a container group based
> > > > approach to override the oom killer selection without losing all the
> > > > benefits of the current oom killer heuristics and oom_adj interface.
> > > >
> > > > It adds a tunable oom.victim to the oom cgroup. The oom killer will
> > > > kill the process using the usual badness value but only within the
> > > > cgroup with the maximum value for oom.victim before killing any
> > > > process from a cgroup with a lesser oom.victim number. Oom killing
> > > > could be disabled by setting oom.victim=0.
> > > >
> > > > Signed-off-by: Nikanth Karthikesan <[email protected]>
> > >
> > > Assume following
> > > - the user can tell "which process should be killed first"
> > >
> > > What is the difference between oom_adj and this cgroup to users ?
> >
> > It is next to impossible to specify the order among say 10 memory hogging
> > tasks using oom_adj. Using this oom-controller users can specify the
> > exact order.
> >
> > > If oom_adj is hard to use, making it simpler is a good way, I think.
> > > rather than adding new complication.
> > >
> > > It seems both of oom_adj and this cgroup will be hard-to-use functions
> > > for usual system administrators. But no better idea than using memcg
> > > and committing memory usage.
> >
> > To use oom_adj effectively one should continuously monitor oom_score of
> > all the processes, which is a complex moving target and keep on adjusting
> > the oom_adj of many tasks which still cannot guarantee the order. This
> > controller is deterministic and hence easier to use.
>
> Okay, thank you for explanation :)
> I think it's better to explain "why this is much easier to use rather
> than oom_adj and what is the benefit to users." in your patch description
> and to improve your documentation.
>
> +But it is very difficult to suggest an order among tasks to be killed during
> +an Out Of Memory situation. The OOM Killer controller aids in doing that.
>
> As.
> Difference from oom_adj:
> This allows users to specify "strict order" of oom-kill's
> select-bad-process operation. While oom_adj just works as a hint for the
> kernel, OOM Killer Controller gives users full control.
>
> In general, it's very hard to specify oom-kill order of several
> applications only by oom_adj because it's just affects "badness"
> calculation.
>

Yes, this could be added to the patch description or the documentation.

Thanks
Nikanth

2009-01-22 06:14:56

by Nikanth Karthikesan

Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Thursday 22 January 2009 11:09:45 Arve Hjønnevåg wrote:
> On Wed, Jan 21, 2009 at 9:13 PM, Nikanth Karthikesan <[email protected]> wrote:
> > To use oom_adj effectively one should continuously monitor oom_score of
> > all the processes, which is a complex moving target and keep on adjusting
> > the oom_adj of many tasks which still cannot guarantee the order. This
> > controller is deterministic and hence easier to use.
>
> Why not add an option to make oom_adj ensure strict ordering instead?

This could be done in 2 ways.
1. Make oom_adj itself strict (based on some other parameter?)
- Adds to the confusion of whether the current oom_adj is a strict value or the
usual suggestion.
- It would disable the oom_adj suggestion which could have been used till now.
- It is a public interface, and changing that might break someone's script.

2. Add an additional parameter, say /proc/<pid>/oom_order
- Not easy to use.
- Say I had assigned oom.victim to a task and it had forked a lot. It is easier
to change the value for all those tasks with cgroups.
- Some optimization that Kame specified earlier would be harder to achieve.

Basically the oom-controller implements option 2, using cgroups, which can be
thought of as a modern interface for proc. Also it could be used along with
other cgroup controllers like the group scheduler. Say you have 2 groups of
tasks, grouped as entertainment and science; you could use the group scheduler
to give more CPU bandwidth to science and instruct the oom-controller to kill
entertainment tasks in an OOM situation.

Thanks
Nikanth

2009-01-22 06:29:32

by Arve Hjønnevåg

Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Wed, Jan 21, 2009 at 10:12 PM, Nikanth Karthikesan <[email protected]> wrote:
> On Thursday 22 January 2009 11:09:45 Arve Hjønnevåg wrote:
>> On Wed, Jan 21, 2009 at 9:13 PM, Nikanth Karthikesan <[email protected]> wrote:
>> > To use oom_adj effectively one should continuously monitor oom_score of
>> > all the processes, which is a complex moving target and keep on adjusting
>> > the oom_adj of many tasks which still cannot guarantee the order. This
>> > controller is deterministic and hence easier to use.
>>
>> Why not add an option to make oom_adj ensure strict ordering instead?
>
> This could be done in 2 ways.
> 1. Make oom_adj itself strict.(based on some other parameter?)
> - Adds to confusion whether the current oom_adj is a strict value or the usual
> suggestion.
> - It would disable the oom_adj suggestion which could have been used till now.
> - It is a public interface, and changing that might break some one's script.
>
> 2. Add addtional parameter, say /proc/<pid>/oom_order
> - Not easy to use.
> - Say I had assigned the oom.victim to a task and it had forked a lot. Now to
> change the value for all the tasks it is easier with cgroups.
> - Some optimization that Kame specified earlier would be harder to achieve.
>

Both options would work for us, but option 1 requires no change to our
user space code. I agree that some operations are easier with a
cgroups approach, but since we don't perform these operations it would
be nice to not require cgroups to control the oom killer.

--
Arve Hjønnevåg

2009-01-22 06:45:09

by Nikanth Karthikesan

Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Thursday 22 January 2009 11:59:20 Arve Hjønnevåg wrote:
> On Wed, Jan 21, 2009 at 10:12 PM, Nikanth Karthikesan <[email protected]> wrote:
> > On Thursday 22 January 2009 11:09:45 Arve Hjønnevåg wrote:
> >> On Wed, Jan 21, 2009 at 9:13 PM, Nikanth Karthikesan <[email protected]> wrote:
> >> > To use oom_adj effectively one should continuously monitor oom_score
> >> > of all the processes, which is a complex moving target and keep on
> >> > adjusting the oom_adj of many tasks which still cannot guarantee the
> >> > order. This controller is deterministic and hence easier to use.
> >>
> >> Why not add an option to make oom_adj ensure strict ordering instead?
> >
> > This could be done in 2 ways.
> > 1. Make oom_adj itself strict.(based on some other parameter?)
> > - Adds to confusion whether the current oom_adj is a strict value or the
> > usual suggestion.
> > - It would disable the oom_adj suggestion which could have been used till
> > now. - It is a public interface, and changing that might break some one's
> > script.
> >
> > 2. Add addtional parameter, say /proc/<pid>/oom_order
> > - Not easy to use.
> > - Say I had assigned the oom.victim to a task and it had forked a lot.
> > Now to change the value for all the tasks it is easier with cgroups.
> > - Some optimization that Kame specified earlier would be harder to
> > achieve.
>
> Both options would work for us, but option 1 require no change to our
> user space code.

Some scripts might be assuming that oom_adj will always be -17 to +15. So no
more than 32+1 levels of order are possible. Yes, it should mostly be enough.
But in case you want to leave space for adding tasks in between, one has to
take extra care or do more work. And someone might still assume the old
behaviour and, seeing oom_score and oom_adj, expect a different behaviour. And
if someone wants the old behaviour, we have to provide an additional variable
to enable/disable this.

> I agree that some operations are easier with a
> cgroups approach, but since we don't perform these operations it would
> be nice to not require cgroups to control the oom killer.

We would perform these operations if they were available and easier to use,
so it would be nice to use cgroups.

Thanks
Nikanth

2009-01-22 08:46:19

by David Rientjes

Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Thu, 22 Jan 2009, Nikanth Karthikesan wrote:

> No, this is not specific to memcg or cpuset cases alone. The same needless
> kills will take place even without memcg or cpuset when an administrator
> specifies a light memory consumer to be killed before a heavy memory user. But
> it is up to the administrator to use it wisely.

You can't specify different behavior for an oom cgroup depending on what
type of oom it is, which is the problem with this proposal.

For example, if your task triggers an oom as the result of its exclusive
cpuset placement, the oom killer should prefer to kill a task within that
cpuset to allow for future memory freeing.

So, with your proposal, an administrator can specify the oom priority of
an entire aggregate of tasks but the behavior may not be desired for a
cpuset-constrained oom, while it may be perfectly legitimate for a global
unconstrained oom.

I can specify a higher oom priority for a cpuset because its jobs are less
critical and I would prefer it gets killed in a system-wide oom, but any
other cpuset that ooms will needlessly kill these tasks when there is no
benefit.

> We also provide a panic_on_oom
> option that an administrator could use, not just to kill few more tasks but
> all tasks in the system ;)
>

panic_on_oom allows you to specify whether you want to panic on any oom or
on global non-constrained ooms, as well. When the sysctl is not set to 2,
it is a no-op in a cpuset, mempolicy, or memcg oom condition.

2009-01-22 09:26:04

by Nikanth Karthikesan

Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Thursday 22 January 2009 14:13:38 David Rientjes wrote:
> On Thu, 22 Jan 2009, Nikanth Karthikesan wrote:
> > No, this is not specific to memcg or cpuset cases alone. The same
> > needless kills will take place even without memcg or cpuset when an
> > administrator specifies a light memory consumer to be killed before a
> > heavy memory user. But it is up to the administrator to use it wisely.
>
> You can't specify different behavior for an oom cgroup depending on what
> type of oom it is, which is the problem with this proposal.
>

No. This does not disable any such special selection criteria which is used
without this controller.

> For example, if your task triggers an oom as the result of its exclusive
> cpuset placement, the oom killer should prefer to kill a task within that
> cpuset to allow for future memory freeing.
>
> So, with your proposal, an administrator can specify the oom priority of
> an entire aggregate of tasks but the behavior may not be desired for a
> cpuset-constrained oom, while it may be perfectly legitimate for a global
> unconstrained oom.
>
> I can specify a higher oom priority for a cpuset because its jobs are less
> critical and I would prefer it gets killed in a system-wide oom, but any
> other cpuset that ooms will needlessly kill these tasks when there is no
> benefit.
>

This patch just chooses the task with highest oom.victim among those tasks
which would have been chosen without this controller. So all the "kill within
memcg/cpuset" should work as always! It should just kill a task within the
memcg with highest oom.victim.

Thanks
Nikanth

2009-01-22 09:42:35

by David Rientjes

Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Thu, 22 Jan 2009, Nikanth Karthikesan wrote:

> > You can't specify different behavior for an oom cgroup depending on what
> > type of oom it is, which is the problem with this proposal.
> >
>
> No. This does not disable any such special selection criteria which is used
> without this controller.
>

I didn't say it disabled it; the cpuset preference is actually implemented
in the badness() score and not specifically excluded in
select_bad_process(). That's because it's quite possible that a task has
allocated memory in a cpuset and then either moved to a separate cpuset or
had its mems_allowed changed.

Please try it and you'll see. Create two cpusets, cpuset A and cpuset B.
Elevate cpuset A's oom.victim value and then trigger an oom in cpuset B.
Your patch will cause a task from cpuset A to be killed for a cpuset B
triggered oom which, more often than not, will not lead to future memory
freeing.

It's quite possible that cpuset A would be preferred to be killed in a
global unconstrained oom condition, however. That's the only reason why
one would elevate its oom.victim score to begin with. But it doesn't work
for cpuset-constrained ooms.

It's not going to help if I explain it further and you don't try it out
on your own. Thanks.

2009-01-22 09:50:38

by Evgeniy Polyakov

Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Thu, Jan 22, 2009 at 12:43:38AM -0800, David Rientjes ([email protected]) wrote:
> For example, if your task triggers an oom as the result of its exclusive
> cpuset placement, the oom killer should prefer to kill a task within that
> cpuset to allow for future memory freeing.

This is not true for all cases. What if you do need to start this task
and free something else outside the given set? This should be an
administrative decision and not forced by the kernel. We used to have it
that way, but it does not mean that it is the only correct way to do
things.

> So, with your proposal, an administrator can specify the oom priority of
> an entire aggregate of tasks but the behavior may not be desired for a
> cpuset-constrained oom, while it may be perfectly legitimate for a global
> unconstrained oom.

In this case the administrator will not do this. It is up to him to decide
and not some inner kernel policy.

--
Evgeniy Polyakov

2009-01-22 10:01:54

by David Rientjes

Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Thu, 22 Jan 2009, Evgeniy Polyakov wrote:

> > For example, if your task triggers an oom as the result of its exclusive
> > cpuset placement, the oom killer should prefer to kill a task within that
> > cpuset to allow for future memory freeing.
>
> This is not true for all cases. What if you do need to start this task
> and free something else outside the given set? This should be an
> administrative decision and not forced by the kernel. We used to have it
> that way, but it does not mean that it is the only correct way to do the
> things.
>

In an exclusive cpuset, a task's memory is restricted to a set of mems
that the administrator has designated. If it is oom, the kernel must free
memory on those nodes or the next allocation will again trigger an oom
(leading to a needlessly killed task that was in a disjoint cpuset).

Really.

> > So, with your proposal, an administrator can specify the oom priority of
> > an entire aggregate of tasks but the behavior may not be desired for a
> > cpuset-constrained oom, while it may be perfectly legitimate for a global
> > unconstrained oom.
>
> In this case administrator will not do this. It is up to him to decide
> and not some inner kernel policy.
>

Then the scope of this new cgroup is restricted to not being used with
cpusets that could oom.

2009-01-22 10:12:51

by Nikanth Karthikesan

Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Thursday 22 January 2009 15:09:28 David Rientjes wrote:
> On Thu, 22 Jan 2009, Nikanth Karthikesan wrote:
> > > You can't specify different behavior for an oom cgroup depending on
> > > what type of oom it is, which is the problem with this proposal.
> >
> > No. This does not disable any such special selection criteria which is
> > used without this controller.
>
> I didn't say it disabled it; the cpuset preference is actually implemented
> in the badness() score and not specifically excluded in
> select_bad_process(). That's because it's quite possible that a task has
> allocated memory in a cpuset and then either moved to a separate cpuset or
> had it's mems_allowed changed.
>
> Please try it and you'll see. Create two cpusets, cpuset A and cpuset B.
> Elevate cpuset A's oom.victim value and then trigger an oom in cpuset B.
> Your patch will cause a task from cpuset A to be killed for a cpuset B
> triggered oom which, more often than not, will not lead to future memory
> freeing.
>
> It's quite possible that cpuset A would be preferred to be killed in a
> global unconstrained oom condition, however. That's the only reason why
> one would elevate its oom.victim score to begin with. But it doesn't work
> for cpuset-constrained ooms.
>
> It's not going to help if I explain it further and you don't try it out
> on your own. Thanks.

Thanks for the clear explanation. Cpusets do it by reducing the badness to
1/8th for tasks in other cpusets. So using the oom-controller could kill some
innocent processes in some other cpuset!
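
For reference, that preference lives in badness() and, paraphrased (not an
exact excerpt, and wrapped into a helper only for illustration), amounts to
roughly the following; cpuset_mems_allowed_intersects() is the check
involved, as far as I recall:

/*
 * Tasks whose allowed nodes do not intersect the triggering task's
 * nodes stay eligible, but at 1/8th of the score, so they are much
 * less likely to be picked.
 */
static unsigned long cpuset_badness_discount(unsigned long points,
                                             struct task_struct *p)
{
        if (!cpuset_mems_allowed_intersects(current, p))
                points /= 8;
        return points;
}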

But it is possible to have the same effect with oom_adj: setting oom_adj=4 for
a task in a different cpuset will do the same (assuming they have similar
badness).

I think the cpuset preference could be improved, so as not to depend on
badness, with something similar to what memcg does, with or without adding the
overhead of tracking which processes have memory from a node.

Thanks
Nikanth

2009-01-22 10:14:36

by Evgeniy Polyakov

Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Thu, Jan 22, 2009 at 02:00:55AM -0800, David Rientjes ([email protected]) wrote:
>
> In an exclusive cpuset, a task's memory is restricted to a set of mems
> that the administrator has designated. If it is oom, the kernel must free
> memory on those nodes or the next allocation will again trigger an oom
> (leading to a needlessly killed task that was in a disjoint cpuset).
>
> Really.

The whole point of the oom-killer is to kill the most appropriate task to
free the memory. And while the task is selected system-wide and some
tunables are added to tweak the behaviour local to some subsystems, this
cpuset feature is hardcoded into the selection algorithm.
And when some tunable starts doing its own calculation, the behaviour of this
hardcoded feature changes.

This is intended to change that, because the admin has to have the ability to
tune the system the way he needs, and not some special heuristics, which may
not work all the time.

That is the point against the cpuset argument. Make it tunable the same way
we have oom_adj and/or this cgroup order feature.

> > In this case administrator will not do this. It is up to him to decide
> > and not some inner kernel policy.
> >
>
> Then the scope of this new cgroup is restricted to not being used with
> cpusets that could oom.

These are perpendicular tasks - cpusets limit one area of the oom
handling, cgroup order - another. Some people need cpusets, others want
cgroups. Cpusets are not something so exceptional that only they have to
be taken into account when doing a system-wide operation like OOM
condition handling.

--
Evgeniy Polyakov

2009-01-22 10:21:20

by David Rientjes

Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Thu, 22 Jan 2009, Nikanth Karthikesan wrote:

> I think cpusets preference could be improved, not to depend on badness, with
> something similar to what memcg does. With or without adding overhead of
> tracking processes that has memory from a node.
>

We actually used to do that: we excluded all tasks that did not share the
same cpuset in select_bad_process(). That exclusion was reimplemented as
a preference in badness() since, again, it is quite possible that a large
memory-hogging task without a sufficient oom_adj score, as you mentioned,
has allocated memory on the cpuset's nodes before being moved to a
different cpuset or changing its set of allowable nodes.

I think you would find the per-cgroup oom notifier patch[*] of interest.
It seems to have been dropped after some discussion on improvements, but
allows you to defer all of these decisions to userspace. Would something
like that fix your problem?

[*] http://marc.info/?l=linux-mm&m=122575082227252

2009-01-22 10:28:16

by David Rientjes

Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Thu, 22 Jan 2009, Evgeniy Polyakov wrote:

> > In an exclusive cpuset, a task's memory is restricted to a set of mems
> > that the administrator has designated. If it is oom, the kernel must free
> > memory on those nodes or the next allocation will again trigger an oom
> > (leading to a needlessly killed task that was in a disjoint cpuset).
> >
> > Really.
>
> The whole point of oom-killer is to kill the most appropriate task to
> free the memory. And while task is selected system-wide and some
> tunables are added to tweak the behaviour local to some subsystems, this
> cpuset feature is hardcoded into the selection algorithm.

Of course, because the oom killer must be aware that tasks in disjoint
cpusets are more likely than not to result in no memory freeing for
current's subsequent allocations.

> And when some tunable starts doing own calculation, behaviour of this
> hardcoded feature changes.
>

Yes, it is possible to elevate oom_adj scores to override the cpuset
preference. That's actually intended since it is now possible for the
administrator to specify, against the belief of the kernel, that
killing a task will free memory in these cpuset-constrained ooms. That's
probably because it has either been moved to a different cpuset or its set
of allowable nodes is dynamic.

> > Then the scope of this new cgroup is restricted to not being used with
> > cpusets that could oom.
>
> These are perpendicular tasks - cpusets limit one area of the oom
> handling, cgroup order - another. Some people needs cpusets, others want
> cgroups. cpusets are not something exceptional so that only they have to
> be taken into account when doing system-wide operation like OOM
> condition handling.
>

A cpuset is a cgroup. If I am using cpusets, this patch fails to
adequately allow me to describe my oom preferences for both
cpuset-constrained ooms and global unconstrained ooms, which is a major
drawback.

I would encourage you to look at the per-cgroup oom notifier patch[*] that
defers most of these decisions to userspace. Given your interest in
priority based oom preferences as exhibited by your oom_victim patch, I
think you'll find it of interest since it allows you much greater
flexibility than you could ever hope for from the kernel's heuristics.

[*] http://marc.info/?l=linux-mm&m=122575082227252

2009-01-22 13:21:46

by Evgeniy Polyakov

Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Thu, Jan 22, 2009 at 02:27:19AM -0800, David Rientjes ([email protected]) wrote:
> > The whole point of oom-killer is to kill the most appropriate task to
> > free the memory. And while task is selected system-wide and some
> > tunables are added to tweak the behaviour local to some subsystems, this
> > cpuset feature is hardcoded into the selection algorithm.
>
> Of course, because the oom killer must be aware that tasks in disjoint
> cpusets are more likely than not to result in no memory freeing for
> current's subsequent allocations.

And if we replace cpuset with cgroup (or anything else), nothing
changes, so why this was made so special?

> > And when some tunable starts doing own calculation, behaviour of this
> > hardcoded feature changes.
> >
>
> Yes, it is possible to elevate oom_adj scores to override the cpuset
> preference. That's actually intended since it is now possible for the
> administrator to specify that, against the belief of the kernel, that
> killing a task will free memory in these cpuset-constrained ooms. That's
> probably because it has either been moved to a different cpuset or its set
> of allowable nodes is dynamic.

Yes, admin has to be aware of some obscure inner kernel policy which he
may not be using to be able to tune system-wide behaviour... Right now
it is not even documented :)

> > These are perpendicular tasks - cpusets limit one area of the oom
> > handling, cgroup order - another. Some people needs cpusets, others want
> > cgroups. cpusets are not something exceptional so that only they have to
> > be taken into account when doing system-wide operation like OOM
> > condition handling.
>
> A cpuset is a cgroup. If I am using cpusets, this patch fails to
> adequately allow me to describe my oom preferences for both
> cpuset-constrained ooms and global unconstrained ooms, which is a major
> drawback.
>
> I would encourage you to look at the per-cgroup oom notifier patch[*] that
> defers most of these decisions to userspace. Given your interest in
> priority based oom preferences as exhibited by your oom_victim patch, I
> think you'll find it of interest since it allows you much greater
> flexibility than you could ever hope for from the kernel's heuristics.
>
> [*] http://marc.info/?l=linux-mm&m=122575082227252

Having userspace decide which task to kill may not work at all in some
cases (when the task that would make that decision is swapped out, and we
need to kill someone to get the memory to bring it back in).

+ /* If we're planning to retry, we should wake
+ * up any userspace waiter in order to let it
+ * handle the OOM
+ */
+ wake_up_all(&cs->oom_wait);

You must be kidding :)
Unless by 'userspace' you actually mean a special kernel thread, but in
that case there is no difference compared to the existing in-kernel code.

At OOM time there is no userspace. We cannot wake up anyone or send (and
expect it will be received) some notification. The only alive entity in
the system is the kernel, which has to decide who must be killed based on
some data provided by the administrator (or by default with some
heuristics).

--
Evgeniy Polyakov

2009-01-22 20:30:38

by David Rientjes

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Thu, 22 Jan 2009, Evgeniy Polyakov wrote:

> > Of course, because the oom killer must be aware that tasks in disjoint
> > cpusets are more likely than not to result in no memory freeing for
> > current's subsequent allocations.
>
> And if we replace cpuset with cgroup (or anything else), nothing
> changes, so why this was made so special?
>

I don't know what you're talking about, cpusets are simply a client of the
cgroup interface.

The problem that I'm identifying with this change is that the
user-specified priority (in terms of oom.victim settings) conflicts when
the kernel encounters a system-wide unconstrained oom versus a
cpuset-constrained oom.

The oom.victim priority may well be valid for a system-wide condition
where a specific aggregate of tasks are less critical than others and
would be preferred to be killed. But that doesn't help in a
cpuset-constrained oom on the same machine if it doesn't lead to future
memory freeing.

Cpusets are special in this regard because the oom killer's heuristics in
this case are based on a function of the oom triggering task, current. We
check for tasks sharing current->mems_allowed so that we are freeing
memory that the task can eventually use. So this new oom cgroup cannot be
used on any machine with one or more cpusets that may oom without the
possibility of needlessly killing tasks.

> Having userspace to decide which task to kill may not work in some cases
> at all (when task is swapped and we need to kill someone to get the mem
> to swap out the task, which will make that decision).
>

Yes, and it's always possible for userspace to defer back to the kernel in
such cases. But in the majority of cases that you seem to be interested
in, this type of notification system seems like it would work quite well:

- to replace your oom_victim patch, your userspace handler could simply
issue a SIGKILL to the chosen job in place of the oom killer. The
added benefit here is that your userspace handler could actually
support a prioritized list so if one application isn't running, it
could check another, and another, etc., and

- to replace this oom.victim patch, the userspace handler could check
for the presence of the aggregate of tasks that would have been
attached to the new cgroup and issue the same SIGKILL (a sketch of
such a handler follows below).
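
A minimal sketch of such a handler, purely illustrative: the cgroup mount
point, group names and ordering policy below are assumptions, not part of
the posted patches. It reads the pids from a cgroup's "tasks" file and
sends SIGKILL, trying the least important group first:

#include <signal.h>
#include <stdio.h>

/* SIGKILL every task listed in <cgroup_dir>/tasks; returns the count. */
static int kill_cgroup_tasks(const char *cgroup_dir)
{
	char path[256];
	FILE *f;
	int pid, killed = 0;

	snprintf(path, sizeof(path), "%s/tasks", cgroup_dir);
	f = fopen(path, "r");
	if (!f)
		return -1;
	while (fscanf(f, "%d", &pid) == 1)
		if (kill(pid, SIGKILL) == 0)
			killed++;
	fclose(f);
	return killed;
}

int main(void)
{
	/* Prioritized list: least important aggregate of tasks first. */
	const char *victims[] = { "/dev/cgroup/batch", "/dev/cgroup/best-effort" };
	unsigned int i;

	for (i = 0; i < sizeof(victims) / sizeof(victims[0]); i++)
		if (kill_cgroup_tasks(victims[i]) > 0)
			break;	/* something was killed, stop here */
	return 0;
}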

> + /* If we're planning to retry, we should wake
> + * up any userspace waiter in order to let it
> + * handle the OOM
> + */
> + wake_up_all(&cs->oom_wait);
>
> You must be kidding :)
> If by the 'userspace' you do not mean special kernel thread, but in
> this case there is no difference compared to existing in-kernel code.
>

That wakes up any userspace notifier that you've attached to the cgroup
representing the tasks you're interested in controlling oom responses for.
Your userspace application can then effectively take the place of the oom
killer by expanding a cpuset's memory, elevating a memory controller
reservation, killing current, or using its own heuristics to respond to
the problem.

But we should probably discuss the implementation of the per-cgroup oom
notifier in the thread dedicated to it, not this one.

> At OOM time there is no userspace. We can not wakeup anyone or send (and
> expect it will be received) some notification. The only alive entity in
> the system is the kernel, who has to decide who must be killed based on
> some data provided by administrator (or by default with some
> heuristics).
>

This is completely and utterly wrong. If you had read the code
thoroughly, you would see that the page allocation is deferred until
userspace has the opportunity to respond. That time in deferral isn't
absurd, it would take time for the oom killer's exiting task to free
memory anyway. The change simply allows a delay between a page
allocation failure and invoking the oom killer.

It will even work on UP systems since the page allocator has multiple
reschedule points where a dedicated or monitoring application attached to
the cgroup could add or free memory itself so the allocation succeeds when
current is rescheduled.

2009-01-22 21:06:24

by Evgeniy Polyakov

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Thu, Jan 22, 2009 at 12:28:45PM -0800, David Rientjes ([email protected]) wrote:
> I don't know what you're talking about, cpusets are simply a client of the
> cgroup interface.

Only the fact that cpusets have a _very_ special meaning in the oom-killer
codepath, while they should be just another tunable (and if there should be
special code at all in the first place, why was there no objection on the
grounds that tasks could have different oom_adj?).

> The problem that I'm identifying with this change is that the
> user-specified priority (in terms of oom.victim settings) conflicts when
> the kernel encounters a system-wide unconstrained oom versus a
> cpuset-constrained oom.
>
> The oom.victim priority may well be valid for a system-wide condition
> where a specific aggregate of tasks are less critical than others and
> would be preferred to be killed. But that doesn't help in a
> cpuset-constrained oom on the same machine if it doesn't lead to future
> memory freeing.

And that is The Main Point. Not to help some subsystem, but to be useful
for others. It is a different tunable. It really has nothing to do with
cpusets. It is not aimed at helping or hurting cpusets. I do not even
understand why this very specific case, not used by the vast majority of
users, was even raised in the discussion of the system-wide oom-killer
behaviour and its more precise cgroup tunable.

This is different. Cpusets have their own tunable, which is used when they
are enabled; the tunable discussed here should be used in its own place.
The admin has the right to decide how he wants to tune oom-killer
behaviour. And the only point I hear is that something does not help when
cpusets are enabled.

> Cpusets are special in this regard because the oom killer's heuristics in
> this case are based on a function of the oom triggering task, current. We
> check for tasks sharing current->mems_allowed so that we are freeing
> memory that the task can eventually use. So this new oom cgroup cannot be
> used on any machine with one or more cpusets that may oom without the
> possibility of needlessly killing tasks.

Then do not use anything else when cpusets are enabled.
Do not compile SuperH cpu support when you do not have such a processor.
Do not enable iscsi support if you do not need it.
Do not enable the cgroup oom-killer tunable when you want cpusets.

> > Having userspace to decide which task to kill may not work in some cases
> > at all (when task is swapped and we need to kill someone to get the mem
> > to swap out the task, which will make that decision).
> >
>
> Yes, and it's always possible for userspace to defer back to the kernel in
> such cases. But in the majority of cases that you seem to be interested
> in, this type of notification system seems like it would work quite well:
>
> - to replace your oom_victim patch, your userspace handler could simply
> issue a SIGKILL to the chosen job in place of the oom killer. The
> added benefit here is that your userspace handler could actually
> support a prioritized list so if one application isn't running, it
> could check another, and another, etc., and
>
> - to replace this oom.victim patch, the userspace handler could check
> for the presence of the aggregate of tasks that would have appeared
> attached to the new cgroup and issue the same SIGKILL.

Besides, it is possible that userspace will not even wake up, since to
wake it up we have to swap it back in, which in turn needs some memory,
which we do not have, since we are in, hmm, an Out-Of-Memory condition.

There may be cases when this will work, I do not argue. But so far the
most common case I imagine is when it will not be able to wake up :)

> > If by the 'userspace' you do not mean special kernel thread, but in
> > this case there is no difference compared to existing in-kernel code.
>
> That wakes up any userspace notifier that you've attached to the cgroup
> representing the tasks you're interested in controlling oom responses for.
> Your userspace application can then effectively take the place of the oom
> killer by expanding a cpuset's memory, elevating a memory controller
> reservation, killing current, or using its own heuristics to respond to
> the problem.
>
> But we should probably discuss the implementation of the per-cgroup oom
> notifier in the thread dedicated to it, not this one.

Please explain how the task will make that decision (like freeing some
RAM) when it cannot even be swapped back in? It is a great idea to be able
to monitor memory usage, but relying on that just does not work.

> > At OOM time there is no userspace. We can not wakeup anyone or send (and
> > expect it will be received) some notification. The only alive entity in
> > the system is the kernel, who has to decide who must be killed based on
> > some data provided by administrator (or by default with some
> > heuristics).
> >
>
> This is completely and utterly wrong. If you had read the code
> thoroughly, you would see that the page allocation is deferred until
> userspace has the opportunity to respond. That time in deferral isn't
> absurd, it would take time for the oom killer's exiting task to free
> memory anyway. The change simply allows a delay between a page
> allocation failure and invoking the oom killer.
>
> It will even work on UP systems since the page allocator has multiple
> reschedule points where a dedicated or monitoring application attached to
> the cgroup could add or free memory itself so the allocation succeeds when
> current is rescheduled.

Yup, it will wait for some timeout for userspace to respond. The problem
is that in some cases it will not. And so far I think (and I may be wrong)
that it will not be able to respond most of the time.

In every oom condition I have worked with, the system was completely
unusable before the oom-killer went berserk on several processes. They
were not able to allocate memory to read network input (atomic context) or
even to read console input.


What I want to say concerns simply the interaction within the kernel
oom-handler. Let it be as smart or as stupid as the admin decides. I have
no objection against letting it get some knowledge from outside of its own
world (like waiting for userspace to respond). I vote with both hands for
implementing some kind of notification mechanism which could be used by
userspace to get oom notifications in particular and to watch its (or
system-wide) memory usage in general.

But all those things should be additional tunables, which do not even
remotely, even theoretically, allow a lockup. So userspace notifications
are great as long as they are not the only way to make progress in an OOM
situation. And the highest priority in this case should be the kernel and
its tunables (like the proposed cgroup patch), since relying on userspace
in this case may be fatal. But of course it can be enabled and contribute.

--
Evgeniy Polyakov

2009-01-22 21:36:15

by David Rientjes

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Fri, 23 Jan 2009, Evgeniy Polyakov wrote:

> Only the fact that cpusets have _very_ special meaning in the oom-killer
> codepath, while it should be just another tunable (if it should be
> special code at all at the first place, why there were no objection and
> argument, that tasks could have different oom_adj?).
>

That's because the oom killer needs to kill a task that will allow future
memory freeing on current->mems_allowed in these cases. Appropriate
oom_adj scores can override that preference, it's up to the administrator.

> > The oom.victim priority may well be valid for a system-wide condition
> > where a specific aggregate of tasks are less critical than others and
> > would be preferred to be killed. But that doesn't help in a
> > cpuset-constrained oom on the same machine if it doesn't lead to future
> > memory freeing.
>
> And that is The Main Point. Not to help some subsystem, but be useful
> for others. It is different tunable. It has really nothing with cpusets.

It doesn't work with any system that is running with cpusets, that's the
problem with accepting such code. There needs to be clear and convincing
evidence that a userspace oom notifier, which could handle all possible
cases and is much more robust and flexible, would not suffice for such
situations.

> Then do not use anything else when cpusets are enabled.
> Do not compile SuperH cpu support when you do not have such processes.
> Do not enable iscsi support if you do not need this.
> Do not enable cgroup oom-killer tunable when you want cpuset.
>

How do I prioritize oom killing if my system is running cpusets, then?
Your answer is that I can't, since this new cgroup doesn't work with them;
so this solution is incomplete. The oom killer notification patch[*] is
much more functional and does allow this problem to be solved on all types
of systems because the implementation of such a handler is left to the
administrator.

[*] http://marc.info/?l=linux-mm&m=122575082227252

> Please explain, how task will make that decision (like freeing some
> ram), when it can not be even swapped out? It is a great idea to be able
> to monitor memory usage, but relying on that just does not work.
>

The userspace handler is a schedulable task resident in memory that, with
any sane implementation, would not require additional memory when running.
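
To make the "resident, no additional memory" property concrete, here is a
minimal sketch of how such a handler might set itself up at start time
(illustrative only; the buffer size and setup are assumptions, not part of
the posted patches): lock all mappings and preallocate the scratch memory
it will use later.

#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

#define SCRATCH_SIZE (64 * 1024)

static char *scratch;	/* preallocated work area, faulted in below */

int oom_handler_init(void)
{
	scratch = malloc(SCRATCH_SIZE);
	if (!scratch)
		return -1;
	memset(scratch, 0, SCRATCH_SIZE);	/* touch every page now */

	/* Pin current and future mappings so the handler never swaps. */
	if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0)
		return -1;
	return 0;
}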

> > This is completely and utterly wrong. If you had read the code
> > thoroughly, you would see that the page allocation is deferred until
> > userspace has the opportunity to respond. That time in deferral isn't
> > absurd, it would take time for the oom killer's exiting task to free
> > memory anyway. The change simply allows a delay between a page
> > allocation failure and invoking the oom killer.
> >
> > It will even work on UP systems since the page allocator has multiple
> > reschedule points where a dedicated or monitoring application attached to
> > the cgroup could add or free memory itself so the allocation succeeds when
> > current is rescheduled.
>
> Yup, it will wait for some timeout while userspace will respond. The
> problem is that it will not in some cases. And so far I think (and I may
> be wrong), that it will not be able to respond most of the time.
>

We've used it quite successfully for over a year, so your hypothesis in
this case may be less than accurate.

> In every oom-condition I worked with, system was completely unusable
> before oom-killer started to berserk several processes. They were not
> able to allocate some data to read network input (atomic context) and to
> even read a console input.
>

If your system failed to allocate GFP_ATOMIC memory, then those are page
allocation failures in the buddy allocator and will actually never invoke
the oom killer since it cannot sleep in that context.

2009-01-22 22:04:56

by Evgeniy Polyakov

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Thu, Jan 22, 2009 at 01:35:13PM -0800, David Rientjes ([email protected]) wrote:
> > And that is The Main Point. Not to help some subsystem, but be useful
> > for others. It is different tunable. It has really nothing with cpusets.
>
> It doesn't work with any system that is running with cpusets, that's the
> problem with accepting such code. There needs to be clear and convincing
> evidence that a userspace oom notifier, which could handle all possible
> cases and is much more robust and flexible, would not suffice for such
> situations.

I showed the case where it does not work at all. And then found (in this
mail) that the task (or part of it) has to be present in memory, which
means it will be locked, which in turn will not work on a system which has
already locked the range allowed by its limits.

And returning to the oom_adj and cpuset tunables: why can't any new
process started in a given cpuset be tuned by an external application or
some script to have a bigger/smaller oom_adj parameter? :)

> > Then do not use anything else when cpusets are enabled.
> > Do not compile SuperH cpu support when you do not have such processes.
> > Do not enable iscsi support if you do not need this.
> > Do not enable cgroup oom-killer tunable when you want cpuset.
> >
>
> How do I prioritize oom killing if my system is running cpusets, then?

Just the way it works right now :)
You do not object to patches which improve SuperH cpu support with the
argument that it is not possible to enable that feature when the system
does not have a SuperH cpu.

> Your answer is that I can't, since this new cgroup doesn't work with them;
> so this solution is incomplete. The oom killer notification patch[*] is
> much more functional and does allow this problem to be solved on all types
> of systems because the implementation of such a handler is left to the
> administrator.
>
> [*] http://marc.info/?l=linux-mm&m=122575082227252

Let me draw a line under this discussion: people propose patches to tweak
the oom-killer behaviour in some situation. You say that this does not
cover another case, and thus has to be dropped. You instead propose
another solution, which does not work in some cases either (IMO a wider
set of them).

It may be a good idea, and likely it is (at least its notification part),
but please look a bit wider: in the cases where it does not work, there
are still lots of systems which want proper OOM tunables.

> > Please explain, how task will make that decision (like freeing some
> > ram), when it can not be even swapped out? It is a great idea to be able
> > to monitor memory usage, but relying on that just does not work.
> >
>
> The userspace handler is a schedulable task resident in memory that, with
> any sane implementation, would not require additional memory when running.

And what happens when it can not lock the memory because of the limits?

> > Yup, it will wait for some timeout while userspace will respond. The
> > problem is that it will not in some cases. And so far I think (and I may
> > be wrong), that it will not be able to respond most of the time.
> >
>
> We've used it quite successfully for over a year, so your hypothesis in
> this case may be less than accurate.

You also used oom_adj which was not documented. Is this an argument?

> > In every oom-condition I worked with, system was completely unusable
> > before oom-killer started to berserk several processes. They were not
> > able to allocate some data to read network input (atomic context) and to
> > even read a console input.
> >
>
> If your system failed to allocate GFP_ATOMIC memory, then those are page
> allocation failures in the buddy allocator and will actually never invoke
> the oom killer since it cannot sleep in that context.

Hmm, you likely missed the part in the last line. And in the first two,
where I said that this was before the oom-killer started (and killed some
processes, usually not those which were needed, but that's a different
story). The system just did not have any free memory to make _any_
progress, neither in atomic context nor in process context, so it had to
invoke the oom-killer.

In that case userspace just cannot reply or even wake up, while the kernel
is effectively alive if it does not need to allocate memory, and could
kill some process to free up the RAM.

Userspace notifications are great, no problem, but do not rely on them,
since there is a huge world outside the cases where they work, which will
be quite unhappy when systems start freezing because the oom-killer relied
on userspace.

--
Evgeniy Polyakov

2009-01-22 22:29:55

by David Rientjes

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Fri, 23 Jan 2009, Evgeniy Polyakov wrote:

> I showed the case when it does not work at all. And then found (in this
> mail), that task (part) has to be present in the memory, which means it
> will be locked, which in turns will not work with the system which
> already locked its range allowed by the limits.
>

Yes, a userspace oom handler must be sanely implemented.

> And returning to the oom_adj and cpusets tunables. Why any new process
> started in given cpuset can not be tuned by external application or some
> script to have bigger/smaller oom_adj parameter? :)
>

oom_adj scores are separate from the hard-coded and very fundamental
heuristic that we should kill a task that has memory allocated on nodes we
are attempting to free. Anything else would just be stupid.

> > How do I prioritize oom killing if my system is running cpusets, then?
>
> Just the way it works right now :)
> You do not object against patches which improve superh cpu support
> with the argument, that it is not possible to enable that feature,
> when system does not have superh cpu.
>

No, I object against any patch that isn't a complete solution to the
problem being presented. It's purely a matter of good software
engineering practices and in the interest of a long-term maintainable
kernel.

> > The userspace handler is a schedulable task resident in memory that, with
> > any sane implementation, would not require additional memory when running.
>
> And what happens when it can not lock the memory because of the limits?
>

Any sane handler for responding to oom conditions will not require
additional memory from nodes that are under oom, whether that includes all
system memory or a subset, if it is attached to the oom notifier.

> Hmm, you likely missed the part in the last line. And in the first two,
> where I said that before oom-killer started (and killed some processes,
> usually not those which were need, but its a different story). System
> just did not have a free memory to have _any_ progress neither in atomic
> context, nor in process, so it had to invoke an oom-killer.
>

The page allocator cannot invoke the oom killer in atomic context, so this
would be happening in process context where it can sleep. The userspace
oom handler will wake up and handle the condition, either by relaxing
hardwall restrictions for the memory controller or cpusets, or by killing
a task itself, unless it chooses to defer to the kernel.

> In that case userspace just can not reply or even awake. While kernel is
> effectively alive if it does not need to allocate a memory. And could
> kill some process to free up the ram.
>

Wrong, oom conditions do not preempt task scheduling.

> Userspace notifications are great, no problem, but do not rely on them,
> since there is a huge world outside the case it works in, which will be
> quite unhappy when systems start freezing because oom-killer relied on
> the userspace.
>

I'm quite certain you've spent more time writing emails to me than merging
the patch and testing its possibilities, given your lack of understanding
of its very basic concepts.

2009-01-22 22:53:32

by Evgeniy Polyakov

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Thu, Jan 22, 2009 at 02:28:11PM -0800, David Rientjes ([email protected]) wrote:
> > And returning to the oom_adj and cpusets tunables. Why any new process
> > started in given cpuset can not be tuned by external application or some
> > script to have bigger/smaller oom_adj parameter? :)
>
> oom_adj scores are separate from the hard-coded and very fundamental
> heuristic that we should kill a task that has memory allocated on nodes we
> are attempting to free. Anything else would just be stupid.

The right answer is that oom_adj scores are global, while in some cases it
is necessary to have local groups that processes can be arranged into, so
that local tunables can be used. Cpusets just happened to be those groups,
while they could in turn be wider cgroups with the proposed or a similar
patch.

Or those groups could be processes started with a special name :)
It just happened that cpusets were first, so you can say that they are
special, while actually they are not.

We already have this, and history does not tolerate 'what ifs',
so let's drop this part :)

> No, I object against any patch that isn't a complete solution to the
> problem being presented. It's purely a matter of good software
> engineering practices and in the interest of a long-term maintainable
> kernel.

Isn't this a double standard, since your proposal, if implemented a little
differently, does not solve the problem at all, and yet you argue that an
incomplete solution should not be merged? Please note that I do not object
to notifications, but just want to have a system which will work in all
cases, or at least in the majority of them.

> > > The userspace handler is a schedulable task resident in memory that, with
> > > any sane implementation, would not require additional memory when running.
> >
> > And what happens when it can not lock the memory because of the limits?
>
> Any sane handler for responding to oom conditions will not require
> additional memory from nodes that are under oom, whether that includes all
> system memory or a subset, if it is attached to the oom notifier.

This reminds me of a thread where modprobe got stuck because initrd code
is actually not allowed to write anything to the console, and while it
turned out that the bug was somewhere else, the same argument was raised
about such fun limitations. What about just asking userspace developers
not to write code which may lead to OOM conditions :)

Even leaving aside the 'sane implementation' case, what about a process
which happens to be in swap? Its oom handler has to be locked in memory to
be successfully invoked, but this does not work if all the memory allowed
to be locked is already used.

> > Hmm, you likely missed the part in the last line. And in the first two,
> > where I said that before oom-killer started (and killed some processes,
> > usually not those which were need, but its a different story). System
> > just did not have a free memory to have _any_ progress neither in atomic
> > context, nor in process, so it had to invoke an oom-killer.
> >
>
> The page allocator cannot invoke the oom killer in atomic context, so this
> would be happening in process content where it can sleep. The userspace
> oom handler will wake up, handle the condition either by relaxing hardwall
> restrictions for either the memory controller or cpusets, or killing a
> task itself unless it chooses to defer to the kernel.

Seems like you just do not read what I wrote. Please do. At least the
last sentence :) And what I wrote about swapped and locked memory.

> > In that case userspace just can not reply or even awake. While kernel is
> > effectively alive if it does not need to allocate a memory. And could
> > kill some process to free up the ram.
>
> Wrong, oom conditions do not preempt task scheduling.

I actually did not say anything about task scheduling.

I said that in the case above, if the oom-killer depended on userspace it
could not make progress, since the oom handlers would not be able to wake
up if swapped out, and in this case the oom-killer would just need to
select a victim by its own heuristics.

> > Userspace notifications are great, no problem, but do not rely on them,
> > since there is a huge world outside the case it works in, which will be
> > quite unhappy when systems start freezing because oom-killer relied on
> > the userspace.
>
> I'm quite certain you've spent more time writing emails to me than merging
> the patch and testing its possibilities, given your lack of understanding
> of its very basic concepts.

How cute :)
Any other technical arguments of the same strength?

--
Evgeniy Polyakov

2009-01-22 23:25:42

by Evgeniy Polyakov

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Fri, Jan 23, 2009 at 01:53:04AM +0300, Evgeniy Polyakov ([email protected]) wrote:
> > I'm quite certain you've spent more time writing emails to me than merging
> > the patch and testing its possibilities, given your lack of understanding
> > of its very basic concepts.
>
> How cute :)
> Any other technical arguments of the same strength?

It's my turn now for the professional statements; let's start with the
technical side: you wanted to send a signal to the process to be killed,
but this will need
1. to allocate a signal, which will deadlock;
2. no need to do this for SIGKILL, but it will not work if the process is
in an unkillable state, which the oom-killer clears, iirc.

Now to the oom handler: if it wants to free some memory, it will have to
call a syscall with some pointer, which in turn may be in a swapped-out
area, so the handler will be stuck.

--
Evgeniy Polyakov

2009-01-23 09:48:22

by Nikanth Karthikesan

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Thursday 22 January 2009 15:57:19 David Rientjes wrote:
> On Thu, 22 Jan 2009, Evgeniy Polyakov wrote:
> > > In an exclusive cpuset, a task's memory is restricted to a set of mems
> > > that the administrator has designated. If it is oom, the kernel must
> > > free memory on those nodes or the next allocation will again trigger an
> > > oom (leading to a needlessly killed task that was in a disjoint
> > > cpuset).
> > >
> > > Really.
> >
> > The whole point of oom-killer is to kill the most appropriate task to
> > free the memory. And while task is selected system-wide and some
> > tunables are added to tweak the behaviour local to some subsystems, this
> > cpuset feature is hardcoded into the selection algorithm.
>
> Of course, because the oom killer must be aware that tasks in disjoint
> cpusets are more likely than not to result in no memory freeing for
> current's subsequent allocations.
>

Yes, the problem is that a cpuset does not track the tasks which have
allocated from its nodes and have since moved or changed their set of
allowable nodes. Because of that, it does not limit oom killing to those
tasks and could kill some innocent tasks at times.

As it is unable to take a deterministic decision the way memcg does, it
plays with the badness value and only suggests, but does not restrict, the
set of tasks that may be killed.

This bug is present even without this patch.

> > And when some tunable starts doing own calculation, behaviour of this
> > hardcoded feature changes.
>
> Yes, it is possible to elevate oom_adj scores to override the cpuset
> preference. That's actually intended since it is now possible for the
> administrator to specify that, against the belief of the kernel, that
> killing a task will free memory in these cpuset-constrained ooms. That's
> probably because it has either been moved to a different cpuset or its set
> of allowable nodes is dynamic.
>

This patch adds one more, easier, way for the administrator to override.

> > > Then the scope of this new cgroup is restricted to not being used with
> > > cpusets that could oom.
> >
> > These are perpendicular tasks - cpusets limit one area of the oom
> > handling, cgroup order - another. Some people needs cpusets, others want
> > cgroups. cpusets are not something exceptional so that only they have to
> > be taken into account when doing system-wide operation like OOM
> > condition handling.
>
> A cpuset is a cgroup. If I am using cpusets, this patch fails to
> adequately allow me to describe my oom preferences for both
> cpuset-constrained ooms and global unconstrained ooms, which is a major
> drawback.
>

The current cpuset oom handling has to be fixed and the exact problem of
killing innocent processes exists even without the oom-controller.

Thanks
Nikanth

2009-01-23 10:34:40

by David Rientjes

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Fri, 23 Jan 2009, Nikanth Karthikesan wrote:

> > Of course, because the oom killer must be aware that tasks in disjoint
> > cpusets are more likely than not to result in no memory freeing for
> > current's subsequent allocations.
> >
>
> Yes, the problem is cpuset does not track the tasks which has allocated from
> this node - who has either moved or changed it set of allowable nodes. And
> because of that it does not limit oom killing to the tasks with in those tasks
> and could kill some innocent tasks at times.
>

Right, the logic to prefer tasks that share the same set of allowable
nodes as the oom-triggering task is implemented in badness() and being in
a completely disjoint cpuset does not specifically exclude a task from
being chosen as the oom killer's victim. That's because, as you said, it
could have allocated memory elsewhere before changing cpusets or its set
of allowable mems.
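
For readers unfamiliar with that heuristic, here is a small userspace-style
model of the discount being described (illustrative only; the real check
lives in badness() in mm/oom_kill.c, and the divisor here is an assumption,
not a quote of the kernel source):

#include <stdbool.h>

/* Model of the cpuset discount: a task whose allowable mems do not
 * intersect current's is not excluded outright, merely penalized,
 * because it may have allocated on current's nodes before moving. */
unsigned long apply_cpuset_penalty(unsigned long points, bool mems_intersect)
{
	if (!mems_intersect)
		points /= 8;	/* divisor chosen for illustration */
	return points;
}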

> As it is unable to take deterministic decision as memcg does, it plays with
> badness value and only suggests but does not restricts within those tasks that
> has to be killed.
>
> This bug is present even without this patch.
>

It's not a bug, it can actually help in a couple of instances:

- a much larger memory-hogging task is identified in a disjoint cpuset
and there is a likelihood it has allocated memory elsewhere, either
previously or atomically, or

- an administrator tunes the oom_adj value for such a task to describe
the above behavior even for smaller tasks and their likelihood to
allocate outside of their exclusive cpuset.

> This patch adds one more easier way for the administrator to over-ride.
>

Yeah, I know. But the problem with the approach is that it specifies an
oom priority for both global unconstrained ooms and cpuset-constrained
ooms.

It's quite possible with your patch to identify an aggregate of tasks that
should be killed first whenever the system is completely out of memory.
That's great, and solves your problem. But that same system cannot
correctly use cpusets that have the potential to ever oom because your
patch completely overrides the victim priority and could needlessly kill
tasks that will not lead to future memory freeing.

That's my objection to the proposal: it doesn't behave appropriately for
both global unconstrained ooms and cpuset-constrained ooms at the same
time.

I think if you look at the cgroup oom notifier patch I referred you to,
you will find it trivial to implement a handler that can issue a SIGKILL
for an aggregate of tasks and implement the functional equivalent of this
patch in userspace where it belongs. It's nearly impossible to code oom
killer heuristics in the kernel that work for every possible workload and
configuration without some unfortunate casualties.

> The current cpuset oom handling has to be fixed and the exact problem of
> killing innocent processes exists even without the oom-controller.
>

I think the heuristic could be tuned to penalize tasks in disjoint cpusets
a little more, but its implementation as a factor of the badness score is
actually very well placed.

2009-01-23 14:59:16

by Nikanth Karthikesan

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Friday 23 January 2009 16:03:49 David Rientjes wrote:
> On Fri, 23 Jan 2009, Nikanth Karthikesan wrote:
> > > Of course, because the oom killer must be aware that tasks in disjoint
> > > cpusets are more likely than not to result in no memory freeing for
> > > current's subsequent allocations.
> >
> > Yes, the problem is cpuset does not track the tasks which has allocated
> > from this node - who has either moved or changed it set of allowable
> > nodes. And because of that it does not limit oom killing to the tasks
> > with in those tasks and could kill some innocent tasks at times.
>
> Right, the logic to prefer tasks that share the same set of allowable
> nodes as the oom-triggering task is implemented in badness() and being in
> a completely disjoint cpuset does not specifically exclude a task from
> being chosen as the oom killer's victim. That's because, as you said, it
> could have allocated memory elsewhere before changing cpusets or its set
> of allowable mems.
>
> > As it is unable to take deterministic decision as memcg does, it plays
> > with badness value and only suggests but does not restricts within those
> > tasks that has to be killed.
> >
> > This bug is present even without this patch.
>
> It's not a bug, it can actually help in a couple instances:
>
> - a much larger memory hogging task is identified in a disjoint cpuset
> and the liklihood it has allocated memory elsewhere either previously
> or atomically, or
>
> - an administrator tunes the oom_adj value for such a task to describe
> the above behavior even for smaller tasks and their liklihood to
> allocate outside of their exclusive cpuset.
>

In other instances, it can actually also kill some innocent tasks unless
the administrator tunes oom_adj; say something like kvm, which would have a
huge amount of memory accounted to it, but possibly from a different node
altogether. Killing a single VM kills all of the processes in that guest
OS ;) Don't you think this has to be fixed^Wimproved?

> > This patch adds one more easier way for the administrator to over-ride.
>
> Yeah, I know. But the problem with the approach is that it specifies an
> oom priority for both global unconstrained ooms and cpuset-constrained
> ooms.
>
> It's quite possible with your patch to identify an aggregate of tasks that
> should be killed first whenever the system is completely out of memory.
> That's great, and solves your problem. But that same system cannot
> correctly use cpusets that have the potential to ever oom because your
> patch completely overrides the victim priority and could needlessly kill
> tasks that will not lead to future memory freeing.
>
> That's my objection to the proposal: it doesn't behave appropriately for
> both global unconstrained ooms and cpuset-constrained ooms at the same
> time.
>

So you are against specifying order when it is a cpuset-constrained oom.
Here is a revised version of the patch which adds oom.cpuset_constraint;
when set to 1, it disables the ordering imposed by this controller for
cpuset-constrained ooms. Will this work for you?

Thanks
Nikanth

From: Nikanth Karthikesan <[email protected]>

Cgroup based OOM killer controller

Signed-off-by: Nikanth Karthikesan <[email protected]>

---

This is a container group based approach to override the oom killer selection
without losing all the benefits of the current oom killer heuristics and
oom_adj interface.

It adds a tunable oom.victim to the oom cgroup. The oom killer will kill the
process using the usual badness value but only within the cgroup with the
maximum value for oom.victim before killing any process from a cgroup with a
lesser oom.victim number. Oom killing could be disabled by setting
oom.victim=0.

Difference from oom_adj:
This controller allows users to specify a "strict order" for oom kills.
While oom_adj works just as a hint for the kernel, the OOM Killer
controller gives users full control over the order in which processes
should be killed. It is very hard to specify an oom-kill order for several
applications using just oom_adj, because one needs to adjust the oom_adj of
various tasks based on the oom_score of several tasks, which keeps changing.

CPUSET constrained OOM:
Also, the tunable oom.cpuset_constraint, when enabled, disables the
ordering imposed by this controller for cpuset-constrained OOMs.

diff --git a/Documentation/cgroups/oom.txt b/Documentation/cgroups/oom.txt
new file mode 100644
index 0000000..772fb41
--- /dev/null
+++ b/Documentation/cgroups/oom.txt
@@ -0,0 +1,34 @@
+OOM Killer controller
+--- ------ ----------
+
+The OOM killer kills a process based on a set of heuristics chosen so that
+only a minimum amount of work done will be lost, a large amount of memory
+will be recovered, and a minimum number of processes are killed.
+
+The user can adjust the score used to select the processes to be killed using
+/proc/<pid>/oom_adj. Giving it a high score will increase the likelihood of
+this process being killed by the oom-killer. Valid values are in the range
+-16 to +15, plus the special value -17, which disables oom-killing altogether
+for that process.
+
+But it is very difficult to suggest an order among tasks to be killed during
+Out Of Memory situation. The OOM Killer controller aids in doing that.
+
+USAGE
+-----
+
+Mount the oom controller by passing 'oom' when mounting cgroups. Echo
+a value in oom.victim file to change the order. The oom killer would kill
+all the processes in a cgroup with a higher oom.victim value before killing a
+process in a cgroup with lower oom.victim value. Among those tasks with same
+oom.victim value, the usual badness heuristics would be applied. The
+/proc/<pid>/oom_adj still helps adjusting the oom killer score. Also having
+oom.victim = 0 would disable oom killing for the tasks in that cgroup.
+
+Note: If this is used without proper consideration, innocent processes may
+get killed unnecessarily.
+
+CPUSET constrained OOM:
+Setting oom.cpuset_constraint=1 would disable the ordering during a cpuset
+constrained oom. Setting oom.cpuset_constraint=0 would not distinguish
+between a cpuset constrained oom and system wide oom.
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 9c8d31b..6944f99 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -59,4 +59,8 @@ SUBSYS(freezer)
SUBSYS(net_cls)
#endif

+#ifdef CONFIG_CGROUP_OOM
+SUBSYS(oom)
+#endif
+
/* */
diff --git a/include/linux/oomcontrol.h b/include/linux/oomcontrol.h
new file mode 100644
index 0000000..d4c4c72
--- /dev/null
+++ b/include/linux/oomcontrol.h
@@ -0,0 +1,19 @@
+#ifndef _LINUX_OOMCONTROL_H
+#define _LINUX_OOMCONTROL_H
+
+#ifdef CONFIG_CGROUP_OOM
+struct oom_cgroup {
+ struct cgroup_subsys_state css;
+ /*
+ * the order to be victimized for this group
+ */
+ u64 victim;
+
+ /*
+ * disable during cpuset constrained oom
+ */
+ bool *cpuset_constraint;
+};
+
+#endif
+#endif
diff --git a/init/Kconfig b/init/Kconfig
index 2af8382..99ed0de 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -354,6 +354,15 @@ config CGROUP_DEBUG

Say N if unsure.

+config CGROUP_OOM
+ bool "Oom cgroup subsystem"
+ depends on CGROUPS
+ help
+ This provides a cgroup subsystem which aids controlling
+ the order in which tasks should be killed during
+ out of memory situations.
+
+
config CGROUP_NS
bool "Namespace cgroup subsystem"
depends on CGROUPS
diff --git a/mm/Makefile b/mm/Makefile
index 72255be..a5d7222 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,3 +33,4 @@ obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_CGROUP_OOM) += oomcontrol.o
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 40ba050..555ecb6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -26,6 +26,7 @@
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/memcontrol.h>
+#include <linux/oomcontrol.h>
#include <linux/security.h>

int sysctl_panic_on_oom;
@@ -200,11 +201,15 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
* (not docbooked, we don't want this one cluttering up the manual)
*/
static struct task_struct *select_bad_process(unsigned long *ppoints,
- struct mem_cgroup *mem)
+ struct mem_cgroup *mem, int cpuset_constrained)
{
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
struct timespec uptime;
+#ifdef CONFIG_CGROUP_OOM
+ u64 chosenvictim = 1, taskvictim;
+ bool honour_cpuset_constraint;
+#endif
*ppoints = 0;

do_posix_clock_monotonic_gettime(&uptime);
@@ -257,10 +262,30 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
continue;

points = badness(p, uptime.tv_sec);
+#ifdef CONFIG_CGROUP_OOM
+ taskvictim = (container_of(p->cgroups->subsys[oom_subsys_id],
+ struct oom_cgroup, css))->victim;
+ honour_cpuset_constraint = *(container_of(p->cgroups->subsys[oom_subsys_id],
+ struct oom_cgroup, css))->cpuset_constraint;
+
+ if (taskvictim > chosenvictim ||
+ (((taskvictim == chosenvictim) ||
+ (cpuset_constrained && honour_cpuset_constraint))
+ && points > *ppoints) ||
+ (taskvictim && !chosen)) {
+
+ chosen = p;
+ *ppoints = points;
+ chosenvictim = taskvictim;
+
+ }
+
+#else
if (points > *ppoints || !chosen) {
chosen = p;
*ppoints = points;
}
+#endif
} while_each_thread(g, p);

return chosen;
@@ -431,7 +456,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)

read_lock(&tasklist_lock);
retry:
- p = select_bad_process(&points, mem);
+ p = select_bad_process(&points, mem, 0); /* not cpuset constrained */
if (PTR_ERR(p) == -1UL)
goto out;

@@ -513,7 +538,7 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
/*
* Must be called with tasklist_lock held for read.
*/
-static void __out_of_memory(gfp_t gfp_mask, int order)
+static void __out_of_memory(gfp_t gfp_mask, int order, int cpuset_constrained)
{
if (sysctl_oom_kill_allocating_task) {
oom_kill_process(current, gfp_mask, order, 0, NULL,
@@ -528,7 +553,7 @@ retry:
* Rambo mode: Shoot down a process and hope it solves whatever
* issues we may have.
*/
- p = select_bad_process(&points, NULL);
+ p = select_bad_process(&points, NULL, cpuset_constrained);

if (PTR_ERR(p) == -1UL)
return;
@@ -569,7 +594,8 @@ void pagefault_out_of_memory(void)
panic("out of memory from page fault. panic_on_oom is selected.\n");

read_lock(&tasklist_lock);
- __out_of_memory(0, 0); /* unknown gfp_mask and order */
+ /* unknown gfp_mask and order and not cpuset constrained */
+ __out_of_memory(0, 0, 0);
read_unlock(&tasklist_lock);

/*
@@ -623,7 +649,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
panic("out of memory. panic_on_oom is selected\n");
/* Fall-through */
case CONSTRAINT_CPUSET:
- __out_of_memory(gfp_mask, order);
+ __out_of_memory(gfp_mask, order, 1);
break;
}

diff --git a/mm/oomcontrol.c b/mm/oomcontrol.c
new file mode 100644
index 0000000..24ee0da
--- /dev/null
+++ b/mm/oomcontrol.c
@@ -0,0 +1,138 @@
+/*
+ * kernel/cgroup_oom.c - oom handler cgroup.
+ */
+
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/oomcontrol.h>
+
+/*
+ * Helper to retrieve oom controller data from cgroup
+ */
+static struct oom_cgroup *oom_css_from_cgroup(struct cgroup *cgrp)
+{
+ return container_of(cgroup_subsys_state(cgrp,
+ oom_subsys_id), struct oom_cgroup,
+ css);
+}
+
+
+static struct cgroup_subsys_state *oom_create(struct cgroup_subsys *ss,
+ struct cgroup *cont)
+{
+ struct oom_cgroup *oom_css = kzalloc(sizeof(*oom_css), GFP_KERNEL);
+ struct oom_cgroup *parent;
+
+ if (!oom_css)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * if root last/only group to be victimized
+ * else inherit parents value
+ */
+ if (cont->parent == NULL) {
+ oom_css->victim = 1;
+ oom_css->cpuset_constraint =
+ kzalloc(sizeof(*oom_css->cpuset_constraint), GFP_KERNEL);
+ *oom_css->cpuset_constraint = false;
+ } else {
+ parent = oom_css_from_cgroup(cont->parent);
+ oom_css->victim = parent->victim;
+ oom_css->cpuset_constraint = parent->cpuset_constraint;
+ }
+
+ return &oom_css->css;
+}
+
+static void oom_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ struct oom_cgroup *oom_css = oom_css_from_cgroup(cont);
+ if (cont->parent == NULL)
+ kfree(oom_css->cpuset_constraint);
+ kfree(cont->subsys[oom_subsys_id]);
+}
+
+static int oom_victim_write(struct cgroup *cgrp, struct cftype *cft,
+ u64 val)
+{
+
+ cgroup_lock();
+
+ (oom_css_from_cgroup(cgrp))->victim = val;
+
+ cgroup_unlock();
+
+ return 0;
+}
+
+static u64 oom_victim_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ u64 victim = (oom_css_from_cgroup(cgrp))->victim;
+
+ return victim;
+}
+
+static int oom_cpuset_write(struct cgroup *cont, struct cftype *cft,
+ const char *buffer)
+{
+ if (buffer[0] == '1' && buffer[1] == 0)
+ *(oom_css_from_cgroup(cont))->cpuset_constraint = true;
+ else if (buffer[0] == '0' && buffer[1] == 0)
+ *(oom_css_from_cgroup(cont))->cpuset_constraint = false;
+ else
+ return -EINVAL;
+ return 0;
+}
+
+static u64 oom_cpuset_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ if (*(oom_css_from_cgroup(cgrp))->cpuset_constraint)
+ return 1;
+ else
+ return 0;
+}
+
+static struct cftype oom_cgroup_files[] = {
+ {
+ .name = "victim",
+ .read_u64 = oom_victim_read,
+ .write_u64 = oom_victim_write,
+ },
+};
+
+static struct cftype oom_cgroup_root_files[] = {
+ {
+ .name = "victim",
+ .read_u64 = oom_victim_read,
+ .write_u64 = oom_victim_write,
+ },
+ {
+ .name = "cpuset_constraint",
+ .read_u64 = oom_cpuset_read,
+ .write_string = oom_cpuset_write,
+ },
+};
+
+static int oom_populate(struct cgroup_subsys *ss,
+ struct cgroup *cont)
+{
+ int ret;
+
+ if (cont->parent == NULL) {
+ ret = cgroup_add_files(cont, ss, oom_cgroup_root_files,
+ ARRAY_SIZE(oom_cgroup_root_files));
+ } else {
+ ret = cgroup_add_files(cont, ss, oom_cgroup_files,
+ ARRAY_SIZE(oom_cgroup_files));
+ }
+ return ret;
+}
+
+struct cgroup_subsys oom_subsys = {
+ .name = "oom",
+ .subsys_id = oom_subsys_id,
+ .create = oom_create,
+ .destroy = oom_destroy,
+ .populate = oom_populate,
+};

2009-01-23 20:47:16

by David Rientjes

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Fri, 23 Jan 2009, Nikanth Karthikesan wrote:

> In other instances, It can actually also kill some innocent tasks unless the
> administrator tunes oom_adj, say something like kvm which would have a huge
> memory accounted, but might be from a different node altogether. Killing a
> single vm is killing all of the processes in that OS ;) Don't you think this
> has to be fixed^Wimproved?
>

As previously stated, I think the heuristic to penalize tasks for not
having an intersection with the set of allowable nodes of the oom
triggering task could be made slightly more severe. That's irrelevant to
your patch, though.

> > That's my objection to the proposal: it doesn't behave appropriately for
> > both global unconstrained ooms and cpuset-constrained ooms at the same
> > time.
> >
>
> So you are against specifying order when it is a cpuset-constrained oom. Here
> is a revised version of the patch which adds oom.cpuset_constraint, when set
> to 1 would disable the ordering imposed by this controller for cpuset
> constrained ooms! Will this work for you?
>

No, I don't think it's appropriate to add special exemptions for specific
subsystems to what should be a generic cgroup. I think it is much more
powerful to defer these decisions to userspace so each cgroup can attach
its own handler and implement the necessary decision-making that the
kernel could never perfectly handle for all possible workloads.

It is trivial to implement the equivalent of this particular change as a
userspace handler that SIGKILLs all tasks in a specific cgroup when the
cgroup oom handler is woken up at the time of oom. Additionally, it could
also respond in other ways, such as adding a node to a cpuset, killing a
less important cgroup, elevating a memory controller limit, sending a
signal to your application to release memory, etc.

We also talked about a cgroup /dev/mem_notify device file that you can
poll() and learn of low memory situations so that appropriate action can
be taken even in lowmem situations as opposed to simply oom conditions.

These types of policy decisions belong in userspace.

2009-01-26 19:54:49

by Balbir Singh

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

* Nikanth Karthikesan <[email protected]> [2009-01-21 16:38:21]:

> As Alan Cox suggested/wondered in this thread,
> http://lkml.org/lkml/2009/1/12/235 , this is a container group based approach
> to override the oom killer selection without losing all the benefits of the
> current oom killer heuristics and oom_adj interface.
>
> It adds a tunable oom.victim to the oom cgroup. The oom killer will kill the
> process using the usual badness value but only within the cgroup with the
> maximum value for oom.victim before killing any process from a cgroup with a
> lesser oom.victim number. Oom killing could be disabled by setting
> oom.victim=0.

Looking at the patch, I wonder if it is time for the user space OOM
notifications that were discussed during the containers mini-summit.
The idea is to inform user space about OOMs and let user space take
action; if no action is taken, the default handler kicks in.

--
Balbir

2009-01-26 19:57:57

by Alan

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Tue, 27 Jan 2009 01:24:31 +0530
Balbir Singh <[email protected]> wrote:

> * Nikanth Karthikesan <[email protected]> [2009-01-21 16:38:21]:
>
> > As Alan Cox suggested/wondered in this thread,
> > http://lkml.org/lkml/2009/1/12/235 , this is a container group based approach
> > to override the oom killer selection without losing all the benefits of the
> > current oom killer heuristics and oom_adj interface.
> >
> > It adds a tunable oom.victim to the oom cgroup. The oom killer will kill the
> > process using the usual badness value but only within the cgroup with the
> > maximum value for oom.victim before killing any process from a cgroup with a
> > lesser oom.victim number. Oom killing could be disabled by setting
> > oom.victim=0.
>
> Looking at the patch, I wonder if it is time for user space OOM
> notifications that were discussed during the containers mini-summit.
> The idea is to inform user space about OOM's and let user space take
> action, if no action is taken, the default handler kicks in.

The OLPC folks (Marcelo I believe) posted code for this and I believe
OLPC is using this functionality internally so that under memory pressure
(before we actually hit OOM) programs can respond by doing stuff like
evicting caches.

2009-01-27 07:02:47

by KOSAKI Motohiro

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

Hi

> > > As Alan Cox suggested/wondered in this thread,
> > > http://lkml.org/lkml/2009/1/12/235 , this is a container group based approach
> > > to override the oom killer selection without losing all the benefits of the
> > > current oom killer heuristics and oom_adj interface.
> > >
> > > It adds a tunable oom.victim to the oom cgroup. The oom killer will kill the
> > > process using the usual badness value but only within the cgroup with the
> > > maximum value for oom.victim before killing any process from a cgroup with a
> > > lesser oom.victim number. Oom killing could be disabled by setting
> > > oom.victim=0.
> >
> > Looking at the patch, I wonder if it is time for user space OOM
> > notifications that were discussed during the containers mini-summit.
> > The idea is to inform user space about OOM's and let user space take
> > action, if no action is taken, the default handler kicks in.
>
> The OLPC folks (Marcelo I believe) posted code for this and I believe
> OLPC is using this functionality internally so that under memory pressure
> (before we actually hit OOM) programs can respond by doing stuff like
> evicting caches.

Confused.

As far as I know, people want a flexible way of treating caches,
but oom seems less flexible than userland notification.

Why do you think notification is bad?

2009-01-27 07:26:23

by Balbir Singh

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

* KOSAKI Motohiro <[email protected]> [2009-01-27 16:02:34]:

> Hi
>
> > > > As Alan Cox suggested/wondered in this thread,
> > > > http://lkml.org/lkml/2009/1/12/235 , this is a container group based approach
> > > > to override the oom killer selection without losing all the benefits of the
> > > > current oom killer heuristics and oom_adj interface.
> > > >
> > > > It adds a tunable oom.victim to the oom cgroup. The oom killer will kill the
> > > > process using the usual badness value but only within the cgroup with the
> > > > maximum value for oom.victim before killing any process from a cgroup with a
> > > > lesser oom.victim number. Oom killing could be disabled by setting
> > > > oom.victim=0.
> > >
> > > Looking at the patch, I wonder if it is time for user space OOM
> > > notifications that were discussed during the containers mini-summit.
> > > The idea is to inform user space about OOM's and let user space take
> > > action, if no action is taken, the default handler kicks in.
> >
> > The OLPC folks (Marcelo I believe) posted code for this and I believe
> > OLPC is using this functionality internally so that under memory pressure
> > (before we actually hit OOM) programs can respond by doing stuff like
> > evicting caches.
>

I did see the patches on linux-mm, but a more generic cgroup patch
would help both cases; in the absence of cgroups, the default cgroup
will contain all tasks and can carry out the handling.

> Confused.
>
> As far as I know, people want the method of flexible cache treating.
> but oom seems less flexible than userland notification.
>
> Why do you think notification is bad?

I did not find Alan's message confusing or stating that notification
was bad, but I might be misreading it.

--
Balbir

2009-01-27 07:42:01

by David Rientjes

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Tue, 27 Jan 2009, KOSAKI Motohiro wrote:

> Confused.
>
> As far as I know, people want the method of flexible cache treating.
> but oom seems less flexible than userland notification.
>
> Why do you think notification is bad?
>

There're a couple of proposals that have been discussed recently that
share some functional behavior.

One is the cgroup oom notifier that allows you to attach a task to wait on
an oom condition for a collection of tasks. That allows userspace to
respond to the condition by dropping caches, adding nodes to a cpuset,
elevating memory controller limits, sending a signal, etc. It can also
defer to the kernel oom killer as a last resort.

The other is /dev/mem_notify that allows you to poll() on a device file
and be informed of low memory events. This can include the cgroup oom
notifier behavior when a collection of tasks is completely out of memory,
but can also warn when such a condition may be imminent. I suggested that
this be implemented as a client of cgroups so that different handlers can
be responsible for different aggregates of tasks.

I think the latter is a much more powerful tool and includes all the
behavior of the former. It preserves the oom killer as a last resort for
the kernel and defers all preference killing or lowmem responses to
userspace.
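
As a rough illustration of the second approach (purely a sketch: /dev/mem_notify
was never merged, so the path and event format here are assumptions), a
userspace handler built around poll() could look like this:

#include <fcntl.h>
#include <poll.h>
#include <signal.h>
#include <unistd.h>

int main(void)
{
        struct pollfd pfd;
        char buf[64];

        pfd.fd = open("/dev/mem_notify", O_RDONLY);
        if (pfd.fd < 0)
                return 1;
        pfd.events = POLLIN;

        for (;;) {
                if (poll(&pfd, 1, -1) <= 0)
                        continue;
                /* consume the event; its format is up to the implementation */
                read(pfd.fd, buf, sizeof(buf));

                /*
                 * React before the kernel oom killer has to: drop caches,
                 * raise a memory controller limit, add nodes to a cpuset,
                 * or kill a low-priority task of our own choosing, e.g.
                 * kill(low_priority_pid, SIGKILL);
                 */
        }
}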

2009-01-27 07:44:54

by KOSAKI Motohiro

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

> On Tue, 27 Jan 2009, KOSAKI Motohiro wrote:
>
> > Confused.
> >
> > As far as I know, people want the method of flexible cache treating.
> > but oom seems less flexible than userland notification.
> >
> > Why do you think notification is bad?
> >
>
> There're a couple of proposals that have been discussed recently that
> share some functional behavior.
>
> One is the cgroup oom notifier that allows you to attach a task to wait on
> an oom condition for a collection of tasks. That allows userspace to
> respond to the condition by droping caches, adding nodes to a cpuset,
> elevating memory controller limits, sending a signal, etc. It can also
> defer to the kernel oom killer as a last resort.
>
> The other is /dev/mem_notify that allows you to poll() on a device file
> and be informed of low memory events. This can include the cgroup oom
> notifier behavior when a collection of tasks is completely out of memory,
> but can also warn when such a condition may be imminent. I suggested that
> this be implemented as a client of cgroups so that different handlers can
> be responsible for different aggregates of tasks.
>
> I think the latter is a much more powerful tool and includes all the
> behavior of the former. It preserves the oom killer as a last resort for
> the kernel and defers all preference killing or lowmem responses to
> userspace.

Yup, indeed. :)
Honestly, I talked about the same thing recently in the "lowmemory android driver not needed?" thread.


2009-01-27 07:52:25

by David Rientjes

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Tue, 27 Jan 2009, KOSAKI Motohiro wrote:

> Yup, indeed. :)
> honestly, I talked about the same thingk recently "lowmemory android driver not needed?" thread.
>

Yeah, I proposed /dev/mem_notify being made as a client of cgroups there
in http://marc.info/?l=linux-kernel&m=123200623628685

How do you replace the oom killer's capability of giving a killed task
access to memory reserves with TIF_MEMDIE in userspace?

2009-01-27 09:31:25

by Evgeniy Polyakov

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Mon, Jan 26, 2009 at 11:51:27PM -0800, David Rientjes ([email protected]) wrote:
> Yeah, I proposed /dev/mem_notify being made as a client of cgroups there
> in http://marc.info/?l=linux-kernel&m=123200623628685
>
> How do you replace the oom killer's capability of giving a killed task
> access to memory reserves with TIF_MEMDIE in userspace?

/dev/mem_notify is a great idea, but please do not limit the existing
oom-killer in its ability to do its job, and do not rely on an application's
ability to send a SIGKILL, which, unlike the oom-killer, will not kill tasks
in an unkillable state.

--
Evgeniy Polyakov

2009-01-27 09:38:53

by David Rientjes

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Tue, 27 Jan 2009, Evgeniy Polyakov wrote:

> /dev/mem_notify is a great idea, but please do not limit existing
> oom-killer in its ability to do the job and do not rely on application's
> ability to send a SIGKILL which will not kill tasks in unkillable state
> contrary to oom-killer.
>

You're missing the point, /dev/mem_notify would notify userspace of lowmem
situations and allow it to respond appropriately in any number of ways
before an oom condition exists.

When the system (or cpuset, memory controller, etc) is oom, userspace can
choose to defer to the oom killer so that it may kill a task that would
most likely lead to future memory freeing with access to memory reserves.

There is no additional oom killer limitation imposed here, nor can the oom
killer kill a task hung in D state any better than userspace.

Thanks.

2009-01-27 10:22:56

by Nikanth Karthikesan

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Saturday 24 January 2009 02:14:59 David Rientjes wrote:
> On Fri, 23 Jan 2009, Nikanth Karthikesan wrote:
> > In other instances, It can actually also kill some innocent tasks unless
> > the administrator tunes oom_adj, say something like kvm which would have
> > a huge memory accounted, but might be from a different node altogether.
> > Killing a single vm is killing all of the processes in that OS ;) Don't
> > you think this has to be fixed^Wimproved?
>
> As previously stated, I think the heuristic to penalize tasks for not
> having an intersection with the set of allowable nodes of the oom
> triggering task could be made slightly more severe. That's irrelevant to
> your patch, though.
>

But the heuristic makes it non-deterministic, unlike the memcg case. And this
mandates special handling for cpuset-constrained OOM conditions in this patch.

> > > That's my objection to the proposal: it doesn't behave appropriately
> > > for both global unconstrained ooms and cpuset-constrained ooms at the
> > > same time.
> >
> > So you are against specifying order when it is a cpuset-constrained oom.
> > Here is a revised version of the patch which adds oom.cpuset_constraint,
> > when set to 1 would disable the ordering imposed by this controller for
> > cpuset constrained ooms! Will this work for you?
>
> No, I don't think it's appropriate to add special exemptions for specific
> subsystems to what should be a generic cgroup. I think it is much more
> powerful to defer these decisions to userspace so each cgroup can attach
> its own handler and implement the necessary decision-making that the
> kernel could never perfectly handle for all possible workloads.
>
> It is trivial to implement the equivalent of this particular change as a
> userspace handler to SIGKILL all tasks in a specific cgroup when the
> cgroup oom handler is woken up at the time of oom. Additionally, it could
> also respond in other ways such as adding a node to a cpuset, killing a
> less important cgroup, elevate a memory controller limit, send a signal to
> your application to release memory, etc.
>
> We also talked about a cgroup /dev/mem_notify device file that you can
> poll() and learn of low memory situations so that appropriate action can
> be taken even in lowmem situations as opposed to simply oom conditions.
>

Does userspace also need to handle cpuset-constrained _almost-oom_ situations
specially? I wonder how easily userspace can handle that.

> These types of policy decisions belong in userspace.

Yes, policy decisions will be made in user-space using this oom-controller.
This is just a framework/means to enforce policies. We do not make any
decisions inside the kernel.

But yes, the badness calculation by the oom killer implements some kind of
policy inside the kernel; I guess it can stay, though, as this oom-controller
lets user policy override kernel policy. ;-)

Thanks
Nikanth

2009-01-27 10:41:20

by KOSAKI Motohiro

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

Hi Evgeniy,

> On Mon, Jan 26, 2009 at 11:51:27PM -0800, David Rientjes ([email protected]) wrote:
> > Yeah, I proposed /dev/mem_notify being made as a client of cgroups there
> > in http://marc.info/?l=linux-kernel&m=123200623628685
> >
> > How do you replace the oom killer's capability of giving a killed task
> > access to memory reserves with TIF_MEMDIE in userspace?
>
> /dev/mem_notify is a great idea, but please do not limit existing
> oom-killer in its ability to do the job and do not rely on application's
> ability to send a SIGKILL which will not kill tasks in unkillable state
> contrary to oom-killer.

I'd like to respect your requirement, but I would also like to know
why you prefer a deterministic hierarchical oom over notification.

I think one of the problems is that the current patch description is a bit
poor and doesn't describe things from an administrator's point of view.

Could you please sort the discussion out and explain your requirements in
detail? Otherwise (I guess) this discussion won't reach agreement.

I don't like "this implementation vs. that idea" discussions;
they often aren't productive. I'd like to sort out people's requirements,
otherwise I can't review whether the patch fulfils them or not.


2009-01-27 10:53:59

by David Rientjes

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Tue, 27 Jan 2009, Nikanth Karthikesan wrote:

> > As previously stated, I think the heuristic to penalize tasks for not
> > having an intersection with the set of allowable nodes of the oom
> > triggering task could be made slightly more severe. That's irrelevant to
> > your patch, though.
> >
>
> But the heuristic makes it non-deterministic, unlike memcg case. And this
> mandates special handling for cpuset constrained OOM conditions in this patch.
>

Dividing a badness score by 8 if a task's set of allowable nodes does not
intersect with the oom triggering task's set does not make an otherwise
deterministic algorithm non-deterministic.

I don't understand what you're arguing for here. Are you suggesting that
we should not prefer tasks that intersect the set of allowable nodes?
That makes no sense if the goal is to allow for future memory freeing.

> > We also talked about a cgroup /dev/mem_notify device file that you can
> > poll() and learn of low memory situations so that appropriate action can
> > be taken even in lowmem situations as opposed to simply oom conditions.
> >
>
> Userspace also needs to handle the cpuset constrained _almost-oom_'s
> specially? I wonder how easily userspace can handle that.
>

It handles it very well, cpusets are a client of cgroups and the
/dev/mem_notify extension would be as well. So to handle lowmem
notifications for your cpuset, you would mount both cgroup subsystems at
the same time and then poll() on the mem_notify file. It would be
responsible for the aggregate of tasks that the cgroup represents.

If lowmem notifications are implemented in the reclaim path, this is much
easier for cpusets than for the memory controller, actually, since we
already collect per-node ZVC information.

> > These types of policy decisions belong in userspace.
>
> Yes, policy decisions will be made in user-space using this oom-controller.
> This is just a framework/means to enforce policies. We do not make any
> decisions inside the kernel.
>
> But yes, the badness calculation by the oom killer implements some kind of
> policy inside the kernel, but I guess it can stay, as this oom-controller lets
> user policy over-ride kernel policy. ;-)
>

The goal should not be to override the kernel's choice, because that
decision depends heavily on the type of oom and the state of the machine
at the time. Appropriate changes to the oom killer's heuristics are
always welcome; in my opinion, we should probably penalize tasks that do
not intersect the triggering task's set of allowable nodes more than we
currently do.

I think you'll find that your goals can be accomplished with a mem_notify
cgroup and that it is a much more powerful interface so that your
userspace policy can be better informed, especially if it is aware of
lowmem situations where oom conditions are imminent.

2009-01-27 11:11:11

by Nikanth Karthikesan

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Tuesday 27 January 2009 16:23:00 David Rientjes wrote:
> On Tue, 27 Jan 2009, Nikanth Karthikesan wrote:
> > > As previously stated, I think the heuristic to penalize tasks for not
> > > having an intersection with the set of allowable nodes of the oom
> > > triggering task could be made slightly more severe. That's irrelevant
> > > to your patch, though.
> >
> > But the heuristic makes it non-deterministic, unlike memcg case. And this
> > mandates special handling for cpuset constrained OOM conditions in this
> > patch.
>
> Dividing a badness score by 8 if a task's set of allowable nodes do not
> insect with the oom triggering task's set does not make an otherwise
> deterministic algorithm non-deterministic.
>
> I don't understand what you're arguing for here. Are you suggesting that
> we should not prefer tasks that intersect the set of allowable nodes?
> That makes no sense if the goal is to allow for future memory freeing.
>

No. Actually I am just wondering whether it would be possible to check that a
particular task has memory allocated or mmapped from this node, to avoid
killing an innocent task. I compared it with memcg to say that memcg never
kills a task unrelated to the memcg-constrained oom. Sorry if I was unclear
earlier. If we did this, the oom-controller would not require special handling
for cpuset-constrained ooms.

Thanks
Nikanth

2009-01-27 11:23:18

by David Rientjes

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Tue, 27 Jan 2009, Nikanth Karthikesan wrote:

> > I don't understand what you're arguing for here. Are you suggesting that
> > we should not prefer tasks that intersect the set of allowable nodes?
> > That makes no sense if the goal is to allow for future memory freeing.
> >
>
> No. Actually I am just wondering, will it be possible to check whether a
> particular task has memory allocated or mmaped from this node to avoid killing
> an innocent task.

That's certainly idealistic, but cannot be done in an inexpensive way that
would scale with the large systems that clients of cpusets typically use.

2009-01-27 11:40:31

by Nikanth Karthikesan

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Tuesday 27 January 2009 16:51:26 David Rientjes wrote:
> On Tue, 27 Jan 2009, Nikanth Karthikesan wrote:
> > > I don't understand what you're arguing for here. Are you suggesting
> > > that we should not prefer tasks that intersect the set of allowable
> > > nodes? That makes no sense if the goal is to allow for future memory
> > > freeing.
> >
> > No. Actually I am just wondering, will it be possible to check whether a
> > particular task has memory allocated or mmaped from this node to avoid
> > killing an innocent task.
>
> That's certainly idealistic, but cannot be done in an inexpensive way that
> would scale with the large systems that clients of cpusets typically use.

If we kill only the tasks for which cpuset_mems_allowed_intersects() is true
on the first pass, and even then we do not get out of oom, we could go over
the task list again with this expensive check. Using this scheme we could kill
more tasks than required if a task with lots of memory has moved to a
different cpuset. But that should be rare, and not as severe as killing a
totally innocent task the way the current scheme does?!
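
Roughly, as a sketch (badness() and cpuset_mems_allowed_intersects() already
exist in mm/oom_kill.c; task_has_pages_on_oom_nodes() is purely hypothetical):

static struct task_struct *pick_victim(unsigned long *ppoints, int pass)
{
        struct task_struct *g, *p, *chosen = NULL;
        unsigned long points;
        struct timespec uptime;

        do_posix_clock_monotonic_gettime(&uptime);
        do_each_thread(g, p) {
                /* pass 0: only tasks sharing nodes with the triggering task */
                if (pass == 0 && !cpuset_mems_allowed_intersects(current, p))
                        continue;
                /* pass 1: the expensive "has pages on the oom nodes" check */
                if (pass == 1 && !task_has_pages_on_oom_nodes(p))
                        continue;

                points = badness(p, uptime.tv_sec);
                if (points > *ppoints || !chosen) {
                        chosen = p;
                        *ppoints = points;
                }
        } while_each_thread(g, p);

        return chosen;  /* caller retries with pass 1 only if pass 0 failed */
}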

Thanks
Nikanth

2009-01-27 13:40:56

by Evgeniy Polyakov

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

Hi David.

On Tue, Jan 27, 2009 at 01:37:55AM -0800, David Rientjes ([email protected]) wrote:
> > /dev/mem_notify is a great idea, but please do not limit existing
> > oom-killer in its ability to do the job and do not rely on application's
> > ability to send a SIGKILL which will not kill tasks in unkillable state
> > contrary to oom-killer.
> >
>
> You're missing the point, /dev/mem_notify would notify userspace of lowmem
> situations and allow it to respond appropriately in any number of ways
> before an oom condition exists.

Yes, I know.

> When the system (or cpuset, memory controller, etc) is oom, userspace can
> choose to defer to the oom killer so that it may kill a task that would
> most likely lead to future memory freeing with access to memory reserves.
>
> There is no additional oom killer limitation imposed here, nor can the oom
> killer kill a task hung in D state any better than userspace.

Well, the oom-killer can, since it drops the unkillable state from the process
mask; that may not be enough, but it tries harder than userspace can.

My main point was to have a way to monitor memory usage so that any
process could tune its own behaviour according to that information, which is
not related to the system oom-killer at all. Thus /dev/mem_notify is
interesting first (and only first) as a memory usage notification
interface, and not as a way to invoke any kind of 'soft' oom-killer.
An application can do whatever it wants, of course, including killing itself
or its neighbours, but this should not be forced as a usage policy.

--
Evgeniy Polyakov

2009-01-27 13:46:18

by Evgeniy Polyakov

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

Hi.

On Tue, Jan 27, 2009 at 07:40:58PM +0900, KOSAKI Motohiro ([email protected]) wrote:
> I'd like to respect your requiremnt. but I also would like to know
> why you like deterministic hierarchy oom than notification.
>
> I think one of problem is, current patch description is a bit poor
> and don't describe from administrator view.

Notification of the memory state is by all means a great idea.
Any process which cares about the system state can register and make
some decisions based on the memory state. But if it fails to adapt to
the current situation, the main oom-killer has to step in so the system
can make progress.

As I wrote multiple times, there may be quite trivial situations where a
process will not be able to make progress (it will not be able to free
some data even if its memory notification callback is invoked in some
cases), so we just cannot rely on that. After all, there may be no
processes registered for such notifications at all, so we should be able to
tune the main oom-killer, which is another story compared to the
/dev/mem_notify discussion.

Having some special application which will monitor /dev/mem_notify and
kill processes based on its own heuristics is a good idea, but when it
fails to do its work (or does not exist) the system has to have the ability
to make progress and invoke the main oom-killer.

--
Evgeniy Polyakov

2009-01-27 15:41:17

by Balbir Singh

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

* Evgeniy Polyakov <[email protected]> [2009-01-27 16:45:59]:

> Hi.
>
> On Tue, Jan 27, 2009 at 07:40:58PM +0900, KOSAKI Motohiro ([email protected]) wrote:
> > I'd like to respect your requiremnt. but I also would like to know
> > why you like deterministic hierarchy oom than notification.
> >
> > I think one of problem is, current patch description is a bit poor
> > and don't describe from administrator view.
>
> Notification of the memory state is by no means a great idea.
> Any process which cares about the system state can register and make
> some decisions based on the memory state. But if it fails to update to
> the current situation, the main oom-killer has to enter the scene and
> make a progress on the system behaviour.
>
> As I wrote multiple times there may be a quite trivial situation, when
> process will not be able to make progress (it will not be able to free
> some data even if its memory notification callback is invoked in some
> cases), so we just can not rely on that. After all there may be no
> processes with given notifications registered, so we should be able to
> tune main oom-killer, which is another story compared to the
> /dev/mem_notify discussion.
>
> Having some special application which will monitor /dev/mem_notify and
> kill processes based on its own hueristics is a good idea, but when it
> fails to do its work (or does not exist) system has to have ability to
> make a progress and invoke a main oom-killer.
>

The last part is what we discussed at the mini-summit. There should
be an OOM kill notification to user space, and if that fails, let the kernel
invoke the OOM killer. The exact interface for the notification is not that
interesting; one could use netlink if that works well.

--
Balbir

2009-01-27 20:31:19

by David Rientjes

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Tue, 27 Jan 2009, Nikanth Karthikesan wrote:

> > That's certainly idealistic, but cannot be done in an inexpensive way that
> > would scale with the large systems that clients of cpusets typically use.
>
> If we kill only the tasks for which cpuset_mems_allowed_intersects() is true
> on the first pass and even then if we do not get out of oom, we could go over
> again with this expensive check.

The oom killer has no memory of previous kills, so it's not possible to
determine if there've been a series of recent needless ones. Subsequent
oom conditions should still check for intersection and, since it's only a
heuristic, a large memory-hogging task will eventually be killed if there
are no tasks remaining with such an intersection.

I don't know how you're planning on mapping large memory allocations on
nodes of interest back to specific tasks, however. Do you have a
proposal?

> Using this scheme, could kill more no of
> tasks than required, if a task with lots of memory has moved to a different
> cpuset.

That's rare, since cpusets are used for NUMA optimizations and a set of
cpus has a static affinity to certain memory. It could happen if a
cpuset's set of allowable nodes is made to be smaller, but that seems like
it would trigger the oom in the first place and would encourage killing
tasks with an intersection.

2009-01-27 20:38:48

by David Rientjes

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Tue, 27 Jan 2009, Evgeniy Polyakov wrote:

> > There is no additional oom killer limitation imposed here, nor can the oom
> > killer kill a task hung in D state any better than userspace.
>
> Well, oom-killer can, since it drops unkillable state from the process
> mask, that may be not enough though, but it tries more than userspace.
>

The only thing it does is send a SIGKILL and gives the thread access to
memory reserves with TIF_MEMDIE, it doesn't drop any unkillable state. If
its victim is hung in D state and the memory reserves do not allow it to
return to being runnable, this task will not die and the oom killer would
livelock unless given another target.
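
For reference, the kill path in mm/oom_kill.c at this point boils down to
roughly the following (a simplified sketch, not the verbatim function):

static void __oom_kill_task_sketch(struct task_struct *p)
{
        /*
         * Boost the victim and let it dip into memory reserves so it can
         * exit quickly; nothing here can pull a task out of an
         * uninterruptible (D state) sleep.
         */
        p->rt.time_slice = HZ;
        set_tsk_thread_flag(p, TIF_MEMDIE);
        force_sig(SIGKILL, p);
}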

> My main point was to haev a way to monitor memory usage and that any
> process could tune own behaviour according to that information. Which is
> not realated to the system oom-killer at all. Thus /dev/mem_notify is
> interested first (and only the first) as a memory usage notification
> interface and not a way to invoke any kind of 'soft' oom-killer.

It's a way to prevent invoking the kernel oom killer by allowing userspace
notification of events where methods such as dropping caches, elevating
limits, adding nodes, sending signals, etc, can prevent such a problem.
When the system (or cgroup) is completely oom, it can also issue SIGKILLs
that will free some memory and preempt the oom killer from acting.

I think there might be some confusion about my proposal for extending
/dev/mem_notify. Not only should it notify of certain low memory events,
but it should also allow userspace notification of oom events, just like
the cgroup oom notifier patch allowed. Instead of attaching a task to a
cgroup file in that case, however, this would simply be the responsibility
of a task that has set up a poll() on the cgroup's mem_notify file. A
configurable delay could be imposed so page allocation attempts simply
loop while the userspace handler responds and then only invoke the oom
killer when absolutely necessary.
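
As a sketch of that deferral (nothing like it exists in the allocation path;
mem_notify_wake() and oom_notify_timeout_ms are assumed names), the page
allocator could do something like:

static void oom_defer_to_userspace(struct zonelist *zonelist,
                                   gfp_t gfp_mask, int order)
{
        /* wake userspace handlers polling the cgroup's mem_notify file */
        mem_notify_wake(zonelist, gfp_mask, order);

        /*
         * Bounded grace period for userspace to drop caches, raise limits,
         * add nodes, or kill something of its own choosing. The caller
         * retries the allocation afterwards and only invokes
         * out_of_memory() if it still fails.
         */
        schedule_timeout_killable(msecs_to_jiffies(oom_notify_timeout_ms));
}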

> Application can do whatever it wants of course including killing itself
> or the neighbours, but this should not be forced as a usage policy.
>

If preference killing is your goal, then userspace can do it with the
/dev/mem_notify functionality.

2009-01-27 20:43:19

by David Rientjes

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Tue, 27 Jan 2009, Evgeniy Polyakov wrote:

> Having some special application which will monitor /dev/mem_notify and
> kill processes based on its own hueristics is a good idea, but when it
> fails to do its work (or does not exist) system has to have ability to
> make a progress and invoke a main oom-killer.
>

Agreed, very similar to the cgroup oom notifier patch that invokes the
oom killer if there are no attached tasks waiting to handle the situation.

In this case, it would be a configurable delay to allow userspace to act
in response to oom conditions before invoking the kernel oom killer. So
instead of thinking of this as a userspace replacement for the oom killer,
it simply preempts it if userspace can provide more memory, including the
possibility of killing tasks itself.

2009-01-27 21:51:34

by Evgeniy Polyakov

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Tue, Jan 27, 2009 at 12:37:21PM -0800, David Rientjes ([email protected]) wrote:
> > Well, oom-killer can, since it drops unkillable state from the process
> > mask, that may be not enough though, but it tries more than userspace.
> >
>
> The only thing it does is send a SIGKILL and gives the thread access to
> memory reserves with TIF_MEMDIE, it doesn't drop any unkillable state. If

There is a small difference between force_sig_info() and the usual
send_signal() path used by kill.

> its victim is hung in D state and the memory reserves do not allow it to
> return to being runnable, this task will not die and the oom killer would
> livelock unless given another target.

D-states are different. In the current tree we even have
lock_page_killable(), so it depends.

> > My main point was to haev a way to monitor memory usage and that any
> > process could tune own behaviour according to that information. Which is
> > not realated to the system oom-killer at all. Thus /dev/mem_notify is
> > interested first (and only the first) as a memory usage notification
> > interface and not a way to invoke any kind of 'soft' oom-killer.
>
> It's a way to prevent invoking the kernel oom killer by allowing userspace
> notification of events where methods such as droping caches, elevating
> limits, adding nodes, sending signals, etc, can prevent such a problem.
> When the system (or cgroup) is completely oom, it can also issue SIGKILLs
> that will free some memory and preempt the oom killer from acting.
>
> I think there might be some confusion about my proposal for extending
> /dev/mem_notify. Not only should it notify of certain low memory events,
> but it should also allow userspace notification of oom events, just like
> the cgroup oom notifier patch allowed. Instead of attaching a task to a
> cgroup file in that case, however, this would simply be the responsibility
> of a task that has set up a poll() on the cgroup's mem_notify file. A
> configurable delay could be imposed so page allocation attempts simply
> loop while the userspace handler responds and then only invoke the oom
> killer when absolutely necessary.

I really have no objections against this, or against extending the oom-killer
to allow waiting a bit in the allocation path while userspace makes some
progress. But do not drop the existing oom-killer (i.e. its ability to kill
processes) in favour of this new feature. Let's have both, and if the
extension fails for some reason, the old oom-killer will do the job.

--
Evgeniy Polyakov

2009-01-27 21:54:28

by Evgeniy Polyakov

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Tue, Jan 27, 2009 at 09:10:53PM +0530, Balbir Singh ([email protected]) wrote:
> > Having some special application which will monitor /dev/mem_notify and
> > kill processes based on its own hueristics is a good idea, but when it
> > fails to do its work (or does not exist) system has to have ability to
> > make a progress and invoke a main oom-killer.
>
> The last part is what we've discussed in the mini-summit. There should
> be OOM kill notification to user space and if that fails let the kernel
> invoke the OOM killer. The exact interface for notification is not
> interesting, one could use netlink if that works well.

Yes, that's exactly what I would like to see.
Btw, netlink would not be a good idea, since it requires an additional (and
quite big) allocation. I believe just reading a char device (or maybe
having a syscall) is enough, but that's up to the implementation.

--
Evgeniy Polyakov

2009-01-27 21:56:05

by Evgeniy Polyakov

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Tue, Jan 27, 2009 at 12:41:03PM -0800, David Rientjes ([email protected]) wrote:
> > Having some special application which will monitor /dev/mem_notify and
> > kill processes based on its own hueristics is a good idea, but when it
> > fails to do its work (or does not exist) system has to have ability to
> > make a progress and invoke a main oom-killer.
>
> Agreed, very similiar to the cgroup oom notifier patch that invokes the
> oom killer if there are no attached tasks waiting to handle the situation.
>
> In this case, it would be a configurable delay to allow userspace to act
> in response to oom conditions before invoking the kernel oom killer. So
> instead of thinking of this as a userspace replacement for the oom killer,
> it simply preempts it if userspace can provide more memory, including the
> possibility of killing tasks itself.

Funny how differently the same idea can look when expressed in different
words by different people :)

--
Evgeniy Polyakov

2009-01-27 23:55:58

by Paul Menage

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Thu, Jan 22, 2009 at 5:21 AM, Evgeniy Polyakov <[email protected]> wrote:
> Having userspace to decide which task to kill may not work in some cases
> at all (when task is swapped and we need to kill someone to get the mem
> to swap out the task, which will make that decision).

That's true in the case of a global OOM. In the case of a local OOM
(caused by memory limits applied via the cgroup memory controller, or
NUMA affinity enforcement applied by cpusets) the userspace handler
can be in a different domain which isn't OOM, and be quite capable of
figuring out who to kill. In our particular use case, it can happen
that a high-priority job hits its memory limits and triggers an OOM,
which causes the system controller daemon to kill some lower-priority
job and reassign some memory from that now-dead low-priority job (and
thus prevent the OOM from killing any process in the original cgroup).
This is something that would be very hard to express via kernel
policy.
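
A userspace sketch of that flow, assuming the controller daemon has already
learned (through whatever notification mechanism) that the high-priority group
hit its limit; the cgroup paths are examples, while memory.limit_in_bytes and
tasks are the standard memcg/cgroup files:

#include <sys/types.h>
#include <signal.h>
#include <stdio.h>

/* SIGKILL every task listed in a cgroup's tasks file */
static void kill_cgroup(const char *tasks_path)
{
        FILE *f = fopen(tasks_path, "r");
        int pid;

        if (!f)
                return;
        while (fscanf(f, "%d", &pid) == 1)
                kill(pid, SIGKILL);
        fclose(f);
}

/* write a new memory controller hard limit */
static void set_limit(const char *limit_path, unsigned long long bytes)
{
        FILE *f = fopen(limit_path, "w");

        if (!f)
                return;
        fprintf(f, "%llu\n", bytes);
        fclose(f);
}

int main(void)
{
        /* reclaim memory from the low-priority job ... */
        kill_cgroup("/cgroups/lowprio/tasks");

        /* ... and hand it to the high-priority job that hit its limit */
        set_limit("/cgroups/highprio/memory.limit_in_bytes",
                  8ULL * 1024 * 1024 * 1024);
        return 0;
}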

Paul

2009-01-28 01:01:10

by Paul Menage

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

Hi Nikanth,

On Fri, Jan 23, 2009 at 6:56 AM, Nikanth Karthikesan <[email protected]> wrote:
>
> From: Nikanth Karthikesan <[email protected]>
>
> Cgroup based OOM killer controller
>
> Signed-off-by: Nikanth Karthikesan <[email protected]>
>
> ---
>
> This is a container group based approach to override the oom killer selection
> without losing all the benefits of the current oom killer heuristics and
> oom_adj interface.

The basic functionality looks useful.

But before we add an OOM subsystem and commit to an API that has to be
supported forever, I think it would be good to have an overall design
for what kinds of things we want to be able to do regarding cgroups
and OOM killing.

Specifying a per-cgroup priority is part of the solution, and is
useful for simple cases. Some kind of userspace notification is also
useful.

The notification system that David/Ying posted has worked pretty well
for us at Google - it's allowed us to use cpusets and fake numa to
provide hard memory controls and guarantees for jobs, while avoiding
having jobs getting killed when they expand faster than we expect. But
we also acknowledge that it's a bit of a hack, and it would be nice to
come up with something more generally acceptable for a real
submission.

>
> It adds a tunable oom.victim to the oom cgroup. The oom killer will kill the
> process using the usual badness value but only within the cgroup with the
> maximum value for oom.victim before killing any process from a cgroup with a
> lesser oom.victim number. Oom killing could be disabled by setting
> oom.victim=0.

"priority" might be a better term than "victim".

>
> CPUSET constrained OOM:
> Also the tunable oom.cpuset_constrained when enabled, would disable the
> ordering imposed by this controller for cpuset constrained OOMs.
>
> diff --git a/Documentation/cgroups/oom.txt b/Documentation/cgroups/oom.txt
> new file mode 100644
> index 0000000..772fb41
> --- /dev/null
> +++ b/Documentation/cgroups/oom.txt
> @@ -0,0 +1,34 @@
> +OOM Killer controller
> +--- ------ ----------
> +
> +The OOM killer kills the process based on a set of heuristics such that only

Might be worth adding "theoretically" in this sentence :-)

> do_posix_clock_monotonic_gettime(&uptime);
> @@ -257,10 +262,30 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
> continue;
>
> points = badness(p, uptime.tv_sec);
> +#ifdef CONFIG_CGROUP_OOM
> + taskvictim = (container_of(p->cgroups->subsys[oom_subsys_id],
> + struct oom_cgroup, css))->victim;

Firstly, this ought to be using the task_subsys_state() function to
ensure the appropriate rcu_dereference() calls.

Secondly, is it safe? I'm not sure if we're in an RCU section in this
case, and we certainly haven't called task_lock(p) or cgroup_lock().
You should surround this with rcu_read_lock()/rcu_read_unlock().

And thirdly, it would be better to move the #ifdef to the header file,
and provide dummy functions that return 0 for the kill priority if
CONFIG_CGROUP_OOM isn't defined.
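
Something along these lines, as a sketch of that pattern (the wrapper name is
illustrative, not from the patch):

#ifdef CONFIG_CGROUP_OOM
static inline u64 task_oom_victim(struct task_struct *p)
{
        u64 victim;

        rcu_read_lock();
        victim = container_of(task_subsys_state(p, oom_subsys_id),
                              struct oom_cgroup, css)->victim;
        rcu_read_unlock();
        return victim;
}
#else
static inline u64 task_oom_victim(struct task_struct *p)
{
        return 0;
}
#endif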

> + honour_cpuset_constraint = *(container_of(p->cgroups->subsys[oom_subsys_id],
> + struct oom_cgroup, css))->cpuset_constraint;

I think that putting this kind of inter-subsystem dependency in is a
bad idea. If you want to control whether the OOM killer treats cpusets
specially, perhaps that flag should be put in cpusets?

> +
> + if (taskvictim > chosenvictim ||
> + (((taskvictim == chosenvictim) ||
> + (cpuset_constrained && honour_cpuset_constraint))
> + && points > *ppoints) ||
> + (taskvictim && !chosen)) {

This could do with more comments or maybe breaking up into simpler conditions.


> + if (cont->parent == NULL) {
> + oom_css->victim = 1;

Any reason to default to 1 rather than 0?

> + oom_css->cpuset_constraint =
> + kzalloc(sizeof(*oom_css->cpuset_constraint), GFP_KERNEL);
> + *oom_css->cpuset_constraint = false;
> + } else {
> + parent = oom_css_from_cgroup(cont->parent);
> + oom_css->victim = parent->victim;
> + oom_css->cpuset_constraint = parent->cpuset_constraint;
> + }

So there's a single cpuset_constraint shared by all cgroups? Isn't
that just a global variable then?

> +
> +static int oom_victim_write(struct cgroup *cgrp, struct cftype *cft,
> + u64 val)
> +{
> +
> + cgroup_lock();

This isn't really doing much, since you don't synchronize on the read
side (either the file handler or in the OOM killer itself). It might
be better to just make the value an atomic_t and avoid taking
cgroup_lock() here.

Should we enforce any constraint that a cgroup can never have a lower
kill priority than its parent? Or a separate "min child priority"
value, or just make the cgroup's priority be the max of any in its
path to the root? That would allow you to safely delegate OOM priority
control to sub cgroups while still controlling relative priorities for
each subtree.
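
Computed lazily at kill time, the "max of any priority on the path to the
root" rule could be as simple as the following sketch (field and helper names
follow the suggestions above rather than the posted patch):

static u64 oom_effective_priority(struct cgroup *cgrp)
{
        u64 prio = 0;

        /* a cgroup is always at least as killable as its parent */
        for (; cgrp; cgrp = cgrp->parent) {
                u64 p = atomic_read(&oom_css_from_cgroup(cgrp)->priority);

                if (p > prio)
                        prio = p;
        }
        return prio;
}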

> +static int oom_cpuset_write(struct cgroup *cont, struct cftype *cft,
> + const char *buffer)
> +{
> + if (buffer[0] == '1' && buffer[1] == 0)
> + *(oom_css_from_cgroup(cont))->cpuset_constraint = true;
> + else if (buffer[0] == '0' && buffer[1] == 0)
> + *(oom_css_from_cgroup(cont))->cpuset_constraint = false;
> + else
> + return -EINVAL;
> + return 0;
> +}

This can be a u64 write handler that just complains if its input isn't 0 or 1.

> +static struct cftype oom_cgroup_files[] = {
> + {
> + .name = "victim",
> + .read_u64 = oom_victim_read,
> + .write_u64 = oom_victim_write,
> + },
> +};
> +
> +static struct cftype oom_cgroup_root_files[] = {
> + {
> + .name = "victim",
> + .read_u64 = oom_victim_read,
> + .write_u64 = oom_victim_write,
> + },

Don't duplicate here - just have disjoint sets of files, and call
cgroup_add_files(oom_cgroup_root_files) in addition to the regular
files if it's the root. (Although as I mentioned above, I don't really
think this is the right place for the cpuset_constraint file)

Paul

2009-01-29 15:51:14

by Nikanth Karthikesan

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Cgroup based OOM killer controller

On Wednesday 28 January 2009 06:30:42 Paul Menage wrote:
> Hi Nikanth,
>
> On Fri, Jan 23, 2009 at 6:56 AM, Nikanth Karthikesan <[email protected]> wrote:
> > From: Nikanth Karthikesan <[email protected]>
> >
> > Cgroup based OOM killer controller
> >
> > Signed-off-by: Nikanth Karthikesan <[email protected]>
> >
> > ---
> >
> > This is a container group based approach to override the oom killer
> > selection without losing all the benefits of the current oom killer
> > heuristics and oom_adj interface.
>
> The basic functionality looks useful.
>

Thanks.

> But before we add an OOM subsystem and commit to an API that has to be
> supported forever, I think it would be good to have an overall design
> for what kinds of things we want to be able to do regarding cgroups
> and OOM killing.
>
> Specifying a per-cgroup priority is part of the solution, and is
> useful for simple cases. Some kind of userspace notification is also
> useful.
>

Yes, very much.

> The notification system that David/Ying posted has worked pretty well
> for us at Google - it's allowed us to use cpusets and fake numa to
> provide hard memory controls and guarantees for jobs, while avoiding
> having jobs getting killed when they expand faster than we expect. But
> we also acknowledge that it's a bit of a hack, and it would be nice to
> come up with something more generally acceptable for a real
> submission.
>
> > It adds a tunable oom.victim to the oom cgroup. The oom killer will kill
> > the process using the usual badness value but only within the cgroup with
> > the maximum value for oom.victim before killing any process from a cgroup
> > with a lesser oom.victim number. Oom killing could be disabled by setting
> > oom.victim=0.
>
> "priority" might be a better term than "victim".
>

Agreed.

> > CPUSET constrained OOM:
> > Also the tunable oom.cpuset_constrained when enabled, would disable the
> > ordering imposed by this controller for cpuset constrained OOMs.
> >
> > diff --git a/Documentation/cgroups/oom.txt b/Documentation/cgroups/oom.txt
> > new file mode 100644
> > index 0000000..772fb41
> > --- /dev/null
> > +++ b/Documentation/cgroups/oom.txt
> > @@ -0,0 +1,34 @@
> > +OOM Killer controller
> > +--- ------ ----------
> > +
> > +The OOM killer kills the process based on a set of heuristics such that only
>
> Might be worth adding "theoretically" in this sentence :-)
>
> > do_posix_clock_monotonic_gettime(&uptime);
> > @@ -257,10 +262,30 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
> > continue;
> >
> > points = badness(p, uptime.tv_sec);
> > +#ifdef CONFIG_CGROUP_OOM
> > + taskvictim = (container_of(p->cgroups->subsys[oom_subsys_id],
> > + struct oom_cgroup, css))->victim;
>
> Firstly, this ought to be using the task_subsys_state() function to
> ensure the appropriate rcu_dereference() calls.
>

Ok.

> Secondly, is it safe? I'm not sure if we're in an RCU section in this
> case, and we certainly haven't called task_lock(p) or cgroup_lock().
> You should surround this with rcu_read_lock()/rcu_read_unlock().
>

Ok.

> And thirdly, it would be better to move the #ifdef to the header file,
> and provide dummy functions that return 0 for the kill priority if
> CONFIG_CGROUP_OOM isn't defined.
>

Ok. As this patch uses 0 to disable oom_killing completely, the dummy function
should return 1 instead of zero. It should be documented more clearly.

> > + honour_cpuset_constraint = *(container_of(p->cgroups->subsys[oom_subsys_id],
> > + struct oom_cgroup, css))->cpuset_constraint;
>
> I think that putting this kind of inter-subsystem dependency in is a
> bad idea. If you want to control whether the OOM killer treats cpusets
> specially, perhaps that flag should be put in cpusets?
>

But then won't it add a special variable in cpusets for oom-controller?

> > +
> > + if (taskvictim > chosenvictim ||
> > + (((taskvictim == chosenvictim) ||
> > + (cpuset_constrained && honour_cpuset_constraint))
> > + && points > *ppoints) ||
> > + (taskvictim && !chosen)) {
>
> This could do with more comments or maybe breaking up into simpler
> conditions.
>

Ok.

> > + if (cont->parent == NULL) {
> > + oom_css->victim = 1;
>
> Any reason to default to 1 rather than 0?
>

0 disables oom killing completely.

> > + oom_css->cpuset_constraint =
> > + kzalloc(sizeof(*oom_css->cpuset_constraint), GFP_KERNEL);
> > + *oom_css->cpuset_constraint = false;
> > + } else {
> > + parent = oom_css_from_cgroup(cont->parent);
> > + oom_css->victim = parent->victim;
> > + oom_css->cpuset_constraint = parent->cpuset_constraint;
> > + }
>
> So there's a single cpuset_constraint shared by all cgroups? Isn't
> that just a global variable then?
>

Yes, it should be a global variable.

> > +
> > +static int oom_victim_write(struct cgroup *cgrp, struct cftype *cft,
> > + u64 val)
> > +{
> > +
> > + cgroup_lock();
>
> This isn't really doing much, since you don't synchronize on the read
> side (either the file handler or in the OOM killer itself). It might
> be better to just make the value an atomic_t and avoid taking
> cgroup_lock() here.
>

Yes.

> Should we enforce any constraint that a cgroup can never have a lower
> kill priority than its parent? Or a separate "min child priority"
> value, or just make the cgroup's priority be the max of any in its
> path to the root? That would allow you to safely delegate OOM priority
> control to sub cgroups while still controlling relative priorities for
> each subtree.
>

Setting priority to be the maximum of any in its path seems better to me. It
should make it easier to handle a group of cgroups.

> > +static int oom_cpuset_write(struct cgroup *cont, struct cftype *cft,
> > + const char *buffer)
> > +{
> > + if (buffer[0] == '1' && buffer[1] == 0)
> > + *(oom_css_from_cgroup(cont))->cpuset_constraint = true;
> > + else if (buffer[0] == '0' && buffer[1] == 0)
> > + *(oom_css_from_cgroup(cont))->cpuset_constraint = false;
> > + else
> > + return -EINVAL;
> > + return 0;
> > +}
>
> This can be a u64 write handler that just complains if its input isn't 0 or
> 1.
>

Yes, that would be cleaner.

> > +static struct cftype oom_cgroup_files[] = {
> > + {
> > + .name = "victim",
> > + .read_u64 = oom_victim_read,
> > + .write_u64 = oom_victim_write,
> > + },
> > +};
> > +
> > +static struct cftype oom_cgroup_root_files[] = {
> > + {
> > + .name = "victim",
> > + .read_u64 = oom_victim_read,
> > + .write_u64 = oom_victim_write,
> > + },
>
> Don't duplicate here - just have disjoint sets of files, and call
> cgroup_add_files(oom_cgroup_root_files) in addition to the regular
> files if it's the root. (Although as I mentioned above, I don't really
> think this is the right place for the cpuset_constraint file)
>

Ok.

Thanks for the detailed review. I have attached the patch with your comments
incorporated. There is a read-only oom.effective_priority added which is
computed as the maximum oom.priority along its path.

Thanks
Nikanth

From: Nikanth Karthikesan <[email protected]>

Cgroup based OOM killer controller

Signed-off-by: Nikanth Karthikesan <[email protected]>

---

This is a container group based approach to override the oom killer selection
without losing all the benefits of the current oom killer heuristics and
oom_adj interface. This controller helps in specifying a strict order between
tasks that can be killed during an oom.

It adds a tunable oom.priority to the oom cgroup. The oom killer will kill the
process using the usual badness value but only within the cgroup with the
maximum value for oom.effective_priority before killing any process from a
cgroup with a lesser oom.effective_priority number. The oom.effective_priority
is calculated as the maximum oom.priority along its path. Oom killing could be
disabled for a cgroup by setting oom.effective_priority=0.
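
For illustration, mounting and configuring the controller from C might look
like this minimal sketch (error handling omitted; the mount point and group
name are examples, and only the oom.* files come from this patch):

#include <sys/mount.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[32];
        int fd, len;

        mkdir("/cgroups", 0755);
        mount("oom", "/cgroups", "cgroup", 0, "oom");

        /* a group whose tasks should be sacrificed before everyone else */
        mkdir("/cgroups/batch", 0755);
        fd = open("/cgroups/batch/oom.priority", O_WRONLY);
        write(fd, "10\n", 3);
        close(fd);

        /* move the current task into that group */
        fd = open("/cgroups/batch/tasks", O_WRONLY);
        len = snprintf(buf, sizeof(buf), "%d\n", getpid());
        write(fd, buf, len);
        close(fd);
        return 0;
}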

diff --git a/Documentation/cgroups/oom.txt b/Documentation/cgroups/oom.txt
new file mode 100644
index 0000000..5ef34db
--- /dev/null
+++ b/Documentation/cgroups/oom.txt
@@ -0,0 +1,36 @@
+OOM Killer controller
+--- ------ ----------
+
+The OOM killer kills the process based on a set of heuristics such that only
+minimum amount of work done will be lost, a large amount of memory would be
+recovered and minimum no of processes are killed.
+
+The user can adjust the score used to select the processes to be killed using
+/proc/<pid>/oom_adj. Giving it a high score will increase the likelihood of
+this process being killed by the oom-killer. Valid values are in the range
+-16 to +15, plus the special value -17, which disables oom-killing altogether
+for that process.
+
+But it is very difficult to suggest an order among tasks to be killed during
+Out Of Memory situation. The OOM Killer controller aids in doing that.
+
+USAGE
+-----
+
+Mount the oom controller by passing 'oom' when mounting cgroups. Echo
+a value in oom.priority file to change the order. The oom.effective_priority
+is calculated as the highest oom.priority along its path. The oom killer would
+kill all the processes in a cgroup with a higher oom.effective_priority before
+killing a process in a cgroup with lower oom.effective_priority value. Among
+those tasks with same oom.effective_priority value, the usual badness
+heuristics would be applied. The /proc/<pid>/oom_adj still helps adjusting the
+oom killer score. Also having oom.effective_priority = 0 would disable oom
+killing for the tasks in that cgroup.
+
+Note: If this is used without proper consideration, innocent processes may
+get killed unnecessarily.
+
+CPUSET constrained OOM:
+Setting oom.cpuset_constraint=1 would disable the ordering during a cpuset
+constrained oom. Setting oom.cpuset_constraint=0 would not distinguish
+between a cpuset constrained oom and system wide oom.
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 9c8d31b..6944f99 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -59,4 +59,8 @@ SUBSYS(freezer)
SUBSYS(net_cls)
#endif

+#ifdef CONFIG_CGROUP_OOM
+SUBSYS(oom)
+#endif
+
/* */
diff --git a/include/linux/oomcontrol.h b/include/linux/oomcontrol.h
new file mode 100644
index 0000000..8072d7a
--- /dev/null
+++ b/include/linux/oomcontrol.h
@@ -0,0 +1,35 @@
+#ifndef _LINUX_OOMCONTROL_H
+#define _LINUX_OOMCONTROL_H
+
+#ifdef CONFIG_CGROUP_OOM
+
+struct oom_cgroup {
+ struct cgroup_subsys_state css;
+
+ /*
+ * the order to be victimized for this group
+ */
+ atomic_t priority;
+
+ /*
+ * the maximum priority along the path from root
+ */
+ atomic_t effective_priority;
+
+};
+
+/*
+ * disable during cpuset constrained oom
+ */
+extern atomic_t honour_cpuset_constraint;
+
+u64 task_oom_priority(struct task_struct *p);
+
+#else
+
+#define task_oom_priority(p) (1)
+
+static atomic_t honour_cpuset_constraint; /* unused */
+
+#endif
+#endif
diff --git a/init/Kconfig b/init/Kconfig
index 2af8382..99ed0de 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -354,6 +354,15 @@ config CGROUP_DEBUG

Say N if unsure.

+config CGROUP_OOM
+ bool "Oom cgroup subsystem"
+ depends on CGROUPS
+ help
+ This provides a cgroup subsystem which aids controlling
+ the order in which tasks would be killed during
+ out of memory situations.
+
+
config CGROUP_NS
bool "Namespace cgroup subsystem"
depends on CGROUPS
diff --git a/mm/Makefile b/mm/Makefile
index 72255be..a5d7222 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,3 +33,4 @@ obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_CGROUP_OOM) += oomcontrol.o
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 40ba050..6851da3 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -26,6 +26,7 @@
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/memcontrol.h>
+#include <linux/oomcontrol.h>
#include <linux/security.h>

int sysctl_panic_on_oom;
@@ -200,11 +201,13 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
* (not docbooked, we don't want this one cluttering up the manual)
*/
static struct task_struct *select_bad_process(unsigned long *ppoints,
- struct mem_cgroup *mem)
+ struct mem_cgroup *mem, int cpuset_constrained)
{
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
struct timespec uptime;
+ u64 chosenpriority = 1, taskpriority;
+
*ppoints = 0;

do_posix_clock_monotonic_gettime(&uptime);
@@ -257,10 +260,35 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
continue;

points = badness(p, uptime.tv_sec);
- if (points > *ppoints || !chosen) {
+
+ taskpriority = task_oom_priority(p);
+
+ /*
+ * select this task if
+ * 1. It has higher oom.priority than the previously selected
+ * task, or
+ * 2. It has the same priority as previously selected task but
+ * higher badness score, or
+ * 3. If this is the first task to be considered and it is not
+ * protected from oom killer by setting priority as zero, or
+ * 4. If this is a cpuset constrained oom and
+ * honour_cpuset_constraint is set
+ */
+ if (taskpriority > chosenpriority ||
+
+ (((taskpriority == chosenpriority) ||
+ (cpuset_constrained &&
+ atomic_read(&honour_cpuset_constraint)))
+ && points > *ppoints) ||
+
+ (taskpriority && !chosen)) {
+
chosen = p;
*ppoints = points;
+ chosenpriority = taskpriority;
+
}
+
} while_each_thread(g, p);

return chosen;
@@ -431,7 +459,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)

read_lock(&tasklist_lock);
retry:
- p = select_bad_process(&points, mem);
+ p = select_bad_process(&points, mem, 0); /* not cpuset constrained */
if (PTR_ERR(p) == -1UL)
goto out;

@@ -513,7 +541,7 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
/*
* Must be called with tasklist_lock held for read.
*/
-static void __out_of_memory(gfp_t gfp_mask, int order)
+static void __out_of_memory(gfp_t gfp_mask, int order, int cpuset_constrained)
{
if (sysctl_oom_kill_allocating_task) {
oom_kill_process(current, gfp_mask, order, 0, NULL,
@@ -528,7 +556,7 @@ retry:
* Rambo mode: Shoot down a process and hope it solves whatever
* issues we may have.
*/
- p = select_bad_process(&points, NULL);
+ p = select_bad_process(&points, NULL, cpuset_constrained);

if (PTR_ERR(p) == -1UL)
return;
@@ -569,7 +597,8 @@ void pagefault_out_of_memory(void)
panic("out of memory from page fault. panic_on_oom is selected.\n");

read_lock(&tasklist_lock);
- __out_of_memory(0, 0); /* unknown gfp_mask and order */
+ /* unknown gfp_mask and order and not cpuset constrained */
+ __out_of_memory(0, 0, 0);
read_unlock(&tasklist_lock);

/*
@@ -623,7 +652,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
panic("out of memory. panic_on_oom is selected\n");
/* Fall-through */
case CONSTRAINT_CPUSET:
- __out_of_memory(gfp_mask, order);
+ __out_of_memory(gfp_mask, order, 1);
break;
}

diff --git a/mm/oomcontrol.c b/mm/oomcontrol.c
new file mode 100644
index 0000000..d572b1f
--- /dev/null
+++ b/mm/oomcontrol.c
@@ -0,0 +1,294 @@
+/*
+ * kernel/cgroup_oom.c - oom handler cgroup.
+ */
+
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/oomcontrol.h>
+#include <asm/atomic.h>
+
+atomic_t honour_cpuset_constraint;
+
+/*
+ * Helper to retrieve oom controller data from cgroup
+ */
+static struct oom_cgroup *oom_css_from_cgroup(struct cgroup *cgrp)
+{
+ return container_of(cgroup_subsys_state(cgrp,
+ oom_subsys_id), struct oom_cgroup,
+ css);
+}
+
+u64 task_oom_priority(struct task_struct *p)
+{
+ u64 priority;
+
+ rcu_read_lock();
+ priority = atomic_read(&(container_of(task_subsys_state(p, oom_subsys_id),
+ struct oom_cgroup, css))->effective_priority);
+ rcu_read_unlock();
+ return priority;
+}
+
+static struct cgroup_subsys_state *oom_create(struct cgroup_subsys *ss,
+ struct cgroup *cont)
+{
+ struct oom_cgroup *oom_css = kzalloc(sizeof(*oom_css), GFP_KERNEL);
+ struct oom_cgroup *parent;
+ u64 parent_priority, parent_effective_priority;
+
+ if (!oom_css)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * if root last/only group to be victimized
+ * else inherit parents value
+ */
+ if (cont->parent == NULL) {
+ atomic_set(&oom_css->priority, 1);
+ atomic_set(&oom_css->effective_priority, 1);
+ atomic_set(&honour_cpuset_constraint, 0);
+ } else {
+ parent = oom_css_from_cgroup(cont->parent);
+ parent_priority = atomic_read(&parent->priority);
+ parent_effective_priority =
+ atomic_read(&parent->effective_priority);
+ atomic_set(&oom_css->priority, parent_priority);
+ atomic_set(&oom_css->effective_priority,
+ parent_effective_priority);
+ }
+
+ return &oom_css->css;
+}
+
+static void oom_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ kfree(cont->subsys[oom_subsys_id]);
+}
+
+static void increase_effective_priority(struct cgroup *cgrp, u64 val)
+{
+ struct cgroup *curr;
+ struct oom_cgroup *oom_css;
+
+ atomic_set( &(oom_css_from_cgroup(cgrp))->effective_priority, val);
+
+ mutex_lock(&oom_subsys.hierarchy_mutex);
+
+ /*
+ * DFS
+ */
+ if (!list_empty(&cgrp->children))
+ curr = list_first_entry(&cgrp->children,
+ struct cgroup, sibling);
+ else
+ goto out;
+
+visit_children:
+ oom_css = oom_css_from_cgroup(curr);
+ if (atomic_read(&oom_css->effective_priority) < val)
+ atomic_set(&oom_css->effective_priority, val);
+
+ if (!list_empty(&curr->children)) {
+ curr = list_first_entry(&curr->children,
+ struct cgroup, sibling);
+ goto visit_children;
+ } else {
+visit_siblings:
+ if (curr == 0 || cgrp == curr) goto out;
+
+ if (curr->sibling.next != &curr->parent->children) {
+ curr = list_entry(curr->sibling.next,
+ struct cgroup, sibling);
+ goto visit_children;
+ } else {
+ curr = curr->parent;
+ goto visit_siblings;
+ }
+ }
+out:
+ mutex_unlock(&oom_subsys.hierarchy_mutex);
+
+}
+
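+/*
+ * Walk the subtree below cgrp after a priority decrease; each
+ * descendant's effective_priority becomes the maximum of its own
+ * priority and its parent's new effective_priority.
+ */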
+static void decrease_effective_priority(struct cgroup *cgrp, u64 val)
+{
+ struct cgroup *curr;
+ u64 priority, effective_priority;
+
+
+ effective_priority = val;
+
+ atomic_set(&oom_css_from_cgroup(cgrp)->effective_priority,
+ effective_priority);
+
+ mutex_lock(&oom_subsys.hierarchy_mutex);
+
+ /*
+ * DFS
+ */
+ if (!list_empty(&cgrp->children))
+ curr = list_first_entry(&cgrp->children,
+ struct cgroup, sibling);
+ else
+ goto out;
+
+visit_children:
+ priority = atomic_read(&oom_css_from_cgroup(curr)->priority);
+
+ if (priority > effective_priority) {
+ atomic_set(&oom_css_from_cgroup(curr)->
+ effective_priority, priority);
+ effective_priority = priority;
+ } else
+ atomic_set(&oom_css_from_cgroup(curr)->
+ effective_priority, effective_priority);
+
+ if (!list_empty(&curr->children)) {
+ curr = list_first_entry(&curr->children,
+ struct cgroup, sibling);
+ goto visit_children;
+ } else {
+visit_siblings:
+ if (curr == 0 || cgrp == curr)
+ goto out;
+
+ if (curr->parent)
+ effective_priority =
+ atomic_read(&oom_css_from_cgroup(
+ curr->parent)->effective_priority);
+ else
+ effective_priority = val;
+
+ if (curr->sibling.next != &curr->parent->children) {
+ curr = list_entry(curr->sibling.next,
+ struct cgroup, sibling);
+ goto visit_children;
+ } else {
+ curr = curr->parent;
+ goto visit_siblings;
+ }
+ }
+out:
+
+ mutex_unlock(&oom_subsys.hierarchy_mutex);
+
+}
+
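+/*
+ * Handler for writes to oom.priority: store the new value and, when
+ * this group's effective_priority changes, propagate the change to
+ * its descendants.
+ */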
+static int oom_priority_write(struct cgroup *cgrp, struct cftype *cft,
+ u64 val)
+{
+ u64 effective_priority;
+ u64 old_priority;
+ u64 parent_effective_priority = 0;
+
+ old_priority = atomic_read(&(oom_css_from_cgroup(cgrp))->priority);
+ atomic_set(&(oom_css_from_cgroup(cgrp))->priority, val);
+
+ effective_priority = atomic_read(
+ &(oom_css_from_cgroup(cgrp))->effective_priority);
+
+ /*
+ * propagate new effective_priority to sub cgroups
+ */
+ if (val > effective_priority)
+ increase_effective_priority(cgrp, val);
+ else if (effective_priority == old_priority &&
+ val < effective_priority) {
+ struct oom_cgroup *oom_css = NULL;
+ if (cgrp->parent)
+ oom_css = oom_css_from_cgroup(cgrp->parent);
+ else
+ oom_css = oom_css_from_cgroup(cgrp);
+
+ if (cgrp->parent)
+ parent_effective_priority =
+ atomic_read(&oom_css->effective_priority);
+
+ if (cgrp->parent == NULL ||
+ parent_effective_priority < effective_priority) {
+ /*
+ * set effective_priority to the max of the parent's
+ * effective_priority and the new priority
+ */
+ if (cgrp->parent == NULL || effective_priority < val
+ || parent_effective_priority < val)
+ effective_priority = val;
+ else
+ effective_priority = parent_effective_priority;
+
+ decrease_effective_priority(cgrp, effective_priority);
+
+ }
+ }
+ return 0;
+}
+
+static u64 oom_effective_priority_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ u64 priority = atomic_read(&(oom_css_from_cgroup(cgrp))->effective_priority);
+
+ return priority;
+}
+
+static u64 oom_priority_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ u64 priority = atomic_read(&(oom_css_from_cgroup(cgrp))->priority);
+
+ return priority;
+}
+
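+/*
+ * Handler for writes to the root-only oom.cpuset_constraint file;
+ * only the values 0 and 1 are accepted.
+ */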
+static int oom_cpuset_write(struct cgroup *cgrp, struct cftype *cft,
+ u64 val)
+{
+ if (val > 1)
+ return -EINVAL;
+ atomic_set(&honour_cpuset_constraint, val);
+ return 0;
+}
+
+static u64 oom_cpuset_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ return atomic_read(&honour_cpuset_constraint);
+}
+
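+/* Control files created in every oom cgroup. */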
+static struct cftype oom_cgroup_files[] = {
+ {
+ .name = "priority",
+ .read_u64 = oom_priority_read,
+ .write_u64 = oom_priority_write,
+ },
+ {
+ .name = "effective_priority",
+ .read_u64 = oom_effective_priority_read,
+ },
+};
+
+static struct cftype oom_cgroup_root_only_files[] = {
+ {
+ .name = "cpuset_constraint",
+ .read_u64 = oom_cpuset_read,
+ .write_u64 = oom_cpuset_write,
+ },
+};
+
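+/*
+ * Create the control files; the root-only files are added only for
+ * the top-level cgroup.
+ */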
+static int oom_populate(struct cgroup_subsys *ss,
+ struct cgroup *cont)
+{
+ int ret;
+
+ ret = cgroup_add_files(cont, ss, oom_cgroup_files,
+ ARRAY_SIZE(oom_cgroup_files));
+ if (!ret && cont->parent == NULL) {
+ ret = cgroup_add_files(cont, ss, oom_cgroup_root_only_files,
+ ARRAY_SIZE(oom_cgroup_root_only_files));
+ }
+
+ return ret;
+}
+
+struct cgroup_subsys oom_subsys = {
+ .name = "oom",
+ .subsys_id = oom_subsys_id,
+ .create = oom_create,
+ .destroy = oom_destroy,
+ .populate = oom_populate,
+};