Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1763926AbZAULLQ (ORCPT ); Wed, 21 Jan 2009 06:11:16 -0500 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1754768AbZAULLA (ORCPT ); Wed, 21 Jan 2009 06:11:00 -0500 Received: from mx2.suse.de ([195.135.220.15]:54892 "EHLO mx2.suse.de" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1754599AbZAULK7 (ORCPT ); Wed, 21 Jan 2009 06:10:59 -0500 From: Nikanth Karthikesan Organization: suse.de To: linux-kernel@vger.kernel.org Subject: [RFC] [PATCH] Cgroup based OOM killer controller Date: Wed, 21 Jan 2009 16:38:21 +0530 User-Agent: KMail/1.10.3 (Linux/2.6.27.7-9-default; KDE/4.1.3; x86_64; ; ) Cc: Andrew Morton , Linus Torvalds , Evgeniy Polyakov , Chris Snook , Alan Cox , Arve =?iso-8859-1?q?Hj=F8nnev=E5g?= , Paul Menage , containers@lists.linux-foundation.org MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Content-Disposition: inline Message-Id: <200901211638.23101.knikanth@suse.de> Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 7479 Lines: 270 As Alan Cox suggested/wondered in this thread, http://lkml.org/lkml/2009/1/12/235 , this is a container group based approach to override the oom killer selection without losing all the benefits of the current oom killer heuristics and oom_adj interface. It adds a tunable oom.victim to the oom cgroup. The oom killer will kill the process using the usual badness value but only within the cgroup with the maximum value for oom.victim before killing any process from a cgroup with a lesser oom.victim number. Oom killing could be disabled by setting oom.victim=0. 
Signed-off-by: Nikanth Karthikesan --- diff --git a/Documentation/cgroups/oom.txt b/Documentation/cgroups/oom.txt new file mode 100644 index 0000000..9102830 --- /dev/null +++ b/Documentation/cgroups/oom.txt @@ -0,0 +1,29 @@ +OOM Killer controller +--- ------ ---------- + +The OOM killer kills the process based on a set of heuristics such that only +minimum amount of work done will be lost, a large amount of memory would be +recovered and minimum no of processes are killed. + +The user can adjust the score used to select the processes to be killed using +/proc/<pid>/oom_adj. Giving it a high score will increase the likelihood of +this process being killed by the oom-killer. Valid values are in the range +-16 to +15, plus the special value -17, which disables oom-killing altogether +for that process. + +But it is very difficult to suggest an order among tasks to be killed during +Out Of Memory situation. The OOM Killer controller aids in doing that. + +USAGE +----- + +Mount the oom controller by passing 'oom' when mounting cgroups. Echo +a value in oom.victim file to change the order. The oom killer would kill +all the processes in a cgroup with a higher oom.victim value before killing a +process in a cgroup with lower oom.victim value. Among those tasks with same +oom.victim value, the usual badness heuristics would be applied. The +/proc/<pid>/oom_adj still helps adjusting the oom killer score. Also having +oom.victim = 0 would disable oom killing for the tasks in that cgroup. + +Note: If this is used without proper consideration, innocent processes may +get killed unnecessarily. 
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 9c8d31b..6944f99 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -59,4 +59,8 @@ SUBSYS(freezer) SUBSYS(net_cls) #endif +#ifdef CONFIG_CGROUP_OOM +SUBSYS(oom) +#endif + /* */ diff --git a/include/linux/oomcontrol.h b/include/linux/oomcontrol.h new file mode 100644 index 0000000..18146ab --- /dev/null +++ b/include/linux/oomcontrol.h @@ -0,0 +1,14 @@ +#ifndef _LINUX_OOMCONTROL_H +#define _LINUX_OOMCONTROL_H + +#ifdef CONFIG_CGROUP_OOM +struct oom_cgroup { + struct cgroup_subsys_state css; + /* + * the order to be victimized for this group + */ + u64 victim; +}; + +#endif +#endif diff --git a/init/Kconfig b/init/Kconfig index 2af8382..99ed0de 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -354,6 +354,15 @@ config CGROUP_DEBUG Say N if unsure. +config CGROUP_OOM + bool "Oom cgroup subsystem" + depends on CGROUPS + help + This provides a cgroup subsystem which aids controlling + the order in which tasks would be killed during + out of memory situations. 
+ + config CGROUP_NS bool "Namespace cgroup subsystem" depends on CGROUPS diff --git a/mm/Makefile b/mm/Makefile index 72255be..a5d7222 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -33,3 +33,4 @@ obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_SMP) += allocpercpu.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o +obj-$(CONFIG_CGROUP_OOM) += oomcontrol.o diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 40ba050..f697d50 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -26,6 +26,7 @@ #include #include #include +#include #include int sysctl_panic_on_oom; @@ -205,6 +206,9 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, struct task_struct *g, *p; struct task_struct *chosen = NULL; struct timespec uptime; +#ifdef CONFIG_CGROUP_OOM + u64 chosenvictim = 1, taskvictim; +#endif *ppoints = 0; do_posix_clock_monotonic_gettime(&uptime); @@ -257,10 +261,26 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, continue; points = badness(p, uptime.tv_sec); +#ifdef CONFIG_CGROUP_OOM + taskvictim = (container_of(p->cgroups->subsys[oom_subsys_id], + struct oom_cgroup, css))->victim; + + if (taskvictim > chosenvictim || + (taskvictim == chosenvictim && points > *ppoints) || + !chosen) { + + chosen = p; + *ppoints = points; + chosenvictim = taskvictim; + + } + +#else if (points > *ppoints || !chosen) { chosen = p; *ppoints = points; } +#endif } while_each_thread(g, p); return chosen; diff --git a/mm/oomcontrol.c b/mm/oomcontrol.c new file mode 100644 index 0000000..2bfa325 --- /dev/null +++ b/mm/oomcontrol.c @@ -0,0 +1,95 @@ +/* + * mm/oomcontrol.c - oom handler cgroup. 
+ */ + +#include +#include +#include +#include + +/* + * Helper to retrieve oom controller data from cgroup + */ +static struct oom_cgroup *oom_css_from_cgroup(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, + oom_subsys_id), struct oom_cgroup, + css); +} + + +static struct cgroup_subsys_state *oom_create(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + struct oom_cgroup *oom_css = kzalloc(sizeof(*oom_css), GFP_KERNEL); + struct oom_cgroup *parent; + + if (!oom_css) + return ERR_PTR(-ENOMEM); + + /* + * if root last/only group to be victimized + * else inherit parents value + */ + if (cont->parent == NULL) + oom_css->victim = 1; + else { + parent = oom_css_from_cgroup(cont->parent); + oom_css->victim = parent->victim; + } + + return &oom_css->css; +} + +static void oom_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +{ + kfree(cont->subsys[oom_subsys_id]); +} + +static int oom_victim_write(struct cgroup *cgrp, struct cftype *cft, + u64 val) +{ + + cgroup_lock(); + + (oom_css_from_cgroup(cgrp))->victim = val; + + cgroup_unlock(); + + return 0; +} + +static u64 oom_victim_read(struct cgroup *cgrp, struct cftype *cft) +{ + u64 victim = (oom_css_from_cgroup(cgrp))->victim; + + return victim; +} + + +static struct cftype oom_cgroup_files[] = { + { + .name = "victim", + .read_u64 = oom_victim_read, + .write_u64 = oom_victim_write, + }, +}; + +static int oom_populate(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + int ret; + + ret = cgroup_add_files(cont, ss, oom_cgroup_files, + ARRAY_SIZE(oom_cgroup_files)); + + return ret; +} + +struct cgroup_subsys oom_subsys = { + .name = "oom", + .subsys_id = oom_subsys_id, + .create = oom_create, + .destroy = oom_destroy, + .populate = oom_populate, +}; -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/