Hi all,
We have done a comparison between the kernel version and the user space
version, and the behavior is apparently similar. You can also get this
patch and module to test it and compare it with the kernel OOM Killer.
Below are a patch and a module that move the kernel space OOM Killer
algorithm to user space. Let us know your ideas.
*******************
PATCH
*******************
diff -urN linux-2.6.10/fs/proc/array.c linux-2.6.10-oom/fs/proc/array.c
--- linux-2.6.10/fs/proc/array.c 2004-12-24 17:35:00.000000000 -0400
+++ linux-2.6.10-oom/fs/proc/array.c 2005-01-10 15:42:26.000000000 -0400
@@ -470,3 +470,13 @@
return sprintf(buffer,"%d %d %d %d %d %d %d\n",
size, resident, shared, text, lib, data, 0);
}
+
+int proc_pid_oom(struct task_struct *task, char * buffer)
+{
+ int res;
+ res = sprintf(buffer, "%d %lu %lu\n",
+ task->pid,
+ task->utime,
+ task->stime);
+ return res;
+}
diff -urN linux-2.6.10/fs/proc/base.c linux-2.6.10-oom/fs/proc/base.c
--- linux-2.6.10/fs/proc/base.c 2004-12-24 17:35:00.000000000 -0400
+++ linux-2.6.10-oom/fs/proc/base.c 2005-01-10 15:42:26.000000000 -0400
@@ -60,6 +60,7 @@
PROC_TGID_MAPS,
PROC_TGID_MOUNTS,
PROC_TGID_WCHAN,
+ PROC_TGID_OOM,
#ifdef CONFIG_SCHEDSTATS
PROC_TGID_SCHEDSTAT,
#endif
@@ -86,6 +87,7 @@
PROC_TID_MAPS,
PROC_TID_MOUNTS,
PROC_TID_WCHAN,
+ PROC_TID_OOM,
#ifdef CONFIG_SCHEDSTATS
PROC_TID_SCHEDSTAT,
#endif
@@ -132,6 +134,7 @@
#ifdef CONFIG_SCHEDSTATS
E(PROC_TGID_SCHEDSTAT, "schedstat", S_IFREG|S_IRUGO),
#endif
+ E(PROC_TGID_OOM, "oom", S_IFREG|S_IRUGO),
{0,0,NULL,0}
};
static struct pid_entry tid_base_stuff[] = {
@@ -157,6 +160,7 @@
#ifdef CONFIG_SCHEDSTATS
E(PROC_TID_SCHEDSTAT, "schedstat",S_IFREG|S_IRUGO),
#endif
+ E(PROC_TID_OOM, "oom", S_IFREG|S_IRUGO),
{0,0,NULL,0}
};
@@ -193,6 +197,7 @@
int proc_tgid_stat(struct task_struct*,char*);
int proc_pid_status(struct task_struct*,char*);
int proc_pid_statm(struct task_struct*,char*);
+int proc_pid_oom(struct task_struct*,char*);
static int proc_fd_link(struct inode *inode, struct dentry **dentry,
struct vfsmount **mnt)
{
@@ -1377,6 +1382,11 @@
ei->op.proc_read = proc_pid_schedstat;
break;
#endif
+ case PROC_TID_OOM:
+ case PROC_TGID_OOM:
+ inode->i_fop = &proc_info_file_operations;
+ ei->op.proc_read = proc_pid_oom;
+ break;
default:
printk("procfs: impossible type (%d)",p->type);
iput(inode);
diff -urN linux-2.6.10/include/linux/oom_kill.h linux-2.6.10-oom/include/linux/oom_kill.h
--- linux-2.6.10/include/linux/oom_kill.h 1969-12-31 20:00:00.000000000 -0400
+++ linux-2.6.10-oom/include/linux/oom_kill.h 2005-01-10 15:42:26.000000000 -0400
@@ -0,0 +1,6 @@
+struct candidate_process {
+ pid_t pid;
+ struct list_head pid_list;
+};
+
+struct list_head *loop_counter;
diff -urN linux-2.6.10/mm/#oom_kill.c# linux-2.6.10-oom/mm/#oom_kill.c#
--- linux-2.6.10/mm/#oom_kill.c# 2005-01-10 16:08:07.000000000 -0400
+++ linux-2.6.10-oom/mm/#oom_kill.c# 1969-12-31 20:00:00.000000000 -0400
@@ -1,366 +0,0 @@
-/*
- * linux/mm/oom_kill.c
- *
- * Copyright (C) 1998,2000 Rik van Riel
- * Thanks go out to Claus Fischer for some serious inspiration and
- * for goading me into coding this file...
- *
- * The routines in this file are used to kill a process when
- * we're seriously out of memory. This gets called from kswapd()
- * in linux/mm/vmscan.c when we really run out of memory.
- *
- * Since we won't call these routines often (on a well-configured
- * machine) this file will double as a 'coding guide' and a signpost
- * for newbie kernel hackers. It features several pointers to major
- * kernel subsystems and hints as to where to find out what things do.
- *
- *
- * 2005
- * Bruna Moreira <[email protected]>
- * Edjard Mota <[email protected]>
- * Ilias Biris <[email protected]>
- * Mauricio Lin <[email protected]>
- *
- * Embedded Linux Lab - 10LE Institulo Nokia de Tecnologia - INdT
- *
- * Turn off the kernel space out of memory killer algorithm and provide
- * support for user space out of memory killer.
- */
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/swap.h>
-#include <linux/timex.h>
-#include <linux/jiffies.h>
-#include <linux/oom_kill.h>
-
-/* #define DEBUG */
-
-/**
- * oom_badness - calculate a numeric value for how bad this task has been
- * @p: task struct of which task we should calculate
- * @p: current uptime in seconds
- *
- * The formula used is relatively simple and documented inline in the
- * function. The main rationale is that we want to select a good task
- * to kill when we run out of memory.
- *
- * Good in this context means that:
- * 1) we lose the minimum amount of work done
- * 2) we recover a large amount of memory
- * 3) we don't kill anything innocent of eating tons of memory
- * 4) we want to kill the minimum amount of processes (one)
- * 5) we try to kill the process the user expects us to kill, this
- * algorithm has been meticulously tuned to meet the principle
- * of least surprise ... (be careful when you change it)
- */
-
-struct candidate_process *candidate;
-
-EXPORT_SYMBOL(candidate);
-
-LIST_HEAD(pidqueue_head);
-
-EXPORT_SYMBOL(pidqueue_head);
-/*
-void show_candidate_comm(void)
-{
- struct task_struct *g, *p;
- int i = 0;
-
- list_for_each(loop_counter, &pidqueue_head) {
- candidate = list_entry(loop_counter, struct candidate_process, pid_list);
- do_each_thread(g, p)
- if (p->pid == candidate->pid) {
- printk(KERN_DEBUG "A good walker leaves no tracks.%s\n", p->comm);
- goto outer_loop;
- }
- while_each_thread(g, p);
- outer_loop:
- continue;
- }
-}
-
-EXPORT_SYMBOL(show_candidate_comm);
-*/
-static struct task_struct * select_process(void)
-{
- struct task_struct *g, *p;
- struct task_struct *chosen = NULL;
-
- if (!list_empty(&pidqueue_head)) {
- struct list_head *tmp;
- list_for_each_safe(loop_counter, tmp, &pidqueue_head) {
- candidate = list_entry(loop_counter, struct candidate_process, pid_list);
- do_each_thread(g, p)
- if (p->pid == candidate->pid) {
- chosen = p;
- list_del(&candidate->pid_list);
- kfree(candidate);
- goto exit;
- }
- while_each_thread(g, p);
- }
- }
- exit:
- return chosen;
-}
-
-static unsigned long badness(struct task_struct *p, unsigned long uptime)
-{
- unsigned long points, cpu_time, run_time, s;
-
- if (!p->mm)
- return 0;
-
- if (p->flags & PF_MEMDIE)
- return 0;
- /*
- * The memory size of the process is the basis for the badness.
- */
- points = p->mm->total_vm;
-
- /*
- * CPU time is in tens of seconds and run time is in thousands
- * of seconds. There is no particular reason for this other than
- * that it turned out to work very well in practice.
- */
- cpu_time = (p->utime + p->stime) >> (SHIFT_HZ + 3);
-
- if (uptime >= p->start_time.tv_sec)
- run_time = (uptime - p->start_time.tv_sec) >> 10;
- else
- run_time = 0;
-
- s = int_sqrt(cpu_time);
- if (s)
- points /= s;
- s = int_sqrt(int_sqrt(run_time));
- if (s)
- points /= s;
-
- /*
- * Niced processes are most likely less important, so double
- * their badness points.
- */
- if (task_nice(p) > 0)
- points *= 2;
-
- /*
- * Superuser processes are usually more important, so we make it
- * less likely that we kill those.
- */
- if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) ||
- p->uid == 0 || p->euid == 0)
- points /= 4;
-
- /*
- * We don't want to kill a process with direct hardware access.
- * Not only could that mess up the hardware, but usually users
- * tend to only have this flag set on applications they think
- * of as important.
- */
- if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))
- points /= 4;
-#ifdef DEBUG
- printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n",
- p->pid, p->comm, points);
-#endif
- return points;
-}
-
-/*
- * Simple selection loop. We chose the process with the highest
- * number of 'points'. We expect the caller will lock the tasklist.
- *
- * (not docbooked, we don't want this one cluttering up the manual)
- */
-/*
-static struct task_struct * select_bad_process(void)
-{
- unsigned long maxpoints = 0;
- struct task_struct *g, *p;
- struct task_struct *chosen = NULL;
- struct timespec uptime;
-
- do_posix_clock_monotonic_gettime(&uptime);
- do_each_thread(g, p)
- if (p->pid) {
- unsigned long points = badness(p, uptime.tv_sec);
- if (points > maxpoints) {
- chosen = p;
- maxpoints = points;
- }
- if (p->flags & PF_SWAPOFF)
- return p;
- }
- while_each_thread(g, p);
- return chosen;
-}
-*/
-/**
- * We must be careful though to never send SIGKILL a process with
- * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that
- * we select a process with CAP_SYS_RAW_IO set).
- */
-static void __oom_kill_task(task_t *p)
-{
- task_lock(p);
- if (!p->mm || p->mm == &init_mm) {
- WARN_ON(1);
- printk(KERN_WARNING "tried to kill an mm-less task!\n");
- task_unlock(p);
- return;
- }
- task_unlock(p);
- printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm);
-
- /*
- * We give our sacrificial lamb high priority and access to
- * all the memory it needs. That way it should be able to
- * exit() and clear out its resources quickly...
- */
- p->time_slice = HZ;
- p->flags |= PF_MEMALLOC | PF_MEMDIE;
-
- /* This process has hardware access, be more careful. */
- if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) {
- force_sig(SIGTERM, p);
- } else {
- force_sig(SIGKILL, p);
- }
-}
-
-static struct mm_struct *oom_kill_task(task_t *p)
-{
- struct mm_struct *mm = get_task_mm(p);
- if (!mm || mm == &init_mm)
- return NULL;
- __oom_kill_task(p);
- return mm;
-}
-
-
-/**
- * oom_kill - kill the "best" process when we run out of memory
- *
- * If we run out of memory, we have the choice between either
- * killing a random task (bad), letting the system crash (worse)
- * OR try to be smart about which process to kill. Note that we
- * don't have to be perfect here, we just have to be good.
- */
-static void oom_kill(void)
-{
- struct mm_struct *mm;
- struct task_struct *g, *p, *q;
-
- read_lock(&tasklist_lock);
-retry:
- printk(KERN_DEBUG "A good walker leaves no tracks.\n");
- p = select_process();
-
- /* Found nothing?!?! Either we hang forever, or we panic. */
- if (!p) {
- show_free_areas();
- panic("Out of memory and no killable processes...\n");
- }
-
- mm = oom_kill_task(p);
- if (!mm)
- goto retry;
- /*
- * kill all processes that share the ->mm (i.e. all threads),
- * but are in a different thread group
- */
- do_each_thread(g, q)
- if (q->mm == mm && q->tgid != p->tgid)
- __oom_kill_task(q);
- while_each_thread(g, q);
- if (!p->mm)
- printk(KERN_INFO "Fixed up OOM kill of mm-less task\n");
- read_unlock(&tasklist_lock);
- mmput(mm);
-
- /*
- * Make kswapd go out of the way, so "p" has a good chance of
- * killing itself before someone else gets the chance to ask
- * for more memory.
- */
- yield();
- return;
-}
-
-/**
- * out_of_memory - is the system out of memory?
- */
-void out_of_memory(int gfp_mask)
-{
- /*
- * oom_lock protects out_of_memory()'s static variables.
- * It's a global lock; this is not performance-critical.
- */
- static spinlock_t oom_lock = SPIN_LOCK_UNLOCKED;
- static unsigned long first, last, count, lastkill;
- unsigned long now, since;
-
- spin_lock(&oom_lock);
- now = jiffies;
- since = now - last;
- last = now;
-
- /*
- * If it's been a long time since last failure,
- * we're not oom.
- */
- if (since > 5*HZ)
- goto reset;
-
- /*
- * If we haven't tried for at least one second,
- * we're not really oom.
- */
- since = now - first;
- if (since < HZ)
- goto out_unlock;
-
- /*
- * If we have gotten only a few failures,
- * we're not really oom.
- */
- if (++count < 10)
- goto out_unlock;
-
- /*
- * If we just killed a process, wait a while
- * to give that task a chance to exit. This
- * avoids killing multiple processes needlessly.
- */
- since = now - lastkill;
- if (since < HZ*5)
- goto out_unlock;
-
- /*
- * Ok, really out of memory. Kill something.
- */
- lastkill = now;
-
- printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
- show_free_areas();
-
- /* oom_kill() sleeps */
- spin_unlock(&oom_lock);
- oom_kill();
- spin_lock(&oom_lock);
-
-reset:
- /*
- * We dropped the lock above, so check to be sure the variable
- * first only ever increases to prevent false OOM's.
- */
- if (time_after(now, first))
- first = now;
- count = 0;
-
-out_unlock:
- spin_unlock(&oom_lock);
-}
diff -urN linux-2.6.10/mm/oom_kill.c linux-2.6.10-oom/mm/oom_kill.c
--- linux-2.6.10/mm/oom_kill.c 2004-12-24 17:34:57.000000000 -0400
+++ linux-2.6.10-oom/mm/oom_kill.c 2005-01-10 15:53:18.000000000 -0400
@@ -13,13 +13,26 @@
* machine) this file will double as a 'coding guide' and a signpost
* for newbie kernel hackers. It features several pointers to major
* kernel subsystems and hints as to where to find out what things do.
+ *
+ *
+ * 2004
+ * Bruna Moreira <[email protected]>
+ * Edjard Mota <[email protected]>
+ * Ilias Biris <[email protected]>
+ * Mauricio Lin <[email protected]>
+ *
+ * Embedded Linux Lab - 10LE Institulo Nokia de Tecnologia - INdT
+ *
+ * Turn off the kernel space out of memory killer algorithm and provide
+ * support for user space out of memory killer.
*/
-
+#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/swap.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
+#include <linux/oom_kill.h>
/* #define DEBUG */
@@ -42,6 +55,57 @@
* of least surprise ... (be careful when you change it)
*/
+struct candidate_process *candidate;
+
+EXPORT_SYMBOL(candidate);
+
+LIST_HEAD(pidqueue_head);
+
+EXPORT_SYMBOL(pidqueue_head);
+/*
+void show_candidate_comm(void)
+{
+ struct task_struct *g, *p;
+ int i = 0;
+
+ list_for_each(loop_counter, &pidqueue_head) {
+ candidate = list_entry(loop_counter, struct candidate_process, pid_list);
+ do_each_thread(g, p)
+ if (p->pid == candidate->pid) {
+ printk(KERN_DEBUG "A good walker leaves no tracks.%s\n", p->comm);
+ goto outer_loop;
+ }
+ while_each_thread(g, p);
+ outer_loop:
+ continue;
+ }
+}
+
+EXPORT_SYMBOL(show_candidate_comm);
+*/
+static struct task_struct * select_process(void)
+{
+ struct task_struct *g, *p;
+ struct task_struct *chosen = NULL;
+
+ if (!list_empty(&pidqueue_head)) {
+ struct list_head *tmp;
+ list_for_each_safe(loop_counter, tmp, &pidqueue_head) {
+ candidate = list_entry(loop_counter, struct candidate_process, pid_list);
+ do_each_thread(g, p)
+ if (p->pid == candidate->pid) {
+ chosen = p;
+ list_del(&candidate->pid_list);
+ kfree(candidate);
+ goto exit;
+ }
+ while_each_thread(g, p);
+ }
+ }
+ exit:
+ return chosen;
+}
+
static unsigned long badness(struct task_struct *p, unsigned long uptime)
{
unsigned long points, cpu_time, run_time, s;
@@ -111,6 +175,7 @@
*
* (not docbooked, we don't want this one cluttering up the manual)
*/
+/*
static struct task_struct * select_bad_process(void)
{
unsigned long maxpoints = 0;
@@ -132,7 +197,7 @@
while_each_thread(g, p);
return chosen;
}
-
+*/
/**
* We must be careful though to never send SIGKILL a process with
* CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that
@@ -191,7 +256,8 @@
read_lock(&tasklist_lock);
retry:
- p = select_bad_process();
+ printk(KERN_DEBUG "A good walker leaves no tracks.\n");
+ p = select_process();
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
******************
Module oom.c
******************
/* 2005
 * Bruna Moreira <[email protected]>
 * Edjard Mota <[email protected]>
 * Ilias Biris <[email protected]>
 * Mauricio Lin <[email protected]>
 *
 * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
 *
 * Create a /proc/oom entry through which a user space Out of Memory
 * Killer can pass its list of pids to the kernel. When out of memory
 * happens, the kernel selects the process to be killed from the
 * /proc/oom list.
 *
 * Approach suggested by Tony Lindgren <[email protected]>
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/sched.h>
#include <linux/oom_kill.h>
#include <asm/uaccess.h>

#define MODULE_NAME "oom"

extern struct candidate_process *candidate;
static unsigned int nr_pids;
static struct proc_dir_entry *oom_file;
extern struct list_head pidqueue_head;
static DECLARE_MUTEX(user_oom_lock);
static unsigned long size_of_bytes;

static inline void add_to_pidqueue(struct candidate_process * p)
{
	list_add_tail(&p->pid_list, &pidqueue_head);
	nr_pids++;
	//printk(KERN_DEBUG "add nr_pids = %d (%d) \n", nr_pids, p->pid);
}

static inline void del_from_pidqueue(struct candidate_process * p)
{
	nr_pids--;
	//printk(KERN_DEBUG "delete nr_pids = %d \n", nr_pids);
	list_del(&p->pid_list);
}

static inline void del_all_from_pidqueue(void)
{
	if (!list_empty(&pidqueue_head)) {
		struct list_head *tmp;
		list_for_each_safe(loop_counter, tmp, &pidqueue_head) {
			candidate = list_entry(loop_counter, struct candidate_process, pid_list);
			del_from_pidqueue(candidate);
			kfree(candidate);
		}
	}
}
static int proc_read_oom(char *page, char **start,
			 off_t off, int count,
			 int *eof, void *data)
{
	int len = 0;
	char *output;
	char item[16];		/* big enough for "<pid> " */

	if (!list_empty(&pidqueue_head)) {
		output = kmalloc(size_of_bytes + 2, GFP_KERNEL);
		if (!output)
			return -ENOMEM;
		output[0] = '\0';
		list_for_each(loop_counter, &pidqueue_head) {
			candidate = list_entry(loop_counter, struct candidate_process, pid_list);
			sprintf(item, "%d ", candidate->pid);
			strcat(output, item);
		}
		strcat(output, "\n");
		/* never pass the buffer itself as a format string */
		len = sprintf(page, "%s", output);
		kfree(output);
	} else {
		len = sprintf(page, "\n");
	}
	return len;
}
static int proc_write_oom(struct file *file,
			  const char *buffer,
			  unsigned long count,
			  void *data)
{
	unsigned long len;
	char *input, *item;
	int prev_index = 0;
	int i;

	input = kmalloc(count + 2, GFP_KERNEL);
	if (!input)
		return -ENOMEM;
	if (copy_from_user(input, buffer, count)) {
		kfree(input);
		return -EFAULT;
	}
	/* force a trailing delimiter so the last pid is parsed too,
	 * and terminate the string before calling strlen() */
	input[count] = ' ';
	input[count + 1] = '\0';
	len = strlen(input);

	if (down_interruptible(&user_oom_lock)) {
		kfree(input);
		return -EINTR;
	}
	del_all_from_pidqueue();
	for (i = 0; i < len; i++) {
		if (input[i] == ' ') {
			/* terminate this token in place */
			input[i] = '\0';
			item = &input[prev_index];
			prev_index = i + 1;
			candidate = kmalloc(sizeof(struct candidate_process), GFP_KERNEL);
			if (!candidate)
				break;
			candidate->pid = simple_strtoul(item, NULL, 10);
			add_to_pidqueue(candidate);
		}
	}
	size_of_bytes = len;
	up(&user_oom_lock);
	kfree(input);
	return count;
}
static int __init init_oom(void)
{
	oom_file = create_proc_entry("oom", S_IRUGO | S_IWUSR, NULL);
	if (!oom_file)
		return -ENOMEM;
	oom_file->read_proc = proc_read_oom;
	oom_file->write_proc = proc_write_oom;
	oom_file->owner = THIS_MODULE;
	nr_pids = 0;
	size_of_bytes = 0;
	printk(KERN_DEBUG "%s included\n", MODULE_NAME);
	return 0;
}

static void __exit cleanup_oom(void)
{
	del_all_from_pidqueue();
	remove_proc_entry("oom", NULL);
	printk(KERN_DEBUG "%s removed\n", MODULE_NAME);
}

module_init(init_oom);
module_exit(cleanup_oom);

MODULE_LICENSE("GPL");
***********************
Makefile for module
***********************
obj-m := oom.o
MAKE := /usr/bin/make
KDIR := $(HOME)/linux-2.6.10-oom
PWD := $(PWD)
default:
	$(MAKE) -C $(KDIR) SUBDIRS=$(PWD) modules
************************
README
************************
1. Getting the kernel

Create a directory called "download" in your home directory and go to it:

# mkdir download
# cd download

Download the kernel 2.6.10 from
ftp://ftp.kernel.org/pub/linux/kernel/v2.6/linux-2.6.10.tar.bz2 to the
directory you have created previously.

Extract it in your home directory as below:

# tar xjvf download/linux-2.6.10.tar.bz2

Rename the kernel directory to linux-2.6.10-oom:

# mv linux-2.6.10 linux-2.6.10-oom

Go to the kernel source tree and patch the kernel using "2.6.10.oom.patch":

# cd linux-2.6.10-oom
# patch -p1 < ../2.6.10.oom.patch

The command above assumes that you have put the patch file in your
home directory.

Configure your kernel using "make menuconfig" or "make xconfig".

2. Compiling the kernel module

After compiling your kernel you have to compile the kernel module "oom.c".
Put the Makefile and oom.c in the same directory, then type make in order
to create "oom.ko":

# make

After typing make, check that "oom.ko" was created in your current
directory.

3. Running the oom module

Insert the module as root:

# insmod oom.ko

Check that /proc/oom was created on your system:

# cat /proc/oom
#
# echo 11 22 33 > /proc/oom
# cat /proc/oom
11 22 33

4. User OOM (Optional)

After that you can compile user_oom.c as below (you can compile it as a
normal user):

# gcc user_oom.c -o user_oom -lm

You have to run it as root:

# ./user_oom
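
Since user_oom.c is sent separately, here is a minimal sketch of what
such a user space ranker could look like. It is an illustration only,
not the actual user_oom.c: the ranking formula, the 1024-pid cap and
the use of /proc/<pid>/statm are assumptions.

/*
 * user_oom sketch: rank all pids in user space and feed the ordered
 * list to /proc/oom.  Build: gcc user_oom_sketch.c -o user_oom_sketch -lm
 */
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <ctype.h>
#include <math.h>

struct cand { int pid; double points; };

static int by_points_desc(const void *a, const void *b)
{
	const struct cand *x = a, *y = b;
	return (y->points > x->points) - (y->points < x->points);
}

int main(void)
{
	struct cand cand[1024];
	int n = 0, i;
	struct dirent *de;
	char path[64];
	FILE *f;
	DIR *dir = opendir("/proc");

	if (!dir)
		return 1;
	while (n < 1024 && (de = readdir(dir)) != NULL) {
		long size, resident;

		if (!isdigit((unsigned char)de->d_name[0]))
			continue;
		snprintf(path, sizeof(path), "/proc/%s/statm", de->d_name);
		f = fopen(path, "r");
		if (!f)
			continue;
		if (fscanf(f, "%ld %ld", &size, &resident) == 2) {
			cand[n].pid = atoi(de->d_name);
			/* float rating, so ties are far less likely than
			 * with the kernel's integer arithmetic */
			cand[n].points = size * sqrt(resident + 1.0);
			n++;
		}
		fclose(f);
	}
	closedir(dir);

	qsort(cand, n, sizeof(cand[0]), by_points_desc);

	/* /proc/oom parses a space-separated pid list; write the
	 * worst offenders first */
	f = fopen("/proc/oom", "w");
	if (!f)
		return 1;
	for (i = 0; i < n; i++)
		fprintf(f, "%d ", cand[i].pid);
	fprintf(f, "\n");
	fclose(f);
	return 0;
}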
************************
TODO
************************
Reorganize the kernel and user space code (remove some superfluous
stuff or move it to a more suitable file) in order to reduce stack
usage and make the code more readable and clear.

Change some static allocations to dynamic ones, e.g. arrays to linked
lists.

Create a /dev/oom to get the memory info instead of reading it from
/proc/meminfo continuously.

Suggestions are welcome. The user space OOM application (optional)
will be sent later.
BR,
Mauricio Lin.
On Mon, Jan 10, 2005 at 05:43:23PM -0400, Mauricio Lin wrote:
> Hi all,
>
> We have done a comparison between the kernel version and user space
> version and apparently the behavior is similar. You can also get this
> patch and module to test it and compare with kernel OOM Killer. Here
> goes a patch and a module that moves the kernel space OOM Killer
> algorithm to user space. Let us know about your ideas.
No comments on the code itself - it is interesting to have certain pids
"not selectable" by the OOM killer. Patches with similar functionality
have been floating around.

The userspace OOM killer is dangerous though. You have to guarantee that
allocations will NOT happen until the OOM killer is executed and the
killed process is dead and has its pages freed - allocations under OOM
can cause deadlocks.

"OOM-killer-in-userspace" is unreliable; I'm not sure it's worth the
effort to make it reliable (mlock it, flag it PF_MEMALLOC, etc.).
On Mon, Jan 10, 2005 at 05:20:13PM -0200, Marcelo Tosatti wrote:
> On Mon, Jan 10, 2005 at 05:43:23PM -0400, Mauricio Lin wrote:
> > Hi all,
> >
> > We have done a comparison between the kernel version and user space
> > version and apparently the behavior is similar. You can also get this
> > patch and module to test it and compare with kernel OOM Killer. Here
> > goes a patch and a module that moves the kernel space OOM Killer
> > algorithm to user space. Let us know about your ideas.
>
> No comments on the code itself - It is interesting to have certain pids "not selectable" by
> the OOM killer. Patches which have similar funcionality have been floating around.
>
> The userspace OOM killer is dangerous though. You have to guarantee that allocations
> will NOT happen until the OOM killer is executed and the killed process is dead and
> has its pages freed - allocations under OOM can cause deadlocks.
>
> "OOM-killer-in-userspace" is unreliable, not sure if its worth the effort making
> it reliable (mlock it, flagged as PF_MEMALLOC, etc).
Actually it's only unreliable if it's invoked at OOM time.

The case here is that you have a daemon which periodically writes
to /proc/oom?
Hi,

I guess the idea was not fully and well explained. It is not the OOM
Killer itself that was moved to user space, but rather its ranking
algorithm. Ranking is not a functionality specific to kernel space; the
kernel only needs to know which process should be killed.

In that sense the approach is different and might be worth testing,
mainly for cases where we want to allow better ranking policies. For
example, on an embedded device with few resources and several important
running applications: which one is the best to kill? To my understanding
the current ranking policy does not necessarily choose the best one.

br

Edjard
On Mon, 10 Jan 2005 17:20:13 -0200, Marcelo Tosatti
<[email protected]> wrote:
> On Mon, Jan 10, 2005 at 05:43:23PM -0400, Mauricio Lin wrote:
> > Hi all,
> >
> > We have done a comparison between the kernel version and user space
> > version and apparently the behavior is similar. You can also get this
> > patch and module to test it and compare with kernel OOM Killer. Here
> > goes a patch and a module that moves the kernel space OOM Killer
> > algorithm to user space. Let us know about your ideas.
>
> No comments on the code itself - It is interesting to have certain pids "not selectable" by
> the OOM killer. Patches which have similar funcionality have been floating around.
>
> The userspace OOM killer is dangerous though. You have to guarantee that allocations
> will NOT happen until the OOM killer is executed and the killed process is dead and
> has its pages freed - allocations under OOM can cause deadlocks.
>
> "OOM-killer-in-userspace" is unreliable, not sure if its worth the effort making
> it reliable (mlock it, flagged as PF_MEMALLOC, etc).
--
"In a world without fences ... who needs Gates?"
On Mon, 10 Jan 2005 17:39:53 -0200, Marcelo Tosatti
<[email protected]> wrote:
> On Mon, Jan 10, 2005 at 05:20:13PM -0200, Marcelo Tosatti wrote:
> > On Mon, Jan 10, 2005 at 05:43:23PM -0400, Mauricio Lin wrote:
> > > Hi all,
> > >
> > > We have done a comparison between the kernel version and user space
> > > version and apparently the behavior is similar. You can also get this
> > > patch and module to test it and compare with kernel OOM Killer. Here
> > > goes a patch and a module that moves the kernel space OOM Killer
> > > algorithm to user space. Let us know about your ideas.
> >
> > No comments on the code itself - It is interesting to have certain pids "not selectable" by
> > the OOM killer. Patches which have similar funcionality have been floating around.
> >
> > The userspace OOM killer is dangerous though. You have to guarantee that allocations
> > will NOT happen until the OOM killer is executed and the killed process is dead and
> > has its pages freed - allocations under OOM can cause deadlocks.
> >
> > "OOM-killer-in-userspace" is unreliable, not sure if its worth the effort making
> > it reliable (mlock it, flagged as PF_MEMALLOC, etc).
>
> Actually its only unreliable if its called from OOM time.
>
> The case here is you have a daemon which periodically writes
> to /proc/oom ?
Yes, let me explain the idea.

When memory consumption reaches a certain percentage of usage, say 98%
or something like that, we call this the red zone. When the red zone is
reached, the ranking algorithm is started to select which processes
could be killed when out of memory happens. If memory consumption is
below this percentage (not in the red zone), the ranking algorithm is
not executed. So we have a loop that checks memory consumption all the
time, and if the red zone is reached, the ranking algorithm is started
before the system gets into the out of memory state.
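
Roughly, the daemon loop could look like the sketch below. This is only
an illustration of the idea: the 98% threshold and the 1-second poll are
examples, rank_and_write_pids() is a hypothetical helper standing in for
the ranking step, and a real version would also account for buffers and
cache when computing consumption.

#include <stdio.h>
#include <unistd.h>

static void rank_and_write_pids(void)
{
	/* hypothetical: compute the ratings and write the ordered
	 * pid list to /proc/oom */
}

int main(void)
{
	char line[128];

	for (;;) {
		unsigned long total = 0, memfree = 0;
		FILE *f = fopen("/proc/meminfo", "r");

		if (f) {
			while (fgets(line, sizeof(line), f)) {
				sscanf(line, "MemTotal: %lu", &total);
				sscanf(line, "MemFree: %lu", &memfree);
			}
			fclose(f);
		}
		/* red zone: more than 98% of memory in use */
		if (total && (total - memfree) * 100 >= total * 98)
			rank_and_write_pids();
		sleep(1);
	}
}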
BR,
Mauricio Lin.
On Tue, Jan 11, 2005 at 12:40:24AM +0200, Edjard Souza Mota wrote:
> Hi,
>
> I guess it the idea was not fully and well explained. It is not the OOM Killer
> itself that was moved to user space but rather its ranking algorithm.
> Ranking is not an specific functionality of kernel space. Kernel only need
> to know which process whould be killed.
>
> In that sense the approach is different and might be worth testing, mainly for
> cases where we want to allow better policies of ranking. For example, an
> embedded device with few resources and important different running applications:
> whic one is the best? To my understanding the current ranking policy
> does not necessarily chooses the best one to be killed.
Sorry, I misunderstood. Should have read the code before shouting.
The feature is interesting - several patches with similar functionality
have been around (people who need it usually write their own; I've seen
a few), but none has ever been merged, even though it is an important
requirement for many users.

This is simple: an ordered list of candidate PIDs. IMO something similar
should be merged. Andrew?

A few comments about the code:
retry:
- p = select_bad_process();
+ printk(KERN_DEBUG "A good walker leaves no tracks.\n");
+ p = select_process();
You want to fall back to select_bad_process() if no candidate has been
selected by select_process().
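
Something like this sketch (it assumes select_bad_process() stays
compiled in rather than being commented out):

retry:
	printk(KERN_DEBUG "A good walker leaves no tracks.\n");
	p = select_process();
	/* no usable candidate from userspace: fall back to the
	 * in-kernel ranking */
	if (!p)
		p = select_bad_process();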
You also want to move "oom" to /proc/sys/vm/.
Hi,
>
> Sorry, I misunderstood. Should have read the code before shouting.
Better shouting then shooting :)!
br
Edjard
>
> The feature is interesting - several similar patches have been around with similar
> functionality (people who need usually write their own, I've seen a few), but none
> has ever been merged, even though it is an important requirement for many users.
>
> This is simple, an ordered list of candidate PIDs. IMO something similar to this
> should be merged. Andrew ?
>
> Few comments about the code:
>
> retry:
> - p = select_bad_process();
> + printk(KERN_DEBUG "A good walker leaves no tracks.\n");
> + p = select_process();
>
> You want to fallback to select_bad_process() if no candidate has been selected at
> select_process().
>
> You also want to move "oom" to /proc/sys/vm/.
>
>
--
"In a world without fences ... who needs Gates?"
On Mon, 10 Jan 2005 18:05:14 -0200, Marcelo Tosatti
<[email protected]> wrote:
> On Tue, Jan 11, 2005 at 12:40:24AM +0200, Edjard Souza Mota wrote:
> > Hi,
> >
> > I guess it the idea was not fully and well explained. It is not the OOM Killer
> > itself that was moved to user space but rather its ranking algorithm.
> > Ranking is not an specific functionality of kernel space. Kernel only need
> > to know which process whould be killed.
> >
> > In that sense the approach is different and might be worth testing, mainly for
> > cases where we want to allow better policies of ranking. For example, an
> > embedded device with few resources and important different running applications:
> > whic one is the best? To my understanding the current ranking policy
> > does not necessarily chooses the best one to be killed.
>
> Sorry, I misunderstood. Should have read the code before shouting.
>
> The feature is interesting - several similar patches have been around with similar
> functionality (people who need usually write their own, I've seen a few), but none
> has ever been merged, even though it is an important requirement for many users.
>
> This is simple, an ordered list of candidate PIDs. IMO something similar to this
> should be merged. Andrew ?
>
> Few comments about the code:
>
> retry:
> - p = select_bad_process();
> + printk(KERN_DEBUG "A good walker leaves no tracks.\n");
> + p = select_process();
>
> You want to fallback to select_bad_process() if no candidate has been selected at
> select_process().
The idea is to turn off select_bad_process(); the new select_process()
will get the list of pids to be killed from /proc/oom. But the ranking
algorithm is the same, I mean the Rik van Riel algorithm. Do you think
it is worthwhile to keep select_bad_process() (the kernel space
algorithm) if we have the select_process() function?
>
> You also want to move "oom" to /proc/sys/vm/.
That is possible. Do you think it is a good place to move the oom file to?
BR,
Mauricio Lin.
Hi all,
I noticed something wrong with the first patch. Here goes the fixed patch:
***********
PATCH
***********
diff -urN linux-2.6.10/fs/proc/array.c linux-2.6.10-oom/fs/proc/array.c
--- linux-2.6.10/fs/proc/array.c 2004-12-24 17:35:00.000000000 -0400
+++ linux-2.6.10-oom/fs/proc/array.c 2005-01-10 15:42:26.000000000 -0400
@@ -470,3 +470,13 @@
return sprintf(buffer,"%d %d %d %d %d %d %d\n",
size, resident, shared, text, lib, data, 0);
}
+
+int proc_pid_oom(struct task_struct *task, char * buffer)
+{
+ int res;
+ res = sprintf(buffer, "%d %lu %lu\n",
+ task->pid,
+ task->utime,
+ task->stime);
+ return res;
+}
diff -urN linux-2.6.10/fs/proc/base.c linux-2.6.10-oom/fs/proc/base.c
--- linux-2.6.10/fs/proc/base.c 2004-12-24 17:35:00.000000000 -0400
+++ linux-2.6.10-oom/fs/proc/base.c 2005-01-10 15:42:26.000000000 -0400
@@ -60,6 +60,7 @@
PROC_TGID_MAPS,
PROC_TGID_MOUNTS,
PROC_TGID_WCHAN,
+ PROC_TGID_OOM,
#ifdef CONFIG_SCHEDSTATS
PROC_TGID_SCHEDSTAT,
#endif
@@ -86,6 +87,7 @@
PROC_TID_MAPS,
PROC_TID_MOUNTS,
PROC_TID_WCHAN,
+ PROC_TID_OOM,
#ifdef CONFIG_SCHEDSTATS
PROC_TID_SCHEDSTAT,
#endif
@@ -132,6 +134,7 @@
#ifdef CONFIG_SCHEDSTATS
E(PROC_TGID_SCHEDSTAT, "schedstat", S_IFREG|S_IRUGO),
#endif
+ E(PROC_TGID_OOM, "oom", S_IFREG|S_IRUGO),
{0,0,NULL,0}
};
static struct pid_entry tid_base_stuff[] = {
@@ -157,6 +160,7 @@
#ifdef CONFIG_SCHEDSTATS
E(PROC_TID_SCHEDSTAT, "schedstat",S_IFREG|S_IRUGO),
#endif
+ E(PROC_TID_OOM, "oom", S_IFREG|S_IRUGO),
{0,0,NULL,0}
};
@@ -193,6 +197,7 @@
int proc_tgid_stat(struct task_struct*,char*);
int proc_pid_status(struct task_struct*,char*);
int proc_pid_statm(struct task_struct*,char*);
+int proc_pid_oom(struct task_struct*,char*);
static int proc_fd_link(struct inode *inode, struct dentry **dentry,
struct vfsmount **mnt)
{
@@ -1377,6 +1382,11 @@
ei->op.proc_read = proc_pid_schedstat;
break;
#endif
+ case PROC_TID_OOM:
+ case PROC_TGID_OOM:
+ inode->i_fop = &proc_info_file_operations;
+ ei->op.proc_read = proc_pid_oom;
+ break;
default:
printk("procfs: impossible type (%d)",p->type);
iput(inode);
diff -urN linux-2.6.10/include/linux/oom_kill.h linux-2.6.10-oom/include/linux/oom_kill.h
--- linux-2.6.10/include/linux/oom_kill.h 1969-12-31 20:00:00.000000000 -0400
+++ linux-2.6.10-oom/include/linux/oom_kill.h 2005-01-10 15:42:26.000000000 -0400
@@ -0,0 +1,6 @@
+struct candidate_process {
+ pid_t pid;
+ struct list_head pid_list;
+};
+
+struct list_head *loop_counter;
diff -urN linux-2.6.10/mm/oom_kill.c linux-2.6.10-oom/mm/oom_kill.c
--- linux-2.6.10/mm/oom_kill.c 2004-12-24 17:34:57.000000000 -0400
+++ linux-2.6.10-oom/mm/oom_kill.c 2005-01-10 17:56:34.000000000 -0400
@@ -13,13 +13,26 @@
* machine) this file will double as a 'coding guide' and a signpost
* for newbie kernel hackers. It features several pointers to major
* kernel subsystems and hints as to where to find out what things do.
+ *
+ *
+ * 2005
+ * Bruna Moreira <[email protected]>
+ * Edjard Mota <[email protected]>
+ * Ilias Biris <[email protected]>
+ * Mauricio Lin <[email protected]>
+ *
+ * Embedded Linux Lab - 10LE Institulo Nokia de Tecnologia - INdT
+ *
+ * Turn off the kernel space out of memory killer algorithm and provide
+ * support for user space out of memory killer.
*/
-
+#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/swap.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
+#include <linux/oom_kill.h>
/* #define DEBUG */
@@ -42,6 +55,57 @@
* of least surprise ... (be careful when you change it)
*/
+struct candidate_process *candidate;
+
+EXPORT_SYMBOL(candidate);
+
+LIST_HEAD(pidqueue_head);
+
+EXPORT_SYMBOL(pidqueue_head);
+/*
+void show_candidate_comm(void)
+{
+ struct task_struct *g, *p;
+ int i = 0;
+
+ list_for_each(loop_counter, &pidqueue_head) {
+ candidate = list_entry(loop_counter, struct candidate_process, pid_list);
+ do_each_thread(g, p)
+ if (p->pid == candidate->pid) {
+ printk(KERN_DEBUG "A good walker leaves no tracks.%s\n", p->comm);
+ goto outer_loop;
+ }
+ while_each_thread(g, p);
+ outer_loop:
+ continue;
+ }
+}
+
+EXPORT_SYMBOL(show_candidate_comm);
+*/
+static struct task_struct * select_process(void)
+{
+ struct task_struct *g, *p;
+ struct task_struct *chosen = NULL;
+
+ if (!list_empty(&pidqueue_head)) {
+ struct list_head *tmp;
+ list_for_each_safe(loop_counter, tmp, &pidqueue_head) {
+ candidate = list_entry(loop_counter, struct candidate_process, pid_list);
+ do_each_thread(g, p)
+ if (p->pid == candidate->pid) {
+ chosen = p;
+ list_del(&candidate->pid_list);
+ kfree(candidate);
+ goto exit;
+ }
+ while_each_thread(g, p);
+ }
+ }
+ exit:
+ return chosen;
+}
+
static unsigned long badness(struct task_struct *p, unsigned long uptime)
{
unsigned long points, cpu_time, run_time, s;
@@ -111,6 +175,7 @@
*
* (not docbooked, we don't want this one cluttering up the manual)
*/
+/*
static struct task_struct * select_bad_process(void)
{
unsigned long maxpoints = 0;
@@ -132,7 +197,7 @@
while_each_thread(g, p);
return chosen;
}
-
+*/
/**
* We must be careful though to never send SIGKILL a process with
* CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that
@@ -191,7 +256,8 @@
read_lock(&tasklist_lock);
retry:
- p = select_bad_process();
+ printk(KERN_DEBUG "A good walker leaves no tracks.\n");
+ p = select_process();
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
BR,
Mauricio Lin.
actually 'than' is much better then :-)
On Tue, 11 Jan 2005 01:17:01 +0200, Edjard Souza Mota <[email protected]> wrote:
> Hi,
>
> >
> > Sorry, I misunderstood. Should have read the code before shouting.
>
> Better shouting then shooting :)!
>
> br
>
> Edjard
>
>
> >
> > The feature is interesting - several similar patches have been around with similar
> > functionality (people who need usually write their own, I've seen a few), but none
> > has ever been merged, even though it is an important requirement for many users.
> >
> > This is simple, an ordered list of candidate PIDs. IMO something similar to this
> > should be merged. Andrew ?
> >
> > Few comments about the code:
> >
> > retry:
> > - p = select_bad_process();
> > + printk(KERN_DEBUG "A good walker leaves no tracks.\n");
> > + p = select_process();
> >
> > You want to fallback to select_bad_process() if no candidate has been selected at
> > select_process().
> >
> > You also want to move "oom" to /proc/sys/vm/.
> >
> >
>
> --
> "In a world without fences ... who needs Gates?"
>
--
"In a world without fences ... who needs Gates?"
On Mon, 2005-01-10 at 18:05 -0200, Marcelo Tosatti wrote:
> The feature is interesting - several similar patches have been around with similar
> functionality (people who need usually write their own, I've seen a few), but none
> has ever been merged, even though it is an important requirement for many users.
It's not a requirement for users. The current implementation in the
kernel is just broken, ugly code.
> This is simple, an ordered list of candidate PIDs. IMO something similar to this
> should be merged. Andrew ?
I have no objections against the userspace provided candidate list
option, but as long as the main sources of trouble

- invocation
- reentrancy
- timed, counted, blah ugly protection
- selection problem

are not fixed properly, we don't need to discuss the inclusion of a
userspace provided candidate list.

Postpone this until the main problem is fixed. There is a proper,
confirmed fix for this available. It was posted more than once.

Merging a fix which helps only 0.001% of the users to hide the mess
instead of fixing the real problem is a really interesting engineering
approach.

I don't deny that, after the source of trouble is fixed, it is worth
thinking about merging this addon to allow interested users to define
the culprits instead of relying on an always imperfect selection
algorithm.
tglx
Some points on Thomas' comments:
>
> I have no objections against the userspace provided candidate list
> option, but as long as the main sources of trouble
>
> - invocation
> - reentrancy
> - timed, counted, blah ugly protection
> - selection problem
>
> are not fixed properly, we don't need to discuss the inclusion of a
> userspace provided candidate list.
Any solution that doesn't offer a proper approach to the above issues
should not be discussed anyway. Moving the ranking up to user space is
meant not only to let users test ranking policies, but to keep the OOM
Killer kernel code simpler and cleaner - as a matter of fact, even
protected.

Consider the invocation, for example. It comes in two phases with this
proposal:
1) ranking of the most likely culprits only starts when memory
consumption gets close to the red zone (for example 98% or something
like that).
2) killing just takes the first candidate from the list and kills it.
No need to calculate anything at kernel level.

The selection problem is very dependent on the ranking algorithm. For
PCs it may not be a problem, but for embedded devices it is. The ranking
at kernel level uses only integer arithmetic. If you look at the ranking
log of any embedded device you will notice that many processes end up
with the same ranking points, so the best choice can never be made this
way. Moving just the ranking to user space fixes this, because you may
use floating point to order PIDs with distinct indexes. The good side
effect is that we allow better ways of choosing the culprit, by means of
different calculations to match different patterns of memory
consumption.
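
As a toy illustration of the tie problem (made-up numbers, not from a
real ranking log):

#include <stdio.h>

int main(void)
{
	/* two tasks with slightly different memory footprints divided
	 * by the same sqrt(cpu_time) factor, as badness() does */
	unsigned long a = 5000 / 7, b = 5003 / 7;
	double fa = 5000.0 / 7.0, fb = 5003.0 / 7.0;

	printf("integer points: %lu vs %lu\n", a, b);   /* 714 vs 714: a tie */
	printf("float points: %.2f vs %.2f\n", fa, fb); /* distinguishable */
	return 0;
}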
> Postpone this until the main problem is fixed. There is a proper
> confirmed fix for this available. It was posted more than once.
>
> Merging a fix which helps only 0,001 % of the users to hide the mess
> instead of fixing the real problem is a real interesting engineering
> aproach.
>
> I don't deny, that after the source of trouble is fixed it is worth to
> think about the merging of this addon to allow interested users to
> define the culprits instead of relying on an always imperfect selection
> algorithm.
br
Edjard
--
"In a world without fences ... who needs Gates?"
On Tue, Jan 11, 2005 at 01:35:47AM +0100, Thomas Gleixner wrote:
> confirmed fix for this available. It was posted more than once.
I posted 6 patches (1/4,2/4,3/4,4/4,5/4,6/4); they should all be
applied to mainline, and they're self-contained. They add the userspace
ratings too.

Those patches fix a longstanding PF_MEMDIE race too, and they optimize
used_math as well.

I'm running with all 6 patches applied, with an uptime of 6 days on SMP
and no problems at all. All 6 patches are applied to the kotd too
(plus the other bits posted on l-k as well for the write throttling;
just one bit is still missing, but I'll add it soon):
ftp://ftp.suse.com/pub/projects/kernel/kotd/i386/HEAD
On Tue, 2005-01-11 at 04:03 +0200, Edjard Souza Mota wrote:
> > I have no objections against the userspace provided candidate list
> > option, but as long as the main sources of trouble
> >
> > - invocation
> > - reentrancy
> > - timed, counted, blah ugly protection
> > - selection problem
> >
> > are not fixed properly, we don't need to discuss the inclusion of a
> > userspace provided candidate list.
>
> Any solution that doesn't offer a proper approach to the above issues
> should not be discussed anyway. By allowing the ranking goes up to the
> user space is not meant only for user testing ranking, but to keep the
> OOM Killer kernel code simpler and clean. As a matter of fact, even
> protected.
>
> Consider the invocation for example. It comes in two phases with this proposal:
I consider the invocation of out_of_memory in the first place. This is
the real root of the problems. The ranking is a different playground.
Your solution does not solve
- invocation madness
- reentrancy protection
- the ugly mess of timers, counters... in out_of_memory, which isn't
necessary at all

This must be solved first in a proper way, before we talk about ranking.
You are definitely curing the symptom instead of the cause.
> 1) ranking for the most likely culprits only starts when memory consumption
> gets close to the red zone (for example 98% or something like that).
> 2) killing just gets the first candidate from the list and kills it.
> No need to calculate
> at kernel level.
What is the default behaviour when no userspace settings are available -
Nothing ? Are you really expecting that we change every root fs in order
to be able to upgrade the kernel for solving this _kernel_ problem ?
Who is setting up those userspace constraints ? Joe User, who is barely
able to find the power on button on his box ? The sysadmin, who will
have to adjust the list for each box depending on the apps it runs or
the user who is logged into the box ?
Memory management _is_ a kernel task and so the shortage of memory has
to be handled by the kernel on its own in the first place. Adding user
space support for certain tasks is a good thing, but not a solution to
the problem itself.
> The selection problem is very dependent on the ranking algorithm. For PCs it
> may not be a trouble, but for emdedded devices? yes it is. The ranking at the
> kernel level uses only int type of integer. If you get the log file
> for the ranking
> in any embedded device you will notice that many processes end up with
> the same ranking point. Thus, there will never be the best choice in this way.
I know the constraints of embedded boxes well enough to know that there
is a bit of a difference from a desktop machine.
> By moving just the ranking to the user space fix this problem 'cause you may
> use float to order PIDs with different indexes. The good side effect is that we
> allow better ways of choosing the culprit by means of diffrent calculations to
> meet different patterns of memory consumtion.
I'm running Andrea's and my combined fixes on a couple of embedded and
desktop boxes and it has proven to be a proper in kernel solution for
the in kernel problem.
I don't argue against the ability to provide a culprit list to the
kernel, but as I said before it can only be an optional addon to a
proper in-kernel solution.
tglx
On Tue, Jan 11, 2005 at 09:44:53AM +0100, Thomas Gleixner wrote:
> I consider the invocation of out_of_memory in the first place. This is
> the real root of the problems. The ranking is a different playground.
> Your solution does not solve
> - invocation madness
> - reentrancy protection
> - the ugly mess of timers, counters... in out_of_memory, which aren't
> neccecary at all
Thomas, you're obviously right; it's not even worth discussing this.
The 6 patches I posted (and my version is the only one that includes all
the outstanding fixes) have to be applied. Then we can think about the
rest.

Rik's two patches (writeback-highmem and writeback_nr_scanned) should be
applied too, since they're obviously right as well (and they're
completely orthogonal to our 6). Rik's 2/2 looked more like a hack and
shouldn't be applied.
Andrea Arcangeli <[email protected]> wrote:
>
> The 6 patches I posted
I have the original versions of these saved away but they generate a ton
of rejects now. When you sync them up with Linus's current tree, could
you please resend them all?
On Tue, Jan 11, 2005 at 01:08:27AM -0800, Andrew Morton wrote:
> I have the original versions of these saved away but they generate a ton of
> rejects now. When you sync them up to Linus's current tree could you pleae
> resend them all?
Can I trust the kernel CVS to be up to date? I normally use it for such
things, but I'd prefer to be sure that I can trust it before risking
wasting time (it has been unstable recently, so I was working with
patches in the meantime).
Hi,
> I consider the invocation of out_of_memory in the first place. This is
> the real root of the problems. The ranking is a different playground.
> Your solution does not solve
> - invocation madness
> - reentrancy protection
> - the ugly mess of timers, counters... in out_of_memory, which aren't
> neccecary at all
You're 100% right! It was not the purpose to solve that in the first
place, but rather to move the ranking algorithm out of the kernel.
Hmmm, it seems you removed the selection problem from your original
list; that means we made our point. Thank you!
> This must be solved first in a proper way, before we talk about ranking.
>
> You are definitely curing the symptom instead of the cause.
>
> > 1) ranking for the most likely culprits only starts when memory consumption
> > gets close to the red zone (for example 98% or something like that).
> > 2) killing just gets the first candidate from the list and kills it.
> > No need to calculate
> > at kernel level.
>
> What is the default behaviour when no userspace settings are available -
> Nothing ? Are you really expecting that we change every root fs in order
> to be able to upgrade the kernel for solving this _kernel_ problem ?
No, I certainly don't. But have you seen the application we also
posted? It is a test for a while: it starts a daemon when you boot the
kernel and rates this application, i.e. an application with root rating
priority, so it will never be killed and never lack space for itself.
So, there is the answer to your 2nd very good point.
But as I wrote above, it is an idea to be tested, not meant to impose a
change in every root fs.
> Who is setting up those userspace constraints ? Joe User, who is barely
> able to find the power on button on his box ? The sysadmin, who will
> have to adjust the list for each box depending on the apps it runs or
> the user who is logged into the box ?
Well, from the discussion below and your reasonable argument, it is
clear that we are thinking of it for embedded devices in the first
place.
> Memory management _is_ a kernel task and so the shortage of memory has
> to be handled by the kernel on its own in the first place. Adding user
> space support for certain tasks is a good thing, but not a solution to
> the problem itself.
Yes, you're 100% right again. Sorry, but does this mean that for
everything it manages the kernel should compute ratings per PID? It
seems that only tests can show which way is more appropriate. Perhaps
they may end up showing that both ways reach the same result, but with
different algorithmic ingenuity. We shall work on this test data and
send it soon. Thank you.
>
> > The selection problem is very dependent on the ranking algorithm. For PCs it
> > may not be a trouble, but for emdedded devices? yes it is. The ranking at the
> > kernel level uses only int type of integer. If you get the log file
> > for the ranking
> > in any embedded device you will notice that many processes end up with
> > the same ranking point. Thus, there will never be the best choice in this way.
>
> I know the constrains of embedded boxes well enough to know that there
> is a bit of a difference to a desktop machine.
I didn't mean you don't know. Don't take it personally.
>
> > By moving just the ranking to the user space fix this problem 'cause you may
> > use float to order PIDs with different indexes. The good side effect is that we
> > allow better ways of choosing the culprit by means of diffrent calculations to
> > meet different patterns of memory consumtion.
>
> I'm running Andrea's and my combined fixes on a couple of embedded and
> desktop boxes and it has proven to be a proper in kernel solution for
> the in kernel problem.
>
> I don't argue againts the ability to provide a culprit list to the
> kernel, but as I said before it only can be a optional addon to a proper
> in kernel solution.
Back to your list of problems: we tackled just one, the selection
problem. For sure we will investigate your solution and look for a
faster way to enhance ours with your approach. Who knows, the other way
around might work as well?
Thank you again
br
Edjard
>
> tglx
>
--
"In a world without fences ... who needs Gates?"
Andrea Arcangeli <[email protected]> wrote:
>
> On Tue, Jan 11, 2005 at 01:08:27AM -0800, Andrew Morton wrote:
> > I have the original versions of these saved away but they generate a ton of
> > rejects now. When you sync them up to Linus's current tree could you pleae
> > resend them all?
>
> Can I trust the kernel CVS to be uptodate?
I don't know - I've never used it. Others might know.
http://www.kernel.org/pub/linux/kernel/v2.5/testing/cset/ is always
up to date. The problematic patches were merged three days ago though.
> I normally use it for such
> things but I'd prefer to be sure that I can trust it before risking
> wasting time (it has been unstable recently and I so I was working with
> patches in the meantime).
No huge rush - any time in the next week or two would suit. I'd expect
-rc1 within the next week anyway.
Thanks.
On Tue, 2005-01-11 at 11:20 +0200, Edjard Souza Mota wrote:
> > You are definitely curing the symptom instead of the cause.
> >
> > > 1) ranking for the most likely culprits only starts when memory consumption
> > > gets close to the red zone (for example 98% or something like that).
We do the ranking only in the oom situation, so what's your point ?
> > > 2) killing just gets the first candidate from the list and kills it.
> > > No need to calculate
> > > at kernel level.
So I need a userspace change in order to solve a kernel problem ?
> > What is the default behaviour when no userspace settings are available -
> > Nothing ? Are you really expecting that we change every root fs in order
> > to be able to upgrade the kernel for solving this _kernel_ problem ?
>
> No, I certainly don't. But, have seen the application we also posted? It is
> a test for while, that actually starts a deamon when you boot the kernel
> and does rate this application, i.e. an application with root rating priority
> so it will never be killed and never lack space for itself.
> So, the answer to your 2nd very good point.
You did not answer my question at all. I do not want to update my rootfs
to solve a problem which exists in the kernel and must be solved in the
kernel.
tglx
On Tue, Jan 11, 2005 at 10:30:46AM +0100, Thomas Gleixner wrote:
> > > > 2) killing just gets the first candidate from the list and kills it.
> > > > No need to calculate
> > > > at kernel level.
>
> So I need a userspace change in order to solve a kernel problem ?
Allowing userspace to tune is a great idea. However, we cannot invoke
userland at oom-time to make the decision, or it would be deadlock prone
(userland may be swapped out, or it might require minor allocations of
memory; if we were to allow userspace to make the decision, it would
have to be mlockall'd userland not allowed to do syscalls, and even then
it could mess up its stack or signal handlers). So the safe thing to do
is to assign different ratings to different userspace tasks. Of course
these are inherited by the children. That is a reasonable approach IMHO.
Kurt wrote that patch; I only ported it to a more recent codebase.
This way you can rate your important services and the not important
ones.
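
For instance, a service wrapper could lower its own rating at startup,
roughly like this (a sketch; the exact oom_adj value range in Kurt's
patch is an assumption here):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/self/oom_adj", "w");

	if (!f) {
		perror("oom_adj");
		return 1;
	}
	/* negative values make this task less likely to be chosen;
	 * the setting is inherited by children across fork/exec */
	fprintf(f, "%d\n", -16);
	fclose(f);
	/* exec() the real service here */
	return 0;
}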
Anyway, as you mentioned in an earlier email, there were more
fundamental problems than the selection algorithm; the userspace rating
was the lowest one in the priority list.
Hi,
> > > > 1) ranking for the most likely culprits only starts when memory consumption
> > > > gets close to the red zone (for example 98% or something like that).
>
> We do the ranking only in the oom situation, so what's your point ?
The point is that the kernel doesn't need to keep watching the memory
space every time a process needs memory - only when memory is close to
the red zone.
>
> > > > 2) killing just gets the first candidate from the list and kills it.
> > > > No need to calculate
> > > > at kernel level.
>
> So I need a userspace change in order to solve a kernel problem ?
You could see it another way: release the kernel from calculating the
rating, so that better approaches for choosing the culprit can be
proposed and tested without messing too much with the code.
>
> > > What is the default behaviour when no userspace settings are available -
> > > Nothing ? Are you really expecting that we change every root fs in order
> > > to be able to upgrade the kernel for solving this _kernel_ problem ?
> >
> > No, I certainly don't. But, have seen the application we also posted? It is
> > a test for while, that actually starts a deamon when you boot the kernel
> > and does rate this application, i.e. an application with root rating priority
> > so it will never be killed and never lack space for itself.
> > So, the answer to your 2nd very good point.
>
> You did not answer my question at all. I do not want to update my rootfs
> to solve a problem which exists in the kernel and must be solved in the
> kernel.
If you stick to this thought there is more to say; but we would rather
send test results soon so that you all can evaluate them.
br
Edjard
--
"In a world without fences ... who needs Gates?"
Hi,
> Allowing userspace to tune is a great idea. However we cannot invoke
> userland at oom-time to make the decision, or it would be deadlock prone
> (userland may be swapped out or it might require minor allocations of
> memory, if we were to allow userspace to do the decision it would be
> required to be a mlockall userland and not allowed to do syscalls, and
> even then it could mess up with the stack or signal handlers).
Hmm, no, that is not the case. The daemon application would start at
boot. It only keeps the list of candidates whenever you're getting
close to the red zone. There is no deadlock.

The daemon is started in user space and does only the calculation. It
doesn't take the decision at all. It is the OOM killer at kernel level
that gets the list and chooses whom to shoot dead.
> So the safe thing to do is to assign different ratings to different
> userspace tasks. Of course these are inherited by the children. That is a
> reasonable approach IMHO. Kurt wrote that patch, I only ported it to a
> more recent codebase.
Could be. Interesting idea. We shall keep thinking about it. Have you done
some experiment like that?
>
> This way you can rate your important services and the unimportant ones.
>
> Anyway, as you've mentioned in an earlier email, there were more
> fundamental problems than the selection algorithm; the userspace rating
> was the lowest one on the priority list.
>
Yes, agreed. Our point was just to reorganize the current OOM killer to
release the kernel from doing the rating, which is not its task anyway.
br
Edjard
--
"In a world without fences ... who needs Gates?"
On Tue, 2005-01-11 at 12:00 +0200, Edjard Souza Mota wrote:
> Hi,
>
> > > > > 1) ranking for the most likely culprits only starts when memory consumption
> > > > > gets close to the red zone (for example 98% or something like that).
> >
> > We do the ranking only in the oom situation, so what's your point ?
>
> The point is that the kernel doesn't need to keep watching the memory space
> every time a process needs memory. Only when memory gets close to this red zone.
Oh, I see. The mechanism which is doing memory management must not be
aware of the resources which it is managing ? Am I missing a point ?
tglx
On Tue, 2005-01-11 at 12:05 +0200, Edjard Souza Mota wrote:
> Yes, agreed. Our point was just to reorganize the current OOM killer to
> release the kernel from doing the rating, which is not its task anyway.
It is a kernel task and will always be a kernel task. The kernel manages
memory resources and is therefore the place responsible for solving
the oom situation.
The userland daemon or whatever can only be an add-on to give a hint for
the final decision.
tglx
On Tue, Jan 11, 2005 at 12:05:40PM +0200, Edjard Souza Mota wrote:
> The daemon just sits in user space and does only the calculation. It doesn't
> make the decision at all. It is the OOM killer at kernel level that gets the
> list and chooses whom to shoot dead.
Then this is exactly what the oomkilladj patch from Kurt is doing. You
tune it with this:
andrea@dualathlon:~> cat /proc/self/oom_adj
0
andrea@dualathlon:~> cat /proc/self/oom_score
627
andrea@dualathlon:~>
(the second one is the score)
With this script I can tell exactly which is going to be the next killed
task if the box were to run oom:
ls /proc/*/oom_score| grep -v self | sed 's/\(.*\)\/\(.*\)/echo -n "\1 "; cat \1\/\2/'|sh | sort -nr +1| head -n 1
In this case it would be pid 4175:
/proc/4175 32923
andrea@dualathlon:~> ps 4175
PID TTY STAT TIME COMMAND
4175 ? Ss 0:03 kdeinit: Running...
andrea@dualathlon:~>
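The same pipeline unrolled for readability (a sketch: the /proc/[0-9]* glob
replaces the grep -v self, and sort -k2 -nr is the modern spelling of the
old positional sort -nr +1 syntax):

    for f in /proc/[0-9]*/oom_score; do
        echo "$(dirname $f) $(cat $f)"
    done | sort -k2 -nr | head -n 1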
> Could be. Interesting idea. We shall keep thinking about it. Have you done
> some experiment like that?
We ship it in production, and it has worked so far. Though I don't know if
it's as flexible as you need. For sure it's not going to make the oom killer
worse to have some way of tuning it ;).
If you've a better API we can discuss it; the above was quite
non-intrusive, it's simple and it does the trick, so I don't dislike it.
Anyway as said in the other email, before discussing this stuff we
should get the rest fixed. There were more serious problems than the
task selection algorithm.
> Yes, agreed. Our point was just to reorganize the current OOM killer to
> release the kernel from doing the rating, which is not its task anyway.
I believe the kernel can have a heuristic that gets 99% of cases right.
But for sure the kernel *can't* always get it right, since only the
admin knows the semantics and the importance of the stuff being
computed, and the oomkilladj is there exactly to let the kernel learn
about it too. The kernel has no clue that it's going to kill the
database to leave a buggy videogame running; for the kernel all
processes are equally important unless it's being tuned by userspace
somehow.
The only thing the kernel can do is to take the best decision that will
prevent more oom killing in the future. The kernel should exclude from
the untuned selection all tasks that, even if they were killed, would
leave the box running out of memory again. So the one task that is
allocating memory at the fastest rate is normally the best one to kill.
The current selection algorithm however is not taking the allocation
rate into account at all, and in turn I believe the current oom killer
is quite far from the ideal oom killer. But this is a different topic;
it has nothing to do with the current patches, nor with the userland
tuning.
On Tue, Jan 11, 2005 at 01:35:47AM +0100, Thomas Gleixner wrote:
> On Mon, 2005-01-10 at 18:05 -0200, Marcelo Tosatti wrote:
> > The feature is interesting - several similar patches with similar
> > functionality have been around (people who need it usually write their own;
> > I've seen a few), but none has ever been merged, even though it is an
> > important requirement for many users.
>
> It's not a requirement for users. The current implementation in the
> kernel is just broken, ugly code.
>
> > This is simple, an ordered list of candidate PIDs. IMO something similar to this
> > should be merged. Andrew ?
>
> I have no objections against the userspace provided candidate list
> option, but as long as the main sources of trouble
>
> - invocation
> - reentrancy
> - timed, counted, blah ugly protection
> - selection problem
>
> are not fixed properly, we don't need to discuss the inclusion of a
> userspace provided candidate list.
>
> Postpone this until the main problem is fixed. There is a proper
> confirmed fix for this available. It was posted more than once.
Agreed - haven't you and Andrea fixed those recently ?
> Merging a fix which helps only 0.001% of the users to hide the mess
> instead of fixing the real problem is a really interesting engineering
> approach.
>
> I don't deny that, after the source of trouble is fixed, it is worth
> thinking about merging this add-on to allow interested users to
> define the culprits instead of relying on an always imperfect selection
> algorithm.
Yep.
On Mon, Jan 10, 2005 at 07:24:35PM -0400, Mauricio Lin wrote:
> On Mon, 10 Jan 2005 18:05:14 -0200, Marcelo Tosatti
> <[email protected]> wrote:
> > On Tue, Jan 11, 2005 at 12:40:24AM +0200, Edjard Souza Mota wrote:
> > > Hi,
> > >
> > > I guess the idea was not fully and well explained. It is not the OOM Killer
> > > itself that was moved to user space but rather its ranking algorithm.
> > > Ranking is not a functionality specific to kernel space. The kernel only
> > > needs to know which process should be killed.
> > >
> > > In that sense the approach is different and might be worth testing, mainly for
> > > cases where we want to allow better ranking policies. For example, an
> > > embedded device with few resources and important different running applications:
> > > which one is the best? To my understanding the current ranking policy
> > > does not necessarily choose the best one to be killed.
> >
> > Sorry, I misunderstood. Should have read the code before shouting.
> >
> > The feature is interesting - several similar patches with similar
> > functionality have been around (people who need it usually write their own;
> > I've seen a few), but none has ever been merged, even though it is an
> > important requirement for many users.
> >
> > This is simple, an ordered list of candidate PIDs. IMO something similar to this
> > should be merged. Andrew ?
> >
> > A few comments about the code:
> >
> > retry:
> > - p = select_bad_process();
> > + printk(KERN_DEBUG "A good walker leaves no tracks.\n");
> > + p = select_process();
> >
> > You want to fall back to select_bad_process() if no candidate has been
> > selected at select_process().
> The idea is to turn off select_bad_process(); the new
> select_process() will get the list of pids to be killed from
> /proc/oom. But the ranking algorithm is the same, I mean the Rik
> van Riel algorithm. Do you think it is worthwhile to maintain
> select_bad_process() (the kernel space algorithm) if we have the
> select_process() function?
Yes, if select_process() fails (in case no process is on the candidate list),
falling back to select_bad_process() is important, I think.
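A minimal sketch of that fallback inside out_of_memory() (assuming
select_process() returns NULL when the candidate list is empty):

    retry:
            p = select_process();             /* userspace-provided candidates */
            if (!p)
                    p = select_bad_process(); /* fall back to in-kernel ranking */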
> >
> > You also want to move "oom" to /proc/sys/vm/.
>
> That is possible. Do you think it is a good place to move the oom file to?
Yep.
On Tue, Jan 11, 2005 at 09:58:03AM +0100, Andrea Arcangeli wrote:
> On Tue, Jan 11, 2005 at 09:44:53AM +0100, Thomas Gleixner wrote:
> > I consider the invocation of out_of_memory to be the problem in the first
> > place. This is the real root of the problems. The ranking is a different
> > playground. Your solution does not solve
> > - invocation madness
> > - reentrancy protection
> > - the ugly mess of timers, counters... in out_of_memory, which aren't
> > necessary at all
>
> Thomas, you're obviously right, it's not even worth discussing this.
> The 6 patches I posted (and my version is the only one that includes all
> the outstanding fixes) have to be applied. Then we can think about the
> rest.
>
> Rik's two patches (writeback-highmem and writeback_nr_scanned) should be
> applied too since they're obviously right too (and they're completely
> orthogonal to our 6). Rik's 2/2 looked more like a hack and it
> shouldn't be applied.
These patchsets should be in -mm by now? :)
On Tue, 2005-01-11 at 05:42 -0200, Marcelo Tosatti wrote:
> > are not fixed properly, we don't need to discuss the inclusion of a
> > userspace provided candidate list.
> >
> > Postpone this until the main problem is fixed. There is a proper
> > confirmed fix for this available. It was posted more than once.
>
> Agreed - haven't you and Andrea fixed those recently ?
Yep. The fixes have been around for quite a while, and Andrea is bringing
them up to the current kernel, if I understood one of his previous mails
correctly.
tglx
On Tue, Jan 11, 2005 at 11:51:38AM +0100, Thomas Gleixner wrote:
> Yep. The fixes have been around for quite a while, and Andrea is bringing
> them up to the current kernel, if I understood one of his previous mails
> correctly.
Yes, it's in my queue (unfortunately I've a few bits to do more urgently
but Andrew said it's not in a huge rush ;).
Hi,
> > The daemon just sits in user space and does only the calculation. It doesn't
> > make the decision at all. It is the OOM killer at kernel level that gets the
> > list and chooses whom to shoot dead.
>
> Then this is exactly what the oomkilladj patch from Kurt is doing. You
> tune it with this:
>
> andrea@dualathlon:~> cat /proc/self/oom_adj
> 0
> andrea@dualathlon:~> cat /proc/self/oom_score
> 627
> andrea@dualathlon:~>
>
> (the second one is the score)
>
> With this script I can tell exactly which is going to be the next killed
> task if the box were to run oom:
>
> ls /proc/*/oom_score| grep -v self | sed 's/\(.*\)\/\(.*\)/echo -n "\1 "; cat \1\/\2/'|sh | sort -nr +1| head -n 1
>
> In this case it would be pid 4175:
>
> /proc/4175 32923
> andrea@dualathlon:~> ps 4175
> PID TTY STAT TIME COMMAND
> 4175 ? Ss 0:03 kdeinit: Running...
> andrea@dualathlon:~>
>
> > Could be. Interesting idea. We shall keep thinking about it. Have you done
> > some experiment like that?
>
> We ship it in production, and it has worked so far. Though I don't know if
> it's as flexible as you need. For sure it's not going to make the oom killer
> worse to have some way of tuning it ;).
Yeah, that's what we believe, and this very tiny contribution is, as far
as I can see, orthogonal to your work; we are willing to complement it
with more experiments.
> If you've a better API we can discuss it; the above was quite
> non-intrusive, it's simple and it does the trick, so I don't dislike it.
Ok. We already started checking that and we will give you feedback soon.
> Anyway as said in the other email, before discussing this stuff we
> should get the rest fixed. There were more serious problems than the
> task selection algorithm.
Yes, indeed. I agree with you and the other guys in this respect. It is
just that sometimes small stones inside our shoes cause greater damage in
the long run than the big ones we've just faced. All the other problems
are like the big ones, and we took the path of removing the small ones
first.
>
> > Yes, agreed. Our point was just to reorganize the current OOM killer to
> > release the kernel from doing the rating, which is not its task anyway.
>
> I believe the kernel can have a heuristic that gets 99% of cases right.
> But for sure the kernel *can't* always get it right, since only the
> admin knows the semantics and the importance of the stuff being
> computed, and the oomkilladj is there exactly to let the kernel learn
> about it too. The kernel has no clue that it's going to kill the
> database to leave a buggy videogame running; for the kernel all
> processes are equally important unless it's being tuned by userspace
> somehow.
Ok, I realise this may look too philosophical to other folks on this list,
but just for the sake of reality: Linux is moving into mobile devices
nowadays. No one denies that. What for? There are certainly better answers
than mine, but we hope to allow the freedom of choosing the applications
you want to have in the palm of your hand, at affordable prices and with
fair competition. This is not new, sorry about that.
Taking this into account, who holds the semantics of the importance of
the running applications but the user of a mobile device? Is it the
kernel? Who is the admin?
> The only thing the kernel can do is to take the best decision that will
> prevent more oom killing in the future. The kernel should exclude from
> the untuned selection all tasks that, even if they were killed, would
> leave the box running out of memory again.
For sure there are certain levels of security that operators and
manufacturers want to have, and they warn us about what we can and can't
do. Just like with easy-to-use consumer electronics products. Again,
nothing new.
> So the one task that is allocating memory at the fastest rate is normally
> the best one to kill. The current selection algorithm however is not taking
> the allocation rate into account at all, and in turn I believe the current
> oom killer is quite far from the ideal oom killer. But this is a different
> topic; it has nothing to do with the current patches, nor with the userland
> tuning.
Yes, you're absolutely right. The userland tuning is just the part that
will allow, let's say, a user profile-based ranking policy, and this may
have an impact on such patterns of memory consumption, or allocation rate
if you like.
br
Edjard
--
"In a world without fences ... who needs Gates?"
Hi folks,
I believe the OOM killer has been a matter of frequent discussion for some
time now. The initial implementation was considered good enough by
some, ugly and inefficient in selecting the right task to kill by
others, and downright unnecessary by yet more Linux users. I belong
somewhere in between the first and the second group.
I personally believe that heuristics never work 100% of the time, even
when carefully thought out, as Andrea conceded. Nevertheless the
kernel needs some guidance upon which to base a decision. This
guidance can be a default, or a user-based solution using whatever
possible algorithm to do the ranking. One thing is for sure: the
kernel should be the one to do the kill ...
I believe that we need to test some more and see what should be the
default ranking approach. Certainly Andrea's implementation looks
promising, but as a user I would like the option, if I feel capable, of
implementing my own ranking algorithm, in a way that I control myself.
my 2 cents
--
Ilias Biris
On Maw, 2005-01-11 at 08:44, Thomas Gleixner wrote:
> I consider the invocation of out_of_memory to be the problem in the first
> place. This is the real root of the problems. The ranking is a different
> playground. Your solution does not solve
> - invocation madness
> - reentrancy protection
> - the ugly mess of timers, counters... in out_of_memory, which aren't
> necessary at all
>
> This must be solved first in a proper way, before we talk about ranking.
echo "2" >/proc/sys/vm/overcommit_memory
End of problem (except for extreme cases) and with current 2.6.10-bk
(and -ac because I pulled the patch back into -ac) also for most extreme
cases as Andries pre-reserves the stack address spaces.
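For reference, the three overcommit modes behind that sysctl (as documented
in the kernel's overcommit accounting notes):

    # 0: heuristic overcommit (the default)
    # 1: always overcommit, never check
    # 2: strict accounting; commit limit = swap + overcommit_ratio% of RAM
    echo 2 > /proc/sys/vm/overcommit_memory
    cat /proc/sys/vm/overcommit_ratio    # defaults to 50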
Hi
where I come from we say (jokingly of course) 'got a headache? chop
your own head ... end of problem'.
Though your system is not guaranteed to become more stable. When you
forbid overcommitting memory, all you do is make failure occur for ALL
processes at a different time. A process is happily doing something
useful when all of a sudden its fork may die due to 'out of memory'
... Moreover shutting down overcommit will do that for all processes,
not just the one culprit that could be chopped off by oom...
Maybe it is just me, but I think a system works more reliably with
overcommitting :-)
--
Ilias Biris
On Tue, 2005-01-11 at 16:32 +0000, Alan Cox wrote:
> On Maw, 2005-01-11 at 08:44, Thomas Gleixner wrote:
> > I consider the invocation of out_of_memory to be the problem in the first
> > place. This is the real root of the problems. The ranking is a different
> > playground. Your solution does not solve
> > - invocation madness
> > - reentrancy protection
> > - the ugly mess of timers, counters... in out_of_memory, which aren't
> > necessary at all
> >
> > This must be solved first in a proper way, before we talk about ranking.
>
> echo "2" >/proc/sys/vm/overcommit_memory
>
> End of problem (except for extreme cases) and with current 2.6.10-bk
> (and -ac because I pulled the patch back into -ac) also for most extreme
> cases as Andries pre-reserves the stack address spaces.
Maybe for you it's end of problem, but I can still reproduce the weird
behaviour with 2.6.10-bk with the tests I posted before. I don't buy end
of problem proclamations as long as I can prove the contrary.
The only working patch so far is Andrea's fix of the invocation, which
also solves the reentrancy problem and gets rid of the timer/counter hack,
plus my small contribution to the selection algorithm.
tglx
Well, looking into Alan's email again, I think I answered thinking of
the wrong side :-), assuming the suggestion was to switch off OOM
altogether and be done with all the discussion... tsk tsk tsk, too
defensive and hasty I guess :-)
Thinking of it another way, Alan's email could have the dimension of
switching off overcommitment (and thus OOM) whilst in the user-space
ranking stage, to avoid reentrancy and invocation of oom again and
again before killing something. It also solves the issue of using
timed/counted resources, which is plain ugly and evil. It would though
be necessary to switch OOM back on when the OOMK has finally done the
kill.
Did I get it right this time Alan?
--
Ilias Biris
On Tue, 2005-01-11 at 16:46 -0400, Ilias Biris wrote:
> Well, looking into Alan's email again, I think I answered thinking of
> the wrong side :-), assuming the suggestion was to switch off OOM
> altogether and be done with all the discussion... tsk tsk tsk, too
> defensive and hasty I guess :-)
>
> Thinking of it another way, Alan's email could have the dimension of
> switching off overcommitment (and thus OOM) whilst in the user-space
> ranking stage, to avoid reentrancy and invocation of oom again and
> again before killing something. It also solves the issue of using
> timed/counted resources, which is plain ugly and evil. It would though
> be necessary to switch OOM back on when the OOMK has finally done the
> kill.
>
> Did I get it right this time Alan?
I don't get it at all.
Fixes for the wrong invocation, reentrancy avoidance, and removal of the
ugly and evil timer/counter hacks have been in the wild for more than 6
weeks. They solve the problem without any userspace interaction.
The userspace-provided preferable victim list is an improvement of the
generic, and therefore imperfect, selection mechanism and nothing
else.
tglx
On Tuesday 11 January 2005 21:16, Ilias Biris wrote:
> Hi
>
> where I come from we say (jokingly of course) 'got a headache? chop
> your own head ... end of problem'.
>
> Though your system is not guaranteed to become more stable. When you
> forbid overcommitting memory, all you do is make failure occur for ALL
> processes at a different time. A process is happily doing something
> useful when all of a sudden its fork may die due to 'out of memory'
An application which does not check fork, malloc, etc. for
success is buggy.
--
vda
Hi,
On Tue, 11 Jan 2005 21:57:49 +0100, Thomas Gleixner <[email protected]> wrote:
> On Tue, 2005-01-11 at 16:46 -0400, Ilias Biris wrote:
> > Well, looking into Alan's email again, I think I answered thinking of
> > the wrong side :-), assuming the suggestion was to switch off OOM
> > altogether and be done with all the discussion... tsk tsk tsk, too
> > defensive and hasty I guess :-)
> >
> > Thinking of it another way, Alan's email could have the dimension of
> > switching off overcommitment (and thus OOM) whilst in the user-space
> > ranking stage, to avoid reentrancy and invocation of oom again and
> > again before killing something. It also solves the issue of using
> > timed/counted resources, which is plain ugly and evil. It would though
> > be necessary to switch OOM back on when the OOMK has finally done the
> > kill.
> >
> > Did I get it right this time Alan?
>
> I don't get it at all.
>
> Fixes for the wrong invocation, reentrancy avoidance, and removal of the
> ugly and evil timer/counter hacks have been in the wild for more than 6
> weeks. They solve the problem without any userspace interaction.
>
> The userspace-provided preferable victim list is an improvement of the
> generic, and therefore imperfect, selection mechanism and nothing
> else.
Once again: the user space ranking of PIDs for OOM killer purposes didn't
propose a new selection mechanism. This misinterpretation is misleading the
discussion of whether it may have some use in embedded devices or not.
Even so, I believe it has great potential for PC environments too,
when you consider that admins don't stay in front of a computer all the time.
Once in a while they also have a rest :). In these cases, instead of simply
starting the ranking, a daemon could dispatch a message to her/him saying
the system is approaching a red zone.
br,
Edjard
--
"In a world without fences ... who needs Gates?"
On Wed, 2005-01-12 at 11:31 +0200, Edjard Souza Mota wrote:
> propose a new selection mechanism. This misinterpretation is misleading the
> discussion of whether it may have some use in embedded devices or not.
> Even so, I believe it has great potential for PC environments too,
> when you consider that admins don't stay in front of a computer all the time.
> Once in a while they also have a rest :). In these cases, instead of simply
> starting the ranking, a daemon could dispatch a message to her/him saying
> the system is approaching a red zone.
Pretty intriguing. Maybe the daemon then waits for the answer mail,
where the admin confirms selecting the daemon as the preferable victim
of oom.
tglx
On Wed, 12 Jan 2005 12:19:23 +0100, Thomas Gleixner <[email protected]> wrote:
> On Wed, 2005-01-12 at 11:31 +0200, Edjard Souza Mota wrote:
> > propose a new selection mechanism. This misinterpretation is misleading the
> > discussion of whether it may have some use in embedded devices or not.
> > Even so, I believe it has great potential for PC environments too,
> > when you consider that admins don't stay in front of a computer all the time.
> > Once in a while they also have a rest :). In these cases, instead of simply
> > starting the ranking, a daemon could dispatch a message to her/him saying
> > the system is approaching a red zone.
>
> Pretty intriguing. Maybe the daemon then waits for the answer mail,
> where the admin confirms selecting the daemon as the preferable victim
> of oom.
>
> tglx
Hi,
Not worth commenting on this "intriguing" idea.
Let's wait for test results and Alan's answers to Ilias's interpretation.
br
Edjard
--
"In a world without fences ... who needs Gates?"
On Maw, 2005-01-11 at 20:46, Ilias Biris wrote:
> Well, looking into Alan's email again, I think I answered thinking of
> the wrong side :-), assuming the suggestion was to switch off OOM
> altogether and be done with all the discussion... tsk tsk tsk, too
> defensive and hasty I guess :-)
That's what mode 2 is all about. There are some problems with over-early
triggering of OOM that Andrea fixed that are still relevant (or stick
"never OOM if mode == 2" into your kernel).
> Did I get it right this time Alan?
Basically yes - the real problem with the OOM situation is that there is
no correct answer. People have spent years screwing around with the OOM
killer selection logic, and while you can easily make it pick large tasks
or old tasks or growing tasks, nobody has a good heuristic about what
should die, because it depends on the user's wishes. OOM requires
AF_TELEPATHY sockets and we don't have them.
For most users simply not allowing the mess to occur solves the problem
- not all but most.
Hi,
Thanks Alan...
> > Well, looking into Alan's email again, I think I answered thinking of
> > the wrong side :-), assuming the suggestion was to switch off OOM
> > altogether and be done with all the discussion... tsk tsk tsk, too
> > defensive and hasty I guess :-)
>
> That's what mode 2 is all about. There are some problems with over-early
> triggering of OOM that Andrea fixed that are still relevant (or stick
> "never OOM if mode == 2" into your kernel).
>
> > Did I get it right this time Alan?
>
> Basically yes - the real problem with the OOM situation is that there is
> no correct answer. People have spent years screwing around with the OOM
> killer selection logic, and while you can easily make it pick large tasks
> or old tasks or growing tasks, nobody has a good heuristic about what
> should die, because it depends on the user's wishes. OOM requires
> AF_TELEPATHY sockets and we don't have them.
>
>
> For most users simply not allowing the mess to occur solves the problem
> - not all but most.
>
What do you think about the point we are trying to make, i.e., moving the
ranking of PIDs to be killed to user space? Or, making the user have some
influence on it? We were misunderstood because the patch we sent was to make
"a slight" reorganization of the way the OOM killer computes ratings for PIDs,
not to change its selection logic. But now we can discuss (I mean implement)
alternative selection logics without messing with the code at kernel space.
The parameters and the criteria for combining them can be opened up for more
people to test according to platform and, if not per user, at least according
to the application's memory consumption pattern.
Well, while the AF_TELEPATHY socket is not on its way :) ... we may at
least experiment with different ranking policies.
br
Edjard
--
"In a world without fences ... who needs Gates?"
Edjard Souza Mota wrote:
> What do you think about the point we are trying to make, i.e., moving the
> ranking of PIDs to be killed to user space?
If my system needs the OOM killer, it's usually unresponsive to most
userspace applications. A normal daemon would be swapped out before the
runaway dhcpd grows larger than the web cache. It would have to be a mlocked
RT task started from early userspace. It would be difficult to set up (unless
you upgrade your distro), and almost nobody will feel like tweaking it to
take the benefit (OOM == -ECANNOTHAPPEN).
What about creating a linked list of (stackable) algorithms which can be
extended by loading modules and resorted using {proc,sys}fs? It would avoid
the extra process and the extra CPU time (and task switches) to frequently
update the list, and I think it would decrease the typical amount of used
memory, too.
On Sul, 2005-01-16 at 10:06, Edjard Souza Mota wrote:
> What do you think about the point we are trying to make, i.e., moving the
> ranking of PIDs to be killed to user space? Or, making the user have some
> influence on it? We were misunderstood because the patch we sent was to make
> "a slight" reorganization of the way the OOM killer computes ratings for PIDs,
> not to change its
I'm sceptical there is an answer, but moving it to user space (or at least
implementing /proc tunables in user space to experiment) certainly seems
to be the right way to find out.
> Well, while the AF_TELEPATHY socket is not on its way :) ... we may at
> least experiment with different ranking policies.
agreed
On Sun, 2005-01-16 at 21:10 +0000, Alan Cox wrote:
> On Sul, 2005-01-16 at 10:06, Edjard Souza Mota wrote:
> > What do you think about the point we are trying to make, i.e., moving the
> > ranking of PIDs to be killed to user space? Or, making the user have some
> > influence on it? We were misunderstood because the patch we sent was to make
> > "a slight" reorganization of the way the OOM killer computes ratings for PIDs,
> > not to change its
>
> I'm sceptical there is an answer, but moving it to user space (or at least
> implementing /proc tunables in user space to experiment) certainly seems
> to be the right way to find out.
No objections against a userspace tuning mechanism, but I still doubt
that completely replacing the always imperfect in-kernel selection is
feasible.
tglx
Hi,
> If my system needs the OOM killer, it's usually unresponsive to most
> userspace applications. A normal daemon would be swapped out before the
> runaway dhcpd grows larger than the web cache. It would have to be a mlocked
> RT task started from early userspace. It would be difficult to set up (unless
> you upgrade your distro), and almost nobody will feel like tweaking it to
> take the benefit (OOM == -ECANNOTHAPPEN).
Please correct me if I got it wrong: as the daemon in this case is not a
normal one, since for its own safety it never gets rated, then it needs an
RT lock whenever the system boots.
> What about creating a linked list of (stackable) algorithms which can be
> extended by loading modules and resorted using {proc,sys}fs? It would avoid
> the extra process and the extra CPU time (and task switches) to frequently
> update the list, and I think it would decrease the typical amount of used
> memory, too.
Wouldn't this bring the (set of) ranking algorithm(s) back to the kernel? This
is exactly what we're trying to avoid. The way we see the potential for doing
this is that the kernel shouldn't worry about the user's decision on which
process to kill but rather take her/his option into account. The computation
of such a decision could be done at user space (protected as you suggested
above).
We'll think about it, although I'm not sure there would be such a decrease
in memory consumption.
br
Edjard
--
"In a world without fences ... who needs Gates?"
On Tue, 18 Jan 2005, Edjard Souza Mota wrote:
> > If my system needs the OOM killer, it's usually unresponsive to most
> > userspace applications. A normal daemon would be swapped out before the
> > runaway dhcpd grows larger than the web cache. It would have to be a mlocked
> > RT task started from early userspace. It would be difficult to set up (unless
> > you upgrade your distro), and almost nobody will feel like tweaking it to
> > take the benefit (OOM == -ECANNOTHAPPEN).
>
> Please correct me if I got it wrong: as the daemon in this case is not a
> normal one, since for its own safety it never gets rated,
That's its own task: it must make sure not to commit suicide. I forgot
about that.
> then it needs an
> RT lock whenever the system boots.
It may not be blocked by a random RT task iff that RT task is supposed to
be OOM-killed. Therefore it *MUST* run at the highest priority and be
locked into RAM.
It *SHOULD* be run at boot time, too, just in case it's needed early.
> > What about creating a linked list of (stackable) algorithms which can be
> > extended by loading modules and resorted using {proc,sys}fs? It would avoid
> > the extra process and the extra CPU time (and task switches) to frequently
> > update the list, and I think it would decrease the typical amount of used
> > memory, too.
>
> Wouldn't this bring the (set of) ranking algorithm(s) back to the kernel? This
> is exactly what we're trying to avoid.
You're trying to avoid it in order to let admins try other ranking
algorithms (at least that's what I read). The module approach seems to be
flexible enough to do that, and it avoids the mentioned issues. If you
really want a userspace daemon, it can be controlled by a module.-)
I'm thinking of something like this:
[X] support stacking of OOM killer ranking algorithms
[X] Task blessing OOM filter
[X] Userspace OOM ranking daemon
[X] Default OOM killer ranking
-vs-
[ ] support stacking of OOM killer ranking algorithms
( ) Userspace OOM ranking daemon
(o) Default OOM killer ranking
--
Exceptions prove the rule, and destroy the battle plan.
> > > If my system needs the OOM killer, it's usually unresponsive to most
> > > userspace applications. A normal daemon would be swapped out before the
> > > runaway dhcpd grows larger than the web cache. It would have to be a mlocked
> > > RT task started from early userspace. It would be difficult to set up (unless
> > > you upgrade your distro), and almost nobody will feel like tweaking it to
> > > take the benefit (OOM == -ECANNOTHAPPEN).
> >
> > Please correct me if I got it wrong: as the daemon in this case is not a
> > normal one, since for its own safety it never gets rated,
>
> That's its own task: it must make sure not to commit suicide. I forgot
> about that.
Ok.
> > then it needs an
> > RT lock whenever the system boots.
>
> It may not be blocked by a random RT task iff that RT task is supposed to
> be OOM-killed. Therefore it *MUST* run at the highest priority and be
> locked into RAM.
>
> It *SHOULD* be run at boot time, too, just in case it's needed early.
Yes. That's the idea of the application we posted to test the oom killer
ranking at user space. At least, we are working on putting it at boot time,
and these suggestions are very helpful.
> > > What about creating a linked list of (stackable) algorithms which can be
> > > extended by loading modules and resorted using {proc,sys}fs? It would avoid
> > > the extra process and the extra CPU time (and task switches) to frequently
> > > update the list, and I think it would decrease the typical amount of used
> > > memory, too.
> >
> > Wouldn't this bring the (set of) ranking algorithm(s) back to the kernel? This
> > is exactly what we're trying to avoid.
>
> You're trying to avoid it in order to let admins try other ranking
> algorithms (at least that's what I read). The module approach seems to be
> flexible enough to do that, and it avoids the mentioned issues. If you
> really want a userspace daemon, it can be controlled by a module.-)
Yes, your reading is correct, but this choice should take into account the
"patterns" of how memory is allocated for the user's most-used applications.
Why? The closer the ranking gets to "The Best Choice", the longer it will
take to invoke the oom killer again.
I am wondering how a module could control a user space daemon if it hasn't
started yet? I mean, processes at user space are supposed to start only
after all modules are loaded (those loadable at boot time). So this user
space daemon would break this standard. But if we manage to have a special
module that takes care of loading this stack of OOM Killer ranking
algorithms, then the daemon would not need to break the default order of
loading modules. The init could be changed to start the daemon, and then
the module would start controlling it. Am I right?
So that's why people are complaining that every distro would have to update
the init and load this new module. Correct?
>
> I'm thinking of something like this:
>
> [X] support stacking of OOM killer ranking algorithms
> [X] Task blessing OOM filter
> [X] Userspace OOM ranking daemon
> [X] Default OOM killer ranking
>
> -vs-
>
> [ ] support stacking of OOM killer ranking algorithms
> ( ) Userspace OOM ranking daemon
> (o) Default OOM killer ranking
>
Very interesting idea. Will take that into account. Thanks a lot.
--
"In a world without fences ... who needs Gates?"
On Thu, 20 Jan 2005, Edjard Souza Mota wrote:
> > > > What about creating a linked list of (stackable) algorithms which can be
> > > > extended by loading modules and resorted using {proc,sys}fs? It would avoid
> > > > the extra process and the extra CPU time (and task switches) to frequently
> > > > update the list, and I think it would decrease the typical amount of used
> > > > memory, too.
> > >
> > > Wouldn't this bring the (set of) ranking algorithm(s) back to the kernel? This
> > > is exactly what we're trying to avoid.
> >
> > You're trying to avoid it in order to let admins try other ranking
> > algorithms (at least that's what I read). The module approach seems to be
> > flexible enough to do that, and it avoids the mentioned issues. If you
> > really want a userspace daemon, it can be controlled by a module.-)
>
> Yes, your reading is correct, but this choice should take into account the
> "patterns" of how memory is allocated for the user's most-used applications.
> Why? The closer the ranking gets to "The Best Choice", the longer it will
> take to invoke the oom killer again.
ACK.
> I am wondering how a module could control a user space daemon if it hasn't
> started yet? I mean, processes at user space are supposed to start only
> after all modules are loaded (those loadable at boot time). So this user
> space daemon would break this standard. But if we manage to have a special
> module that takes care of loading this stack of OOM Killer ranking
> algorithms, then the daemon would not need to break the default order of
> loading modules.
I don't think there needs to be a special order while loading the
modules, since each module will provide a defined interface which can be
registered in a linked list and sorted on demand. Just init all
compiled-in modules and sort using a kernel parameter (remembering
modprobe might be fubar), then modprobe (if compiled in) all missing
decision modules from the list (appending them) and resort again.
If the admin wants to add a module later, he can also change the order
again, possibly after configuring the module. Disabling may be either
done by moving a decision past one without fall-through or by using a
separate list.
There will be a need for a controlling instance which will build a list of
candidates and pass it to each decision module in turn until the victim
is found. Maybe the list will need a field for a ranking offset and a
scaling factor if a module is not supposed to make the final decision but
to modify the ranking for some blessed processes.
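For illustration, a minimal sketch of what such a stackable decision-module
interface might look like (all names here are hypothetical, not from any
posted patch, and locking is reduced to a single spinlock):

    #include <linux/list.h>
    #include <linux/sched.h>
    #include <linux/spinlock.h>

    /* one entry per loaded ranking module */
    struct oom_ranker {
            struct list_head list;
            const char *name;
            /* return a victim, or NULL to fall through to the next ranker */
            struct task_struct *(*rank)(void);
    };

    static LIST_HEAD(oom_rankers);
    static spinlock_t oom_rankers_lock = SPIN_LOCK_UNLOCKED;

    void register_oom_ranker(struct oom_ranker *r)
    {
            spin_lock(&oom_rankers_lock);
            list_add_tail(&r->list, &oom_rankers);
            spin_unlock(&oom_rankers_lock);
    }

    /* the controlling instance: try each ranker in order until one
     * makes the final decision */
    struct task_struct *run_oom_rankers(void)
    {
            struct oom_ranker *r;
            struct task_struct *victim = NULL;

            spin_lock(&oom_rankers_lock);
            list_for_each_entry(r, &oom_rankers, list) {
                    victim = r->rank();
                    if (victim)
                            break;
            }
            spin_unlock(&oom_rankers_lock);
            return victim;
    }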
> The init could be changed to start the daemon, and then the module
> would start controlling it. Am I right?
It can, but it should be run from the (possibly autogenerated)
initr{d,amfs} if it's used.
> So that's why people are complaining that every distro would have to update
> the init and load this new module. Correct?
ACK. (It's just me - for now)
Upgrading kernels used to be a drop-in replacement, except for ISDN and
(for 2.4 -> 2.6) v4l. I like it that way.
--
Top 100 things you don't want the sysadmin to say:
66. What do you mean you needed that directory?
Hi Andrea,
I applied your patch and I am checking your code. It is really very
interesting work. I have a question about the
__set_current_state(TASK_INTERRUPTIBLE) call you put in the out_of_memory
function. Don't you think it would be better to use set_current_state
instead of __set_current_state? AFAIK the set_current_state function is
more suitable for SMP systems, right?
BR,
Mauricio Lin.
On Tue, 11 Jan 2005 09:38:37 +0100, Andrea Arcangeli <[email protected]> wrote:
> On Tue, Jan 11, 2005 at 01:35:47AM +0100, Thomas Gleixner wrote:
> > confirmed fix for this available. It was posted more than once.
>
> I posted 6 patches (1/4,2/4,3/4,4/4,5/4,6/4); they should all be
> applied to mainline, they're self-contained. They add the userspace
> ratings too.
>
> Those patches fix a longstanding PF_MEMDIE race too and they optimize
> used_math as well.
>
> I'm running with all 6 patches applied, with an uptime of 6 days on SMP
> and no problems at all. All 6 patches are applied to the kotd too
> (plus the other bits posted on l-k as well for the write throttling;
> just one bit is still missing but I'll add it soon):
>
> ftp://ftp.suse.com/pub/projects/kernel/kotd/i386/HEAD
>
>
Hi Andrew,
I have another question. You included an oom_adj entry in /proc for
each process. This was the approach you used in order to allow someone
or something to influence the ranking algorithm from userland, right?
So if I have another ranking algorithm in user space, I can use it
to complement the kernel decision as necessary. Was that your idea?
BR,
Mauricio Lin.
On Fri, Jan 21, 2005 at 05:27:11PM -0400, Mauricio Lin wrote:
> Hi Andrea,
>
> I applied your patch and I am checking your code. It is really very
> interesting work. I have a question about the
> __set_current_state(TASK_INTERRUPTIBLE) call you put in the out_of_memory
> function. Don't you think it would be better to use set_current_state
> instead of __set_current_state? AFAIK the set_current_state function is
> more suitable for SMP systems, right?
set_current_state is needed only when you need to place a memory barrier
after __set_current_state. So it's needed in the usual wait_event loop,
right after registering in the waitqueue. Example:
        unsigned long flags;

        wait->flags &= ~WQ_FLAG_EXCLUSIVE;
        spin_lock_irqsave(&q->lock, flags);
        if (list_empty(&wait->task_list))
                __add_wait_queue(q, wait);
        /*
         * don't alter the task state if this is just going to
         * queue an async wait queue callback
         */
        if (is_sync_wait(wait))
                set_current_state(state);
        spin_unlock_irqrestore(&q->lock, flags);
and even in the above it is needed only because spin_unlock has inclusive
semantics on ia64. In 2.4 there was no unlock at all after
set_current_state, and it looked like this:
        set_current_state(TASK_UNINTERRUPTIBLE);        \
        if (condition)                                  \
                break;                                  \
        schedule();                                     \
The rule of thumb is that if there's nothing between set_current_state
and schedule() then __set_current_state is more efficient and equally
safe to use. The oom killer path I posted falls in this category: nothing
in between set_current_state and schedule, so there is no reason to place
memory barriers in there.
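As a concrete sketch of that category (illustrative, not the exact hunk
from the posted patch):

        /* nothing between the state change and the schedule call, so
         * the cheaper non-barrier variant is safe here */
        __set_current_state(TASK_INTERRUPTIBLE);
        schedule_timeout(10 * HZ);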
Hope this helps ;)
On Fri, Jan 21, 2005 at 05:45:13PM -0400, Mauricio Lin wrote:
> Hi Andrew,
>
> I have another question. You included an oom_adj entry in /proc for
> each process. This was the approach you used in order to allow someone
> or something to influence the ranking algorithm from userland, right?
> So if I have another ranking algorithm in user space, I can use it
> to complement the kernel decision as necessary. Was that your idea?
Yes, you should use your userspace algorithm to tune the oom killer via
the oom_adj, and you can check the effect of your changes with oom_score.
I posted a one-liner ugly script to do that a few days ago on l-k.
The oom_adj has this effect on the badness() code:
        /*
         * Adjust the score by oomkilladj.
         */
        if (p->oomkilladj) {
                if (p->oomkilladj > 0)
                        points <<= p->oomkilladj;
                else
                        points >>= -(p->oomkilladj);
        }
The bigger the points become, the more likely the task will be chosen
by the oom killer.
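To make the scaling concrete (my own numbers, not from the patch): with
points = 1000, an oomkilladj of 2 gives points <<= 2, i.e. 4000, while an
oomkilladj of -2 gives points >>= 2, i.e. 250. Each unit of adjustment
doubles or halves the badness score.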
Hi Andrea,
Your OOM Killer patch was tested and a strange behaviour was found.
Basically, as a normal user we started some applications such as
openoffice, mozilla and emacs.
And as root (in another tty) we started a simple program that calls
malloc in a forever loop, as below:
#include <stdlib.h>

int main (void)
{
        int * mem;

        for (;;)        /* allocate forever, never free */
                mem = (int *) malloc(sizeof(int));
        return 0;
}
Using the original OOM Killer, malloc is the first killed application
and the system is restored to a usable state. After applying your patch
and performing the same experiment, the OOM Killer does not kill the
malloc program and enters a kind of forever loop, as below:
1) out_of_memory is invoked;
2) select_bad_process is invoked;
3) the following condition is fulfilled:
        if ((unlikely(test_tsk_thread_flag(p, TIF_MEMDIE)) ||
             (p->flags & PF_EXITING)) &&
            !(p->flags & PF_DEAD))
                return ERR_PTR(-1UL);
4) steps 1, 2, 3 above are executed again.
This loop (step 1 through step 4) lasts a long time (and nothing
is killed) until I give up and reboot the system after waiting for
some minutes.
Any comments? What do you think about our test case? Could you try the
same test case, using the malloc program as root and other graphical
applications as a normal user?
Let me know about your ideas.
BR,
Mauricio Lin.
On Tue, 2005-01-25 at 17:13 -0400, Mauricio Lin wrote:
> Hi Andrea,
>
> Your OOM Killer patch was tested and a strange behaviour was found.
> Basically, as a normal user we started some applications such as
> openoffice, mozilla and emacs.
> And as root (in another tty) we started a simple program that calls
> malloc in a forever loop, as below:
>
> #include <stdlib.h>
>
> int main (void)
> {
>         int * mem;
>
>         for (;;)        /* allocate forever, never free */
>                 mem = (int *) malloc(sizeof(int));
>         return 0;
> }
>
> Using the original OOM Killer, malloc is the first killed application
> and the system is restored to a usable state. After applying your patch
> and performing the same experiment, the OOM Killer does not kill the
> malloc program and enters a kind of forever loop, as below:
>
> 1) out_of_memory is invoked;
> 2) select_bad_process is invoked;
Which process is selected ?
> 3) the following condition is fulfilled:
>         if ((unlikely(test_tsk_thread_flag(p, TIF_MEMDIE)) ||
>              (p->flags & PF_EXITING)) &&
>             !(p->flags & PF_DEAD))
>                 return ERR_PTR(-1UL);
???
Can you please show the kernel messages ?
tglx
Hi Thomas,
On Tue, 25 Jan 2005 22:39:39 +0100, Thomas Gleixner <[email protected]> wrote:
> On Tue, 2005-01-25 at 17:13 -0400, Mauricio Lin wrote:
> > Hi Andrea,
> >
> > Your OOM Killer patch was tested and a strange behaviour was found.
> > Basically, as a normal user we started some applications such as
> > openoffice, mozilla and emacs.
> > And as root (in another tty) we started a simple program that calls
> > malloc in a forever loop, as below:
> >
> > #include <stdlib.h>
> >
> > int main (void)
> > {
> >         int * mem;
> >
> >         for (;;)        /* allocate forever, never free */
> >                 mem = (int *) malloc(sizeof(int));
> >         return 0;
> > }
> >
> > Using the original OOM Killer, malloc is the first killed application
> > and the system is restored to a usable state. After applying your patch
> > and performing the same experiment, the OOM Killer does not kill the
> > malloc program and enters a kind of forever loop, as below:
> >
> > 1) out_of_memory is invoked;
> > 2) select_bad_process is invoked;
>
> Which process is selected ?
Sometimes the first application to be killed is XFree. AFAIK the
malloc program is never killed, because the OOM Killer never finishes
its work. Usually we are not able to check the kernel log file after
rebooting the system, because nothing was written there (perhaps
syslogd or klogd were killed during OOM). But I can see the printk
messages on the screen during the OOM Killer action. This does not
happen with the original OOM Killer.
I put in some printks in order to trace the OOM Killer, and IMHO what is
going on is this: the out_of_memory function is invoked, and after that
select_bad_process is also invoked. So it starts to inspect each task.
But during the do_each_thread / while_each_thread loop the condition:
        if ((unlikely(test_tsk_thread_flag(p, TIF_MEMDIE)) ||
             (p->flags & PF_EXITING)) &&
            !(p->flags & PF_DEAD))
                return ERR_PTR(-1UL);
is true, and it leaves the select_bad_process function because of the
return statement. So the running code returns to the point where
select_bad_process was called, i.e., in the out_of_memory function. The
condition statement in the out_of_memory function:
        if (PTR_ERR(p) == -1UL)
                goto out;
is also true, so it goes to the "out" label and leaves the out_of_memory
function. But because of the OOM state the out_of_memory function is
invoked again, and after that select_bad_process is also invoked again.
During the do_each_thread / while_each_thread loop the same condition as
mentioned above is true again, so it leaves select_bad_process because of
the return statement, goes to the "out" label, and leaves the
out_of_memory function again. This behaviour repeats continuously for a
long time, until I stop waiting and reboot the system using my own finger.
> Can you please show the kernel messages ?
OK. We will try to reach a situation where the printk messages can be
written entirely to the log file and show you the kernel messages. But
as I said: usually the printk messages are not written to the log
file with Andrea's patch, but with the original OOM Killer we can see
the messages in the log file. The syslog.conf file is the same for both
OOM Killers (Andrea's and the original). Do you have any idea what is
happening to the log file?
If you do not mind, could you try the same test case I mentioned in my
last email? I would like to know if this problem happens to other
people as well.
We tested on laptop and desktop machines with 128MB of RAM and
swap space disabled.
BR,
Mauricio Lin.
On Tue, Jan 25, 2005 at 08:11:19PM -0400, Mauricio Lin wrote:
> Sometimes the first application to be killed is XFree. AFAIK the
This makes more sense now. You need somebody trapping SIGTERM in order
to lock up, and X certainly traps it to recover the text console.
Can you replace this:
if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) {
force_sig(SIGTERM, p);
} else {
force_sig(SIGKILL, p);
}
with this?
force_sig(SIGKILL, p);
in mm/oom_kill.c.
This should fix it. The problem is that SIGTERM is unsafe even if the
app is not malicious: there's not enough RAM to page in the userland
signal handler, so the system locks up.
We need a sort of timeout where we fall back to SIGKILL if SIGTERM
didn't help.
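Something like the below could do it. This is only a sketch of the
idea, not a patch: the oom_sigterm_jiffies field is made up and would
have to be added to the task struct.

#define OOM_SIGTERM_TIMEOUT     (5 * HZ)

/*
 * Sketch only: send SIGTERM first so apps with hardware access can
 * clean up, then fall back to SIGKILL if the task is still around
 * after a few seconds.  oom_sigterm_jiffies is a hypothetical task
 * struct field recording when SIGTERM was sent.
 */
static void oom_kill_with_fallback(task_t *p)
{
        if (!p->oom_sigterm_jiffies) {
                p->oom_sigterm_jiffies = jiffies;
                force_sig(SIGTERM, p);
        } else if (time_after(jiffies, p->oom_sigterm_jiffies +
                              OOM_SIGTERM_TIMEOUT)) {
                /*
                 * SIGTERM didn't help: the handler may not be
                 * pageable under OOM, so use the reliable kill.
                 */
                force_sig(SIGKILL, p);
        }
}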
Anyway this is not a new bug; I didn't touch a single bit in that code.
I'd really like to see the current fixes merged, then we can take care
of root apps getting killed reliably. In all my tests I always ran the
malicious app as non-root, and anyway I never trap SIGTERM (X is tiny
in my setup, so it never gets killed). Probably the GUI stuff you
opened increased X's size significantly enough for X to be killed.
On Tue, 2005-01-25 at 20:11 -0400, Mauricio Lin wrote:
> > Can you please show the kernel messages?
>
> OK. We will try to reach a situation that the printk messages can be
> written entirely in the log file and show you the kernel messages. But
> as I said: usually the printks messages are not written in the log
> file using Andrea's patch. But using the original OOM Killer we can
> see the messages in the log file. The syslog.conf file is the same for
> both OOM Killer(Andrea and Original). Do you have any idea what is
> happening to log file?
Add "console=ttyS0,115200" to your commandline so you get all the
messages on the serial console.
tglx
Hi Andrea,
On Wed, 26 Jan 2005 01:49:01 +0100, Andrea Arcangeli <[email protected]> wrote:
> On Tue, Jan 25, 2005 at 08:11:19PM -0400, Mauricio Lin wrote:
> > Sometimes the first application to be killed is XFree. AFAIK the
>
> This makes more sense now. You need somebody trapping SIGTERM in order
> to lock up, and X certainly traps it to recover the text console.
>
> Can you replace this:
>
> if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) {
> force_sig(SIGTERM, p);
> } else {
> force_sig(SIGKILL, p);
> }
>
> with this?
OK, let me test it. If I get some news, I will let you know.
>
> force_sig(SIGKILL, p);
>
> in mm/oom_kill.c.
BR,
Mauricio Lin.
Hi Andrea,
On Wed, 26 Jan 2005 01:49:01 +0100, Andrea Arcangeli <[email protected]> wrote:
> On Tue, Jan 25, 2005 at 08:11:19PM -0400, Mauricio Lin wrote:
> > Sometimes the first application to be killed is XFree. AFAIK the
>
> This makes more sense now. You need somebody trapping SIGTERM in order
> to lock up, and X certainly traps it to recover the text console.
>
> Can you replace this:
>
> if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) {
> force_sig(SIGTERM, p);
> } else {
> force_sig(SIGKILL, p);
> }
>
> with this?
>
> force_sig(SIGKILL, p);
>
> in mm/oom_kill.c.
Nice. Your suggestion made the error go away.
We are still testing in order to compare your OOM Killer with the
original OOM Killer.
BR,
Mauricio Lin.
On Thu, Jan 27, 2005 at 02:54:13PM -0400, Mauricio Lin wrote:
> Hi Andrea,
>
> On Wed, 26 Jan 2005 01:49:01 +0100, Andrea Arcangeli <[email protected]> wrote:
> > On Tue, Jan 25, 2005 at 08:11:19PM -0400, Mauricio Lin wrote:
> > > Sometimes the first application to be killed is XFree. AFAIK the
> >
> > This makes more sense now. You need somebody trapping SIGTERM in order
> > to lock up, and X certainly traps it to recover the text console.
> >
> > Can you replace this:
> >
> > if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) {
> > force_sig(SIGTERM, p);
> > } else {
> > force_sig(SIGKILL, p);
> > }
> >
> > with this?
> >
> > force_sig(SIGKILL, p);
> >
> > in mm/oom_kill.c.
>
> Nice. Your suggestion made the error go away.
>
> We are still testing in order to compare your OOM Killer with the
> original OOM Killer.
Ok, thanks for the confirmation. So my theory was right.
Basically we have to make this patch. Now that you have already edited
the code, can you diff and send a patch that will be the 6/5 in the
series?
(then after fixing this last very longstanding [now deadlock-prone
too] bug, we can think about how to make a 7/5 that will wait a few
seconds after sending a SIGTERM before falling back to a SIGKILL; that
shouldn't be difficult, but the above 6/5 will already make the code
correct)
Note, if you add swap it'll work around it too, since then the memhog
will be allowed to grow to a larger RSS than X. With 128M of RAM and
no swap, X is one of the biggest tasks, with xshm involved from some
client app allocating lots of pictures. I could never notice since I
always tested either with swap or on higher-mem systems, and my test
box runs with an idle X too, which isn't that big ;).
Andrea Arcangeli <[email protected]> wrote:
>
> > > Can you replace this:
> > >
> > > if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) {
> > > force_sig(SIGTERM, p);
> > > } else {
> > > force_sig(SIGKILL, p);
> > > }
> > >
> > > with this?
> > >
> > > force_sig(SIGKILL, p);
> > >
> > > in mm/oom_kill.c.
> >
> > Nice. Your suggestion made the error go away.
> >
> > We are still testing in order to compare your OOM Killer with the
> > original OOM Killer.
>
> Ok, thanks for the confirmation. So my theory was right.
>
> Basically we have to make this patch. Now that you have already edited
> the code, can you diff and send a patch that will be the 6/5 in the
> series?
>
I've already queued a patch for this:
--- 25/mm/oom_kill.c~mm-fix-several-oom-killer-bugs-fix Thu Jan 27 13:56:58 2005
+++ 25-akpm/mm/oom_kill.c Thu Jan 27 13:57:19 2005
@@ -198,12 +198,7 @@ static void __oom_kill_task(task_t *p)
p->time_slice = HZ;
p->memdie = 1;
- /* This process has hardware access, be more careful. */
- if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) {
- force_sig(SIGTERM, p);
- } else {
- force_sig(SIGKILL, p);
- }
+ force_sig(SIGKILL, p);
}
static struct mm_struct *oom_kill_task(task_t *p)
However, this means that we'll now kill off tasks which have hardware
access. What are the implications of this?
On Thu, Jan 27, 2005 at 02:29:43PM -0800, Andrew Morton wrote:
> I've already queued a patch for this:
>
> --- 25/mm/oom_kill.c~mm-fix-several-oom-killer-bugs-fix Thu Jan 27 13:56:58 2005
> +++ 25-akpm/mm/oom_kill.c Thu Jan 27 13:57:19 2005
> @@ -198,12 +198,7 @@ static void __oom_kill_task(task_t *p)
> p->time_slice = HZ;
> p->memdie = 1;
>
> - /* This process has hardware access, be more careful. */
> - if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) {
> - force_sig(SIGTERM, p);
> - } else {
> - force_sig(SIGKILL, p);
> - }
> + force_sig(SIGKILL, p);
> }
>
> static struct mm_struct *oom_kill_task(task_t *p)
Thanks.
> However, this means that we'll now kill off tasks which have hardware
> access. What are the implications of this?
The implication of the above is basically that the X server won't be
able to restore the text mode, but that avoids the deadlock ;).
And they did not necessarily have hardware access. They "might" have
hardware access. Note that an app may have hardware access even if it
has no rawio capabilities: one can run iopl and then change uid just
fine. So the above check is quite weak, since it leaves the kernel
susceptible to bugs and memleaks in any app started by root. The
kernel shouldn't trust root apps; all apps are buggy, root apps too (I
even once fixed a signal race in /sbin/init that showed up with the
schedule-child-first sched optimization ;).
iopl and ioperm are the only two things we care about. We can get a
synchronous, reliable eflags/ioperm value only from the "regs" in the
task context. The problem is that since we can pick a task to kill
that isn't necessarily the current task, we have to approximate and
assume the process is sleeping. The regs must be saved during a
reschedule, so the old contents should be cached, and perhaps we can
get a practically reliable eflags dump from the tss_struct. But this
will not be common code and it'll require a specialized arch API, like
has_hw_access(). Only then can we make a stronger assumption and be
truly careful about sending SIGKILL.
The right way to do this is probably to wait a few seconds before
sending the SIGKILL. I'm not currently sure whether it's worth adding
has_hw_access(), but I would certainly prefer to do nothing special
based on the sys_rawio capability alone. I thought I would wait for
the other patches to be merged, to avoid confusion, before making more
changes (since it'd be a pretty self-contained feature), but I can do
that now if you prefer.
On Thu, Jan 27, 2005 at 03:35:35PM -0800, Andrew Morton wrote:
> On x86 we could perhaps test for non-nullness of tsk->thread.io_bitmap_ptr?
Yes, for ioports. But I'm afraid I was too optimistic about eflags for
iopl: that's not in the per-task tss, it's only stored at the very top
of the kernel stack and inherited during fork/clone. So we probably
need to check esp0 and read the top of the stack to see if a task has
eflags set. esp0 is definitely stored in the thread struct when the
task is rescheduled, and it cannot change for any given task, so we
can access it even while the task is runnable, and it shouldn't be
corrupted by iret. But the problem is that sysenter is optimized not
to save eflags on the kernel stack, so the top of the stack minus 12
bytes would not contain eflags if sysenter is in use.
So basically we'd need to change iopl to propagate the info to the
task struct synchronously somehow, because we can't read it reliably
from the kernel stack.
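So a hypothetical has_hw_access() for x86 could start out like the
sketch below. The ioperm half is the easy one; the iopl half depends
on a tsk->thread.iopl flag that does not exist today and would have to
be set synchronously by sys_iopl:

/*
 * Sketch of a hypothetical x86-only has_hw_access() arch helper.
 * Not in any tree; shown only to illustrate the idea.
 */
static inline int has_hw_access(struct task_struct *tsk)
{
        /* ioperm(): a private I/O bitmap means some ports are open */
        if (tsk->thread.io_bitmap_ptr)
                return 1;
        /*
         * iopl(): eflags can't be read reliably from the kernel stack
         * (sysenter doesn't save it), so sys_iopl would have to set a
         * hypothetical tsk->thread.iopl flag synchronously, and we
         * would test that flag here.
         */
        return 0;
}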
Andrea Arcangeli <[email protected]> wrote:
>
> And they did not necessarily have hardware access. They "might" have
> hardware access.
On x86 we could perhaps test for non-nullness of tsk->thread.io_bitmap_ptr?
> I thought I could wait the other patches
> to be merged to avoid confusion before making more changes (since it'd
> be a pretty self contained feature), but I can do that now if you
> prefer.
I'll send your current stuff off to Linus in the next few days - we can
let that sit for a while and use it as a base for further work.
Hi Andrea,
On Thu, 27 Jan 2005 23:11:29 +0100, Andrea Arcangeli <[email protected]> wrote:
> On Thu, Jan 27, 2005 at 02:54:13PM -0400, Mauricio Lin wrote:
> > Hi Andrea,
> >
> > On Wed, 26 Jan 2005 01:49:01 +0100, Andrea Arcangeli <[email protected]> wrote:
> > > On Tue, Jan 25, 2005 at 08:11:19PM -0400, Mauricio Lin wrote:
> > > > Sometimes the first application to be killed is XFree. AFAIK the
> > >
> > > This makes more sense now. You need somebody trapping SIGTERM in order
> > > to lock up, and X certainly traps it to recover the text console.
> > >
> > > Can you replace this:
> > >
> > > if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) {
> > > force_sig(SIGTERM, p);
> > > } else {
> > > force_sig(SIGKILL, p);
> > > }
> > >
> > > with this?
> > >
> > > force_sig(SIGKILL, p);
> > >
> > > in mm/oom_kill.c.
> >
> > Nice. Your suggestion made the error go away.
> >
> > We are still testing in order to compare your OOM Killer with the
> > original OOM Killer.
>
> Ok, thanks for the confirmation. So my theory was right.
>
> Basically we have to make this patch. Now that you have already edited
> the code, can you diff and send a patch that will be the 6/5 in the
> series?
OK. I will send the patch.
> (then after fixing this last very longstanding [now deadlock-prone
> too] bug, we can think about how to make a 7/5 that will wait a few
> seconds after sending a SIGTERM before falling back to a SIGKILL; that
> shouldn't be difficult, but the above 6/5 will already make the code
> correct)
>
> Note, if you add swap it'll work around it too, since then the memhog
> will be allowed to grow to a larger RSS than X. With 128M of RAM and
> no swap, X is one of the biggest tasks, with xshm involved from some
> client app allocating lots of pictures. I could never notice since I
> always tested either with swap or on higher-mem systems, and my test
> box runs with an idle X too, which isn't that big ;).
Well, we like to reduce the memory resources used, because we also
think about the OOM Killer on small devices with few resources.
BR,
Mauricio Lin.
Hi Andrea,
On Fri, 28 Jan 2005 09:58:24 -0400, Mauricio Lin <[email protected]> wrote:
> Hi Andrea,
>
> On Thu, 27 Jan 2005 23:11:29 +0100, Andrea Arcangeli <[email protected]> wrote:
> > On Thu, Jan 27, 2005 at 02:54:13PM -0400, Mauricio Lin wrote:
> > > Hi Andrea,
> > >
> > > On Wed, 26 Jan 2005 01:49:01 +0100, Andrea Arcangeli <[email protected]> wrote:
> > > > On Tue, Jan 25, 2005 at 08:11:19PM -0400, Mauricio Lin wrote:
> > > > > Sometimes the first application to be killed is XFree. AFAIK the
> > > >
> > > > This makes more sense now. You need somebody trapping SIGTERM in order
> > > > to lock up, and X certainly traps it to recover the text console.
> > > >
> > > > Can you replace this:
> > > >
> > > > if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) {
> > > > force_sig(SIGTERM, p);
> > > > } else {
> > > > force_sig(SIGKILL, p);
> > > > }
> > > >
> > > > with this?
> > > >
> > > > force_sig(SIGKILL, p);
> > > >
> > > > in mm/oom_kill.c.
> > >
> > > Nice. Your suggestion made the error go away.
> > >
> > > We are still testing in order to compare your OOM Killer with the
> > > original OOM Killer.
> >
> > Ok, thanks for the confirmation. So my theory was right.
> >
> > Basically we have to make this patch. Now that you have already edited
> > the code, can you diff and send a patch that will be the 6/5 in the
> > series?
>
> OK. I will send the patch.
As you know, Andrew generated the patch. Here are some test results
comparing your OOM Killer and the original OOM Killer. We ran 10
experiments for each OOM Killer; below are the average values.
"Invocations" is the number of times the out_of_memory function is
called, "Selections" is the number of times the select_bad_process
function is called, and "Killed" is the number of killed processes.
Original OOM Killer
Invocations average = 51620/10 = 5162
Selections average = 30/10 = 3
Killed average = 38/10 = 3.8
Andrea OOM Killer
Invocations average = 213/10 = 21.3
Selections average = 213/10 = 21.3
Killed average = 52/10 = 5.2
As you can see, the number of invocations is reduced significantly
with your OOM Killer.
I did not know about this problem when I was moving the original
ranking algorithm to userland. As Thomas mentioned: invocation
madness, reentrancy problems, and those strange timers and counters
such as now, since, last, lastkill and count. I guess that now I can
put some OOM Killer stuff in userland in a safer manner, with those
problems solved, right?
BTW, will your OOM Killer be included in the kernel tree?
BR,
Mauricio Lin.
On Fri, Jan 28, 2005 at 11:21:11AM -0400, Mauricio Lin wrote:
> As you know, Andrew generated the patch. Here are some test results
> comparing your OOM Killer and the original OOM Killer. We ran 10
> experiments for each OOM Killer; below are the average values.
>
> "Invocations" is the number of times the out_of_memory function is
> called, "Selections" is the number of times the select_bad_process
> function is called, and "Killed" is the number of killed processes.
>
> Original OOM Killer
> Invocations average = 51620/10 = 5162
> Selections average = 30/10 = 3
> Killed average = 38/10 = 3.8
>
> Andrea OOM Killer
> Invocations average = 213/10 = 21.3
> Selections average = 213/10 = 21.3
> Killed average = 52/10 = 5.2
>
> As you can see, the number of invocations is reduced significantly
> with your OOM Killer.
Yep, thanks for testing!
> I did not know about this problem when I was moving the original
> ranking algorithm to userland. As Thomas mentioned: invocation
> madness, reentrancy problems, and those strange timers and counters
> such as now, since, last, lastkill and count. I guess that now I can
> put some OOM Killer stuff in userland in a safer manner, with those
> problems solved, right?
Yep ;)
> BTW, will your OOM Killer be included in the kernel tree?
Yes, Andrew said it should go in within the next few days, which is
great news. Thanks everyone!