These are a dump of patches just to show the concept of what I want to do.
They are not tested. Please take a look if you have free time. (You can ignore them ;)
Anyway, this will apply with hunk offsets against the latest mmotm, where Kirill's work is merged.
This is not related to David's work. I won't hesitate to rebase mine
onto mmotm if his work is merged; it's easy.
But I'm not sure his work will go into mm soon.
The 1st patch improves oom-kill handling under memcg.
The 2nd patch adds oom-notifier and oom-kill-disable knobs for memcg.
I'm sorry that I'll be absent tomorrow.
==
From: KAMEZAWA Hiroyuki <[email protected]>
This is an updated version of the oom handling improvement for memcg.
All the code has been totally renewed. It may not be well polished, but
it is enough to show the idea.
This patch does the following things.
* Set "memcg is under OOM" if someone gets into OOM under a memcg.
Like the zone's OOM lock, the tree of memcgs is marked as under OOM.
By this, simultaneous OOM kills in a tree will not happen.
* When other threads try to reclaim memory or call oom-kill, each checks
whether its own target memcg is under oom or not. If someone has already
called the oom-killer, the thread is queued on a waitq.
* At some event which makes room for new memory, threads on the waitq
are woken up.
** A page (or chunk of pages) is uncharged.
** A task is moved.
** The limit is enlarged.
This patch also allows checking whether current's memcg has been changed
while charging; the intended flow of the charge path is sketched below.
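Just to make the flow easier to follow before reading the diff, here is a
rough sketch of the charge path. This is only an illustration, not the real
code; lookup_target_memcg(), try_charge_once() and reclaim_some() are
placeholders for what the real code below does inline.

static int charge_sketch(struct mem_cgroup **memcg, gfp_t mask, bool oom)
{
        struct mem_cgroup *mem, *recorded = *memcg;

start:
        /* re-evaluated after every sleep; current may have been moved */
        mem = lookup_target_memcg(recorded);
        if (!try_charge_once(mem))
                return 0;                       /* charged */

        if (memcg_under_oom(mem)) {
                /* someone already runs oom-kill in this (sub)tree; sleep
                   until uncharge / task move / limit enlargement */
                memcg_oom_wait(mem);
                goto start;
        }

        if (reclaim_some(mem, mask))            /* reclaim made some room */
                goto start;

        if (!oom)
                return -ENOMEM;

        /* the first caller in the subtree runs oom-kill, others just sleep */
        if (memcg_handle_oom(mem, mask))
                goto start;

        return 0;       /* current itself was selected; let it exit smoothly */
}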
Considering what an admin/daemon can do when it notices OOM:
* kill a process
* move a process (to another cgroup which has room)
* remove a file (on tmpfs or similar)
* enlarge the limit
I think all the chances for waking up waiters are covered by these.
After this patch, memcg's accounting will not fail in the usual path.
If all tasks are OOM_DISABLE, the memcg may hang. But the admin has the
several options described above. So, an oom notifier + freeze should be
implemented.
TODO: maybe not difficult.
* Add an oom notifier. (Can we reuse memory.threshold?)
* Add a switch for oom-freeze rather than oom-kill.
Cc: David Rientjes <[email protected]>
Cc: Balbir Singh <[email protected]>
Cc: Daisuke Nishimura <[email protected]>
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
include/linux/memcontrol.h | 6 -
mm/memcontrol.c | 208 +++++++++++++++++++++++++++++++++++----------
mm/oom_kill.c | 11 --
3 files changed, 167 insertions(+), 58 deletions(-)
Index: mmotm-2.6.33-Feb11/mm/memcontrol.c
===================================================================
--- mmotm-2.6.33-Feb11.orig/mm/memcontrol.c
+++ mmotm-2.6.33-Feb11/mm/memcontrol.c
@@ -200,7 +200,6 @@ struct mem_cgroup {
* Should the accounting and control be hierarchical, per subtree?
*/
bool use_hierarchy;
- unsigned long last_oom_jiffies;
atomic_t refcnt;
unsigned int swappiness;
@@ -223,6 +222,8 @@ struct mem_cgroup {
*/
unsigned long move_charge_at_immigrate;
+ /* counting ongoing OOM requests under sub hierarchy */
+ atomic_t oom_count;
/*
* percpu counter.
*/
@@ -1096,6 +1097,89 @@ done:
}
/*
+ * set/unset of memcg_oom counting is done under mutex.
+ */
+static DEFINE_MUTEX(memcg_oom_mutex);
+static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
+
+static int set_memcg_oom_cb(struct mem_cgroup *mem, void *data)
+{
+ int *max_count = (int*)data;
+ int count = atomic_inc_return(&mem->oom_count);
+ if (*max_count < count)
+ *max_count = count;
+ return 0;
+}
+
+static int unset_memcg_oom_cb(struct mem_cgroup *mem, void *data)
+{
+ atomic_set(&mem->oom_count, 0);
+ return 0;
+}
+
+static bool memcg_under_oom(struct mem_cgroup *mem)
+{
+ if (atomic_read(&mem->oom_count))
+ return true;
+ return false;
+}
+
+static void memcg_oom_wait(struct mem_cgroup *mem)
+{
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&memcg_oom_waitq, &wait, TASK_INTERRUPTIBLE);
+ if (memcg_under_oom(mem))
+ schedule();
+ finish_wait(&memcg_oom_waitq, &wait);
+}
+
+static void memcg_oom_wake(void)
+{
+ /* This may wake up unnecessary tasks..but it's not big problem */
+ wake_up_all(&memcg_oom_waitq);
+}
+/*
+ * Check there are ongoing oom-kill in this hierarchy or not.
+ * If now under oom-kill, wait for some event to restart job.
+ */
+static bool memcg_handle_oom(struct mem_cgroup *mem, gfp_t mask)
+{
+ int oom_count = 0;
+ DEFINE_WAIT(wait);
+ /*
+ * Considering hierarchy (below)
+ * /A
+ * /01
+ * /02
+ * If 01 or 02 is under oom-kill, oom-kill in A should wait.
+ * If "A" is under oom-kill, oom-kill in 01 and 02 should wait.
+ * (task in 01/02 can be killed.)
+ * But if 01 is under oom-kill, 02's oom-kill is independent from it.
+ */
+ prepare_to_wait(&memcg_oom_waitq, &wait, TASK_INTERRUPTIBLE);
+ mem_cgroup_walk_tree(mem, &oom_count, set_memcg_oom_cb);
+ /* Am I the 1st oom killer in this sub hierarchy ? */
+ if (oom_count == 1) {
+ finish_wait(&memcg_oom_waitq, &wait);
+ mem_cgroup_out_of_memory(mem, mask);
+ mem_cgroup_walk_tree(mem, NULL, unset_memcg_oom_cb);
+ } else {
+ /*
+ * Wakeup is called when
+ * 1. pages are uncharged. (by killed, or removal of a file)
+ * 2. limit is enlarged.
+ * 3. a task is moved.
+ */
+ schedule();
+ finish_wait(&memcg_oom_waitq, &wait);
+ }
+ if (test_thread_flag(TIF_MEMDIE))
+ return false;
+ return true;
+}
+
+/*
* This function returns the number of memcg under hierarchy tree. Returns
* 1(self count) if no children.
*/
@@ -1234,34 +1318,6 @@ static int mem_cgroup_hierarchical_recla
return total;
}
-bool mem_cgroup_oom_called(struct task_struct *task)
-{
- bool ret = false;
- struct mem_cgroup *mem;
- struct mm_struct *mm;
-
- rcu_read_lock();
- mm = task->mm;
- if (!mm)
- mm = &init_mm;
- mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
- if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
- ret = true;
- rcu_read_unlock();
- return ret;
-}
-
-static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
-{
- mem->last_oom_jiffies = jiffies;
- return 0;
-}
-
-static void record_last_oom(struct mem_cgroup *mem)
-{
- mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
-}
-
/*
* Currently used to update mapped file statistics, but the routine can be
* generalized to update other statistics as well.
@@ -1419,6 +1475,7 @@ static int __cpuinit memcg_stock_cpu_cal
return NOTIFY_OK;
}
+
/*
* Unlike exported interface, "oom" parameter is added. if oom==true,
* oom-killer can be invoked.
@@ -1427,17 +1484,21 @@ static int __mem_cgroup_try_charge(struc
gfp_t gfp_mask, struct mem_cgroup **memcg,
bool oom, struct page *page)
{
- struct mem_cgroup *mem, *mem_over_limit;
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ struct mem_cgroup *mem, *mem_over_limit, *recorded;
+ int nr_retries, csize;
struct res_counter *fail_res;
- int csize = CHARGE_SIZE;
+
+start:
+ nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ recorded = *memcg;
+ csize = CHARGE_SIZE;
+ mem = NULL;
if (unlikely(test_thread_flag(TIF_MEMDIE))) {
/* Don't account this! */
*memcg = NULL;
return 0;
}
-
/*
* We always charge the cgroup the mm_struct belongs to.
* The mm_struct's mem_cgroup changes on task migration if the
@@ -1489,6 +1550,12 @@ static int __mem_cgroup_try_charge(struc
}
if (!(gfp_mask & __GFP_WAIT))
goto nomem;
+ /* already in OOM ? */
+ if (memcg_under_oom(mem_over_limit)) {
+ /* Don't add too much pressure to the host */
+ memcg_oom_wait(mem_over_limit);
+ goto retry;
+ }
ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
gfp_mask, flags);
@@ -1549,11 +1616,15 @@ static int __mem_cgroup_try_charge(struc
}
if (!nr_retries--) {
- if (oom) {
- mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
- record_last_oom(mem_over_limit);
- }
- goto nomem;
+
+ if (!oom)
+ goto nomem;
+ /* returns false if current is killed */
+ if (memcg_handle_oom(mem_over_limit, gfp_mask))
+ goto retry;
+ /* For smooth oom-kill of current, return 0 */
+ css_put(&mem->css);
+ return 0;
}
}
if (csize > PAGE_SIZE)
@@ -1572,6 +1643,15 @@ done:
nomem:
css_put(&mem->css);
return -ENOMEM;
+
+retry:
+ /*
+ * current's mem_cgroup can be moved while we're waiting for
+ * memory reclaim or OOM-Kill.
+ */
+ *memcg = recorded;
+ css_put(&mem->css);
+ goto start;
}
/*
@@ -1589,6 +1669,9 @@ static void __mem_cgroup_cancel_charge(s
VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
WARN_ON_ONCE(count > INT_MAX);
__css_put(&mem->css, (int)count);
+
+ if (memcg_under_oom(mem))
+ memcg_oom_wake();
}
/* we don't need css_put for root */
}
@@ -2061,6 +2144,10 @@ direct_uncharge:
res_counter_uncharge(&mem->res, PAGE_SIZE);
if (uncharge_memsw)
res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+ /* Slow path to check OOM waiters */
+ if (!current->memcg_batch.do_batch || batch->memcg != mem)
+ if (memcg_under_oom(mem))
+ memcg_oom_wake();
return;
}
@@ -2200,6 +2287,9 @@ void mem_cgroup_uncharge_end(void)
res_counter_uncharge(&batch->memcg->res, batch->bytes);
if (batch->memsw_bytes)
res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
+
+ if (memcg_under_oom(batch->memcg))
+ memcg_oom_wake();
/* forget this pointer (for sanity check) */
batch->memcg = NULL;
}
@@ -2408,8 +2498,7 @@ void mem_cgroup_end_migration(struct mem
/*
* A call to try to shrink memory usage on charge failure at shmem's swapin.
- * Calling hierarchical_reclaim is not enough because we should update
- * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
+ * Calling hierarchical_reclaim is not enough because we have to handle oom-kill.
* Moreover considering hierarchy, we should reclaim from the mem_over_limit,
* not from the memcg which this page would be charged to.
* try_charge_swapin does all of these works properly.
@@ -2440,7 +2529,8 @@ static int mem_cgroup_resize_limit(struc
u64 memswlimit;
int ret = 0;
int children = mem_cgroup_count_children(memcg);
- u64 curusage, oldusage;
+ u64 curusage, oldusage, curlimit;
+ int enlarge = 0;
/*
* For keeping hierarchical_reclaim simple, how long we should retry
@@ -2451,6 +2541,7 @@ static int mem_cgroup_resize_limit(struc
oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
+
while (retry_count) {
if (signal_pending(current)) {
ret = -EINTR;
@@ -2468,6 +2559,9 @@ static int mem_cgroup_resize_limit(struc
mutex_unlock(&set_limit_mutex);
break;
}
+ curlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+ if (curlimit < val)
+ enlarge = 1;
ret = res_counter_set_limit(&memcg->res, val);
if (!ret) {
if (memswlimit == val)
@@ -2477,8 +2571,20 @@ static int mem_cgroup_resize_limit(struc
}
mutex_unlock(&set_limit_mutex);
- if (!ret)
+ if (!ret) {
+ /*
+ * If we enlarge limit of memcg under OOM,
+ * wake up waiters.
+ */
+ if (enlarge && memcg_under_oom(memcg))
+ memcg_oom_wake();
+ break;
+ }
+ /* Under OOM ? If so, don't add more pressure. */
+ if (memcg_under_oom(memcg)) {
+ ret = -EBUSY;
break;
+ }
mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
MEM_CGROUP_RECLAIM_SHRINK);
@@ -2497,9 +2603,10 @@ static int mem_cgroup_resize_memsw_limit
unsigned long long val)
{
int retry_count;
- u64 memlimit, oldusage, curusage;
+ u64 memlimit, oldusage, curusage, curlimit;
int children = mem_cgroup_count_children(memcg);
int ret = -EBUSY;
+ int enlarge = 0;
/* see mem_cgroup_resize_res_limit */
retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
@@ -2521,6 +2628,9 @@ static int mem_cgroup_resize_memsw_limit
mutex_unlock(&set_limit_mutex);
break;
}
+ curlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+ if (curlimit < val)
+ enlarge = 1;
ret = res_counter_set_limit(&memcg->memsw, val);
if (!ret) {
if (memlimit == val)
@@ -2530,8 +2640,15 @@ static int mem_cgroup_resize_memsw_limit
}
mutex_unlock(&set_limit_mutex);
- if (!ret)
+ if (!ret) {
+ if (enlarge && memcg_under_oom(memcg))
+ memcg_oom_wake();
break;
+ }
+ if (memcg_under_oom(memcg)) {
+ ret = -EBUSY;
+ continue;
+ }
mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
MEM_CGROUP_RECLAIM_NOSWAP |
@@ -3859,6 +3976,9 @@ one_by_one:
ret = -EINTR;
break;
}
+ /* Undo precharges if there is ongoing OOM */
+ if (memcg_under_oom(mem))
+ return -ENOMEM;
if (!batch_count--) {
batch_count = PRECHARGE_COUNT_AT_ONCE;
cond_resched();
Index: mmotm-2.6.33-Feb11/mm/oom_kill.c
===================================================================
--- mmotm-2.6.33-Feb11.orig/mm/oom_kill.c
+++ mmotm-2.6.33-Feb11/mm/oom_kill.c
@@ -487,6 +487,9 @@ retry:
goto retry;
out:
read_unlock(&tasklist_lock);
+ /* give the selected process a chance to die */
+ if (!test_thread_flag(TIF_MEMDIE))
+ schedule_timeout_uninterruptible(1);
}
#endif
@@ -601,13 +604,6 @@ void pagefault_out_of_memory(void)
/* Got some memory back in the last second. */
return;
- /*
- * If this is from memcg, oom-killer is already invoked.
- * and not worth to go system-wide-oom.
- */
- if (mem_cgroup_oom_called(current))
- goto rest_and_return;
-
if (sysctl_panic_on_oom)
panic("out of memory from page fault. panic_on_oom is selected.\n");
@@ -619,7 +615,6 @@ void pagefault_out_of_memory(void)
* Give "p" a good chance of killing itself before we
* retry to allocate memory.
*/
-rest_and_return:
if (!test_thread_flag(TIF_MEMDIE))
schedule_timeout_uninterruptible(1);
}
Index: mmotm-2.6.33-Feb11/include/linux/memcontrol.h
===================================================================
--- mmotm-2.6.33-Feb11.orig/include/linux/memcontrol.h
+++ mmotm-2.6.33-Feb11/include/linux/memcontrol.h
@@ -124,7 +124,6 @@ static inline bool mem_cgroup_disabled(v
return false;
}
-extern bool mem_cgroup_oom_called(struct task_struct *task);
void mem_cgroup_update_file_mapped(struct page *page, int val);
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask, int nid,
@@ -258,11 +257,6 @@ static inline bool mem_cgroup_disabled(v
return true;
}
-static inline bool mem_cgroup_oom_called(struct task_struct *task)
-{
- return false;
-}
-
static inline int
mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
{
From: KAMEZAWA Hiroyuki <[email protected]>
This is just a toy for considering the problem.
memcg's OOM means "the usage hits the limit" and doesn't mean "there is no
resource". So, a user-land daemon may be able to do a better job than the
default oom-killer.
This patch adds
- an oom notifier for memcg.
The implementation is based on the threshold notifier.
- a disable_oom_kill flag.
If set, avoid calling the oom-killer and wait for an event (uncharge etc.)
Assume a user-land daemon which works in the root cgroup.
- the daemon registers an eventfd to wait for memcg's OOM
% ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes OOM
- set memcg's oom-killing disabled
% echo 1 > /cgroup/A/memory.disable_oom_kill
After wakeup, the daemon can...
- enlarge the limit (adding swap etc.)
- kill some processes
- move processes to another group
- send a signal to _important_ processes so they terminate safely and
send SIGSTOP to others, enlarging the limit for a while.
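Below is a minimal user-space sketch of such a daemon. It assumes the
registration interface of the threshold notifier is reused unchanged
(i.e. writing "<event_fd> <target_fd> OOM" to cgroup.event_control, which
is what cgroup_event_listener does); error handling is omitted.

#include <sys/eventfd.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        uint64_t cnt;
        char buf[64];
        int efd = eventfd(0, 0);
        int tfd = open("/cgroup/A/memory.usage_in_bytes", O_RDONLY);
        int cfd = open("/cgroup/A/cgroup.event_control", O_WRONLY);

        /* register the eventfd for memcg's OOM notification */
        snprintf(buf, sizeof(buf), "%d %d OOM", efd, tfd);
        write(cfd, buf, strlen(buf));

        for (;;) {
                /* blocks until the kernel signals OOM in /cgroup/A */
                read(efd, &cnt, sizeof(cnt));
                /* here: enlarge the limit, kill/move tasks, remove files... */
        }
        return 0;
}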
TODO:
- many...?
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
mm/memcontrol.c | 144 +++++++++++++++++++++++++++++++++++++++++++++-----------
1 file changed, 117 insertions(+), 27 deletions(-)
Index: mmotm-2.6.33-Feb11/mm/memcontrol.c
===================================================================
--- mmotm-2.6.33-Feb11.orig/mm/memcontrol.c
+++ mmotm-2.6.33-Feb11/mm/memcontrol.c
@@ -216,6 +216,10 @@ struct mem_cgroup {
/* thresholds for mem+swap usage. RCU-protected */
struct mem_cgroup_threshold_ary *memsw_thresholds;
+ /* Notifiers for OOM situation */
+ struct mem_cgroup_threshold_ary *oom_notify;
+ int oom_kill_disabled;
+
/*
* Should we move charges of a task when a task is moved into this
* mem_cgroup ? And what type of charges should we move ?
@@ -1143,6 +1147,7 @@ static void memcg_oom_wake(void)
* Check there are ongoing oom-kill in this hierarchy or not.
* If now under oom-kill, wait for some event to restart job.
*/
+static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
static bool memcg_handle_oom(struct mem_cgroup *mem, gfp_t mask)
{
int oom_count = 0;
@@ -1161,8 +1166,14 @@ static bool memcg_handle_oom(struct mem_
mem_cgroup_walk_tree(mem, &oom_count, set_memcg_oom_cb);
/* Am I the 1st oom killer in this sub hierarchy ? */
if (oom_count == 1) {
- finish_wait(&memcg_oom_waitq, &wait);
- mem_cgroup_out_of_memory(mem, mask);
+ mem_cgroup_oom_notify(mem);
+ if (!mem->oom_kill_disabled) {
+ finish_wait(&memcg_oom_waitq, &wait);
+ mem_cgroup_out_of_memory(mem, mask);
+ } else { /* give the admin daemon a chance to run */
+ schedule();
+ finish_wait(&memcg_oom_waitq, &wait);
+ }
mem_cgroup_walk_tree(mem, NULL, unset_memcg_oom_cb);
} else {
/*
@@ -3141,6 +3152,35 @@ static int mem_cgroup_move_charge_write(
return 0;
}
+static u64 mem_cgroup_oom_kill_disable_read(struct cgroup *cgrp,
+ struct cftype *cft)
+{
+ return mem_cgroup_from_cont(cgrp)->oom_kill_disabled;
+}
+
+static int mem_cgroup_oom_kill_disable_write(struct cgroup *cgrp,
+ struct cftype *cft, u64 val)
+{
+ struct cgroup *parent = cgrp->parent;
+ struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+ struct mem_cgroup *parent_mem = NULL;
+ int retval = 0;
+
+ if (val > 1)
+ return -EINVAL;
+ /*
+ * can be set only to root cgroup.
+ */
+ if (parent)
+ parent_mem = mem_cgroup_from_cont(parent);
+ cgroup_lock();
+ if (!parent_mem || !parent_mem->use_hierarchy)
+ mem->oom_kill_disabled = val;
+ else
+ retval = -EINVAL;
+ cgroup_unlock();
+ return retval;
+}
/* For read statistics */
enum {
@@ -3405,6 +3445,25 @@ static int compare_thresholds(const void
return _a->threshold - _b->threshold;
}
+static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
+{
+ struct mem_cgroup_threshold_ary *t;
+ int i;
+
+ rcu_read_lock();
+ t = rcu_dereference(mem->oom_notify);
+
+ for (i = 0; i < t->size; i++)
+ eventfd_signal(t->entries[i].eventfd, 1);
+ rcu_read_unlock();
+ return 0;
+}
+
+static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
+{
+ mem_cgroup_walk_tree(memcg, NULL, mem_cgroup_oom_notify_cb);
+}
+
static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft,
struct eventfd_ctx *eventfd, const char *args)
{
@@ -3414,23 +3473,30 @@ static int mem_cgroup_register_event(str
u64 threshold, usage;
int size;
int i, ret;
+ int oom = 0;
ret = res_counter_memparse_write_strategy(args, &threshold);
- if (ret)
+ if (ret) {
+ if (!strcmp(args, "oom") || !strcmp(args, "OOM"))
+ oom = 1;
return ret;
-
+ }
mutex_lock(&memcg->thresholds_lock);
- if (type == _MEM)
- thresholds = memcg->thresholds;
- else if (type == _MEMSWAP)
- thresholds = memcg->memsw_thresholds;
- else
- BUG();
+ if (!oom) {
+ if (type == _MEM)
+ thresholds = memcg->thresholds;
+ else if (type == _MEMSWAP)
+ thresholds = memcg->memsw_thresholds;
+ else
+ BUG();
+ } else {
+ thresholds = memcg->oom_notify;
+ }
usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
/* Check if a threshold crossed before adding a new one */
- if (thresholds)
+ if (!oom && thresholds)
__mem_cgroup_threshold(memcg, type == _MEMSWAP);
if (thresholds)
@@ -3458,20 +3524,22 @@ static int mem_cgroup_register_event(str
thresholds_new->entries[size - 1].threshold = threshold;
/* Sort thresholds. Registering of new threshold isn't time-critical */
- sort(thresholds_new->entries, size,
+ if (!oom) {
+ sort(thresholds_new->entries, size,
sizeof(struct mem_cgroup_threshold),
compare_thresholds, NULL);
- /* Find current threshold */
- atomic_set(&thresholds_new->current_threshold, -1);
- for (i = 0; i < size; i++) {
- if (thresholds_new->entries[i].threshold < usage) {
+ /* Find current threshold */
+ atomic_set(&thresholds_new->current_threshold, -1);
+ for (i = 0; i < size; i++) {
+ if (thresholds_new->entries[i].threshold < usage) {
/*
* thresholds_new->current_threshold will not be used
* until rcu_assign_pointer(), so it's safe to increment
* it here.
*/
- atomic_inc(&thresholds_new->current_threshold);
+ atomic_inc(&thresholds_new->current_threshold);
+ }
}
}
@@ -3480,11 +3548,12 @@ static int mem_cgroup_register_event(str
* will be unregistered before calling __mem_cgroup_free()
*/
mem_cgroup_get(memcg);
-
- if (type == _MEM)
+ if (oom)
+ rcu_assign_pointer(memcg->oom_notify, thresholds_new);
+ else if (type == _MEM)
rcu_assign_pointer(memcg->thresholds, thresholds_new);
else
- rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new);
+ rcu_assign_pointer(memcg->memsw_thresholds,thresholds_new);
/* To be sure that nobody uses thresholds before freeing it */
synchronize_rcu();
@@ -3502,8 +3571,9 @@ static int mem_cgroup_unregister_event(s
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
int type = MEMFILE_TYPE(cft->private);
- u64 usage;
+ u64 usage = 0;
int size = 0;
+ int oom = 0;
int i, j, ret;
mutex_lock(&memcg->thresholds_lock);
@@ -3513,17 +3583,29 @@ static int mem_cgroup_unregister_event(s
thresholds = memcg->memsw_thresholds;
else
BUG();
-
+ /* check it's oom notify or not */
+ if (memcg->oom_notify) {
+ for (i = 0; i < memcg->oom_notify->size; i++) {
+ if (memcg->oom_notify->entries[i].eventfd ==
+ eventfd) {
+ thresholds = memcg->oom_notify;
+ oom = 1;
+ break;
+ }
+ }
+ }
/*
* Something went wrong if we trying to unregister a threshold
* if we don't have thresholds
*/
BUG_ON(!thresholds);
- usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
+ if (!oom) {
+ usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
- /* Check if a threshold crossed before removing */
- __mem_cgroup_threshold(memcg, type == _MEMSWAP);
+ /* Check if a threshold crossed before removing */
+ __mem_cgroup_threshold(memcg, type == _MEMSWAP);
+ }
/* Calculate new number of threshold */
for (i = 0; i < thresholds->size; i++) {
@@ -3554,7 +3636,7 @@ static int mem_cgroup_unregister_event(s
continue;
thresholds_new->entries[j] = thresholds->entries[i];
- if (thresholds_new->entries[j].threshold < usage) {
+ if (!oom && thresholds_new->entries[j].threshold < usage) {
/*
* thresholds_new->current_threshold will not be used
* until rcu_assign_pointer(), so it's safe to increment
@@ -3566,7 +3648,9 @@ static int mem_cgroup_unregister_event(s
}
assign:
- if (type == _MEM)
+ if (oom)
+ rcu_assign_pointer(memcg->oom_notify, thresholds_new);
+ else if (type == _MEM)
rcu_assign_pointer(memcg->thresholds, thresholds_new);
else
rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new);
@@ -3639,6 +3723,11 @@ static struct cftype mem_cgroup_files[]
.read_u64 = mem_cgroup_move_charge_read,
.write_u64 = mem_cgroup_move_charge_write,
},
+ {
+ .name = "disable_oom_kill",
+ .read_u64 = mem_cgroup_oom_kill_disable_read,
+ .write_u64 = mem_cgroup_oom_kill_disable_write,
+ },
};
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -3886,6 +3975,7 @@ mem_cgroup_create(struct cgroup_subsys *
* mem_cgroup(see mem_cgroup_put).
*/
mem_cgroup_get(parent);
+ mem->oom_kill_disabled = parent->oom_kill_disabled;
} else {
res_counter_init(&mem->res, NULL);
res_counter_init(&mem->memsw, NULL);
On Wed, 24 Feb 2010 16:59:21 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
> These are a dump of patches just to show the concept of what I want to do.
> They are not tested. Please take a look if you have free time. (You can ignore them ;)
>
> Anyway, this will apply with hunk offsets against the latest mmotm, where Kirill's work is merged.
>
> This is not related to David's work. I won't hesitate to rebase mine
> onto mmotm if his work is merged; it's easy.
> But I'm not sure his work will go into mm soon.
>
> The 1st patch improves oom-kill handling under memcg.
It's bigger than I expected, but it basically looks good to me.
> The 2nd patch adds oom-notifier and oom-kill-disable knobs for memcg.
>
This feature is very attractive.
One comment on this patch for now.
> +/*
> + * Check there are ongoing oom-kill in this hierarchy or not.
> + * If now under oom-kill, wait for some event to restart job.
> + */
> +static bool memcg_handle_oom(struct mem_cgroup *mem, gfp_t mask)
> +{
> + int oom_count = 0;
> + DEFINE_WAIT(wait);
> + /*
> + * Considering hierarchy (below)
> + * /A
> + * /01
> + * /02
> + * If 01 or 02 is under oom-kill, oom-kill in A should wait.
> + * If "A" is under oom-kill, oom-kill in 01 and 02 should wait.
> + * (task in 01/02 can be killed.)
> + * But if 01 is under oom-kill, 02's oom-kill is independent from it.
> + */
> + prepare_to_wait(&memcg_oom_waitq, &wait, TASK_INTERRUPTIBLE);
> + mem_cgroup_walk_tree(mem, &oom_count, set_memcg_oom_cb);
> + /* Am I the 1st oom killer in this sub hierarchy ? */
> + if (oom_count == 1) {
> + finish_wait(&memcg_oom_waitq, &wait);
> + mem_cgroup_out_of_memory(mem, mask);
> + mem_cgroup_walk_tree(mem, NULL, unset_memcg_oom_cb);
I think we need to call memcg_oom_wake() here. Some contexts might have gone to sleep already,
but the other callers of memcg_oom_wake() call it after checking memcg_under_oom(),
so if we don't wake them up here, they will continue to sleep, IIUC.
Thanks,
Daisuke Nishimura.
On Fri, 26 Feb 2010 13:15:52 +0900
Daisuke Nishimura <[email protected]> wrote:
> On Wed, 24 Feb 2010 16:59:21 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
> > These are dump of patches just for showing concept, what I want to do.
> > But not tested. please see if you have free time. (you can ignore ;)
> >
> > Anyway, this will HUNK to the latest mmotm, Kirill's work is merged.
> >
> > This is not related to David's work. I don't hesitate to rebase mine
> > to the mmotm if his one is merged, it's easy.
> > But I'm not sure his one goes to mm soon.
> >
> > 1st patch is for better handling oom-kill under memcg.
> It's bigger than I expected, but it basically looks good to me.
>
> > 2nd patch is oom-notifier and oom-kill-disable for memcg knob.
> >
> This feature is very atractive.
>
>
> One comment to this patch for now.
>
> > +/*
> > + * Check there are ongoing oom-kill in this hierarchy or not.
> > + * If now under oom-kill, wait for some event to restart job.
> > + */
> > +static bool memcg_handle_oom(struct mem_cgroup *mem, gfp_t mask)
> > +{
> > + int oom_count = 0;
> > + DEFINE_WAIT(wait);
> > + /*
> > + * Considering hierarchy (below)
> > + * /A
> > + * /01
> > + * /02
> > + * If 01 or 02 is under oom-kill, oom-kill in A should wait.
> > + * If "A" is under oom-kill, oom-kill in 01 and 02 should wait.
> > + * (task in 01/02 can be killed.)
> > + * But if 01 is under oom-kill, 02's oom-kill is independent from it.
> > + */
> > + prepare_to_wait(&memcg_oom_waitq, &wait, TASK_INTERRUPTIBLE);
> > + mem_cgroup_walk_tree(mem, &oom_count, set_memcg_oom_cb);
> > + /* Am I the 1st oom killer in this sub hierarchy ? */
> > + if (oom_count == 1) {
> > + finish_wait(&memcg_oom_waitq, &wait);
> > + mem_cgroup_out_of_memory(mem, mask);
> > + mem_cgroup_walk_tree(mem, NULL, unset_memcg_oom_cb);
> I think we need call memcg_oom_wake() here. Some contexts might have slept already,
> but other callers of memcg_oom_wake() calle it after checking memcg_under_oom(),
> so if we don't wake them up here, they continue to sleep, IIUC.
>
Yes, it's a problem.
My 1st expectation was "if some process is killed, uncharge will be called",
so I didn't add memcg_oom_wake() here.
But in this patch it's broken because I clear the flag.
Maybe it's better to have 2 flags:
- a flag for "there are waiters"
- a flag for "in OOM"
Or clear the "there are waiters" flag only when we really call wakeup.
Please let me do a 2nd trial.
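Something like the following rough sketch (names are placeholders, counting
waiters instead of a single flag; this is not the actual 2nd trial):

        /* in struct mem_cgroup */
        atomic_t oom_count;     /* "in OOM" */
        atomic_t oom_waiters;   /* "there are waiters" */

static void memcg_oom_wait(struct mem_cgroup *mem)
{
        DEFINE_WAIT(wait);

        atomic_inc(&mem->oom_waiters);
        prepare_to_wait(&memcg_oom_waitq, &wait, TASK_INTERRUPTIBLE);
        if (memcg_under_oom(mem))
                schedule();
        finish_wait(&memcg_oom_waitq, &wait);
        atomic_dec(&mem->oom_waiters);
}

static void memcg_oom_wake(struct mem_cgroup *mem)
{
        /* called after the oom-killer finishes, too, so that waiters who
           went to sleep before the flag was cleared are not lost */
        if (atomic_read(&mem->oom_waiters))
                wake_up_all(&memcg_oom_waitq);
}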
Thanks,
-Kame
On Fri, 26 Feb 2010 13:15:52 +0900
Daisuke Nishimura <[email protected]> wrote:
> On Wed, 24 Feb 2010 16:59:21 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
> > These are dump of patches just for showing concept, what I want to do.
> > But not tested. please see if you have free time. (you can ignore ;)
> >
> > Anyway, this will HUNK to the latest mmotm, Kirill's work is merged.
> >
> > This is not related to David's work. I don't hesitate to rebase mine
> > to the mmotm if his one is merged, it's easy.
> > But I'm not sure his one goes to mm soon.
> >
> > 1st patch is for better handling oom-kill under memcg.
> It's bigger than I expected, but it basically looks good to me.
>
BTW, do you think we need a quick fix? I can't think of a very easy/small fix
which is really correct...
Thanks,
-Kame
On Fri, 26 Feb 2010 14:23:39 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
> On Fri, 26 Feb 2010 13:15:52 +0900
> Daisuke Nishimura <[email protected]> wrote:
>
> > On Wed, 24 Feb 2010 16:59:21 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
> > > These are dump of patches just for showing concept, what I want to do.
> > > But not tested. please see if you have free time. (you can ignore ;)
> > >
> > > Anyway, this will HUNK to the latest mmotm, Kirill's work is merged.
> > >
> > > This is not related to David's work. I don't hesitate to rebase mine
> > > to the mmotm if his one is merged, it's easy.
> > > But I'm not sure his one goes to mm soon.
> > >
> > > 1st patch is for better handling oom-kill under memcg.
> > It's bigger than I expected, but it basically looks good to me.
> >
>
> BTW, do you think we need quick fix ? I can't think of a very easy/small fix
> which is very correct...
To be honest, yes.
IMHO, causing a global oom because of a memcg's oom is very bad behavior
in the sense of resource isolation. But I agree it's hard to fix in a simple way,
so let's leave it as it is for now..
hmm.. I must admit that I haven't done enough oom testing under very high pressure.
Thanks,
Daisuke Nishimura.
On Fri, 26 Feb 2010 14:47:52 +0900
Daisuke Nishimura <[email protected]> wrote:
> On Fri, 26 Feb 2010 14:23:39 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
> > > > 1st patch is for better handling oom-kill under memcg.
> > > It's bigger than I expected, but it basically looks good to me.
> > >
> >
> > BTW, do you think we need quick fix ? I can't think of a very easy/small fix
> > which is very correct...
> To be honest, yes.
Okay. The following is a candidate we can have. This will be incomplete until
we have an oom notifier for memcg, but it may be better than mis-firing
pagefault_out_of_memory. Nishimura-san, what do you think of this?
(Added Andrew to CC.)
==
From: KAMEZAWA Hiroyuki <[email protected]>
In the current page-fault code,
handle_mm_fault()
-> ...
-> mem_cgroup_charge()
-> map page or handle error.
-> check return code.
If the page fault's return code is VM_FAULT_OOM, pagefault_out_of_memory()
is called. But if it's caused by memcg, the OOM killer should already have been
invoked.
Then, I added a patch: a636b327f731143ccc544b966cfd8de6cb6d72c6
That patch records last_oom_jiffies for memcg's sub-hierarchy and
prevents pagefault_out_of_memory from being invoked in the near future.
But Nishimura-san reported that the jiffies check is not enough
when the system is terribly heavy.
This patch changes memcg's oom logic as follows:
* If memcg causes an OOM-kill, continue to retry.
* memcg hangs when there are no tasks to be killed.
* Remove the jiffies check which is used now.
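
For reference, the retry path this adds in __mem_cgroup_try_charge() looks
roughly like the following (a condensed sketch only; charge_succeeded() and
reclaim_from() are placeholders, the real change is in the diff below):

	while (!charge_succeeded(mem)) {
		reclaim_from(mem_over_limit, gfp_mask);
		if (nr_retries--)
			continue;	/* keep reclaiming */
		if (!oom)
			goto nomem;	/* caller tolerates failure */
		mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
		if (test_thread_flag(TIF_MEMDIE)) {
			/* we were chosen as the victim: bypass the charge
			 * so this task can exit quickly */
			css_put(&mem->css);
			*memcg = NULL;
			return 0;
		}
		schedule_timeout(1);	/* let the victim run and exit */
		nr_retries = MEM_CGROUP_RECLAIM_RETRIES;	/* retry, never -ENOMEM */
	}
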
TODO:
* Add an oom notifier for informing the management daemon.
* More clever sleep logic to avoid using too much CPU.
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
include/linux/memcontrol.h | 6 ----
mm/memcontrol.c | 56 ++++++++++++++++-----------------------------
mm/oom_kill.c | 28 ++++++++++++----------
3 files changed, 37 insertions(+), 53 deletions(-)
Index: mmotm-2.6.33-Feb11/include/linux/memcontrol.h
===================================================================
--- mmotm-2.6.33-Feb11.orig/include/linux/memcontrol.h
+++ mmotm-2.6.33-Feb11/include/linux/memcontrol.h
@@ -124,7 +124,6 @@ static inline bool mem_cgroup_disabled(v
return false;
}
-extern bool mem_cgroup_oom_called(struct task_struct *task);
void mem_cgroup_update_file_mapped(struct page *page, int val);
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask, int nid,
@@ -258,11 +257,6 @@ static inline bool mem_cgroup_disabled(v
return true;
}
-static inline bool mem_cgroup_oom_called(struct task_struct *task)
-{
- return false;
-}
-
static inline int
mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
{
Index: mmotm-2.6.33-Feb11/mm/memcontrol.c
===================================================================
--- mmotm-2.6.33-Feb11.orig/mm/memcontrol.c
+++ mmotm-2.6.33-Feb11/mm/memcontrol.c
@@ -200,7 +200,6 @@ struct mem_cgroup {
* Should the accounting and control be hierarchical, per subtree?
*/
bool use_hierarchy;
- unsigned long last_oom_jiffies;
atomic_t refcnt;
unsigned int swappiness;
@@ -1234,34 +1233,6 @@ static int mem_cgroup_hierarchical_recla
return total;
}
-bool mem_cgroup_oom_called(struct task_struct *task)
-{
- bool ret = false;
- struct mem_cgroup *mem;
- struct mm_struct *mm;
-
- rcu_read_lock();
- mm = task->mm;
- if (!mm)
- mm = &init_mm;
- mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
- if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
- ret = true;
- rcu_read_unlock();
- return ret;
-}
-
-static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
-{
- mem->last_oom_jiffies = jiffies;
- return 0;
-}
-
-static void record_last_oom(struct mem_cgroup *mem)
-{
- mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
-}
-
/*
* Currently used to update mapped file statistics, but the routine can be
* generalized to update other statistics as well.
@@ -1549,11 +1520,27 @@ static int __mem_cgroup_try_charge(struc
}
if (!nr_retries--) {
- if (oom) {
- mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
- record_last_oom(mem_over_limit);
+ if (!oom)
+ goto nomem;
+ mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
+ /*
+ * If we killed someone, we can retry. If we killed ourselves,
+ * allow going ahead by force.
+ *
+ * Note: There may be a case where we can never kill any
+ * process under us (because of OOM_DISABLE). But in that
+ * case, if we return -ENOMEM, pagefault_out_of_memory
+ * will kill someone innocent outside this memcg.
+ * So, all we can do is just try harder..
+ */
+ if (test_thread_flag(TIF_MEMDIE)) {
+ css_put(&mem->css);
+ *memcg = NULL;
+ return 0;
}
- goto nomem;
+ /* give the victim a chance to run */
+ schedule_timeout(1);
+ nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
}
}
if (csize > PAGE_SIZE)
@@ -2408,8 +2395,7 @@ void mem_cgroup_end_migration(struct mem
/*
* A call to try to shrink memory usage on charge failure at shmem's swapin.
- * Calling hierarchical_reclaim is not enough because we should update
- * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
+ * Calling hierarchical_reclaim is not enough. We may have to call OOM.
* Moreover considering hierarchy, we should reclaim from the mem_over_limit,
* not from the memcg which this page would be charged to.
* try_charge_swapin does all of these works properly.
Index: mmotm-2.6.33-Feb11/mm/oom_kill.c
===================================================================
--- mmotm-2.6.33-Feb11.orig/mm/oom_kill.c
+++ mmotm-2.6.33-Feb11/mm/oom_kill.c
@@ -466,27 +466,39 @@ static int oom_kill_process(struct task_
}
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+/*
+ * When select_bad_process() can't find a proper process and we failed to
+ * kill current, returns 0 as failure of OOM-kill. Otherwise, returns 1.
+ */
void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
{
unsigned long points = 0;
struct task_struct *p;
+ int not_found = 0;
if (sysctl_panic_on_oom == 2)
panic("out of memory(memcg). panic_on_oom is selected.\n");
read_lock(&tasklist_lock);
retry:
+ not_found = 0;
p = select_bad_process(&points, mem);
if (PTR_ERR(p) == -1UL)
goto out;
-
- if (!p)
+ if (!p) {
+ not_found = 1;
p = current;
+ printk(KERN_ERR "It seems there are no killable processes "
+ "under memcg in OOM. Try to kill current\n");
+ }
if (oom_kill_process(p, gfp_mask, 0, points, mem,
- "Memory cgroup out of memory"))
- goto retry;
+ "Memory cgroup out of memory")) {
+ if (!not_found) /* some race with OOM_DISABLE etc ? */
+ goto retry;
+ }
out:
read_unlock(&tasklist_lock);
+ /* Even if we don't kill any, give a chance to try to reclaim more */
}
#endif
@@ -601,13 +613,6 @@ void pagefault_out_of_memory(void)
/* Got some memory back in the last second. */
return;
- /*
- * If this is from memcg, oom-killer is already invoked.
- * and not worth to go system-wide-oom.
- */
- if (mem_cgroup_oom_called(current))
- goto rest_and_return;
-
if (sysctl_panic_on_oom)
panic("out of memory from page fault. panic_on_oom is selected.\n");
@@ -619,7 +624,6 @@ void pagefault_out_of_memory(void)
* Give "p" a good chance of killing itself before we
* retry to allocate memory.
*/
-rest_and_return:
if (!test_thread_flag(TIF_MEMDIE))
schedule_timeout_uninterruptible(1);
}
On Fri, 26 Feb 2010 16:17:52 +0900
KAMEZAWA Hiroyuki <[email protected]> wrote:
> On Fri, 26 Feb 2010 14:47:52 +0900
> Daisuke Nishimura <[email protected]> wrote:
>
> > On Fri, 26 Feb 2010 14:23:39 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
>
> > > > > 1st patch is for better handling oom-kill under memcg.
> > > > It's bigger than I expected, but it basically looks good to me.
> > > >
> > >
> > > BTW, do you think we need quick fix ? I can't think of a very easy/small fix
> > > which is very correct...
> > To be honest, yes.
>
> Okay. following is a candidate we can have. This will be incomplete until
> we have oom notifier for memcg but may be better than miss-firing
> page_fault_out_of_memory. Nishimura-san, how do you think this ?
> (Added Andrew to CC.)
>
Thank you very much for your patch.
I agree it's enough for a quick fix, and it seems to work.
> ==
>
> From: KAMEZAWA Hiroyuk <[email protected]>
>
> In current page-fault code,
>
> handle_mm_fault()
> -> ...
> -> mem_cgroup_charge()
> -> map page or handle error.
> -> check return code.
>
> If page fault's return code is VM_FAULT_OOM, page_fault_out_of_memory()
> is called. But if it's caused by memcg, OOM should have been already
> invoked.
> Then, I added a patch: a636b327f731143ccc544b966cfd8de6cb6d72c6
>
> That patch records last_oom_jiffies for memcg's sub-hierarchy and
> prevents page_fault_out_of_memory from being invoked in near future.
>
> But Nishimura-san reported that check by jiffies is not enough
> when the system is terribly heavy.
>
> This patch changes memcg's oom logic as.
> * If memcg causes OOM-kill, continue to retry.
> * memcg hangs when there are no task to be killed.
IIUC, this behavior is the same as the current behavior: mem_cgroup_out_of_memory()
hangs if all of the tasks in the cgroup are OOM_DISABLE'ed.
> * remove jiffies check which is used now.
>
> TODO:
> * add oom notifier for informing management daemon.
> * more clever sleep logic for avoiding to use much CPU.
>
> Signed-off-by: KAMEZAWA Hiroyuk <[email protected]>
> ---
> include/linux/memcontrol.h | 6 ----
> mm/memcontrol.c | 56 ++++++++++++++++-----------------------------
> mm/oom_kill.c | 28 ++++++++++++----------
> 3 files changed, 37 insertions(+), 53 deletions(-)
>
> Index: mmotm-2.6.33-Feb11/include/linux/memcontrol.h
> ===================================================================
> --- mmotm-2.6.33-Feb11.orig/include/linux/memcontrol.h
> +++ mmotm-2.6.33-Feb11/include/linux/memcontrol.h
> @@ -124,7 +124,6 @@ static inline bool mem_cgroup_disabled(v
> return false;
> }
>
> -extern bool mem_cgroup_oom_called(struct task_struct *task);
> void mem_cgroup_update_file_mapped(struct page *page, int val);
> unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
> gfp_t gfp_mask, int nid,
> @@ -258,11 +257,6 @@ static inline bool mem_cgroup_disabled(v
> return true;
> }
>
> -static inline bool mem_cgroup_oom_called(struct task_struct *task)
> -{
> - return false;
> -}
> -
> static inline int
> mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
> {
> Index: mmotm-2.6.33-Feb11/mm/memcontrol.c
> ===================================================================
> --- mmotm-2.6.33-Feb11.orig/mm/memcontrol.c
> +++ mmotm-2.6.33-Feb11/mm/memcontrol.c
> @@ -200,7 +200,6 @@ struct mem_cgroup {
> * Should the accounting and control be hierarchical, per subtree?
> */
> bool use_hierarchy;
> - unsigned long last_oom_jiffies;
> atomic_t refcnt;
>
> unsigned int swappiness;
> @@ -1234,34 +1233,6 @@ static int mem_cgroup_hierarchical_recla
> return total;
> }
>
> -bool mem_cgroup_oom_called(struct task_struct *task)
> -{
> - bool ret = false;
> - struct mem_cgroup *mem;
> - struct mm_struct *mm;
> -
> - rcu_read_lock();
> - mm = task->mm;
> - if (!mm)
> - mm = &init_mm;
> - mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
> - if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
> - ret = true;
> - rcu_read_unlock();
> - return ret;
> -}
> -
> -static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
> -{
> - mem->last_oom_jiffies = jiffies;
> - return 0;
> -}
> -
> -static void record_last_oom(struct mem_cgroup *mem)
> -{
> - mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
> -}
> -
> /*
> * Currently used to update mapped file statistics, but the routine can be
> * generalized to update other statistics as well.
> @@ -1549,11 +1520,27 @@ static int __mem_cgroup_try_charge(struc
> }
>
> if (!nr_retries--) {
> - if (oom) {
> - mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
> - record_last_oom(mem_over_limit);
> + if (!oom)
> + goto nomem;
> + mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
> + /*
> + * If killed someone, we can retry. If killed myself,
> + * allow to go ahead in force.
> + *
> + * Note: There may be a case we can never kill any
> + * processes under us.(by OOM_DISABLE) But, in that
> + * case, if we return -ENOMEM, pagefault_out_of_memory
> + * will kill someone innocent, out of this memcg.
> + * So, what we can do is just try harder..
> + */
> + if (test_thread_flag(TIF_MEMDIE)) {
Is "if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))" better
to accept SIGKILL ?
> + css_put(&mem->css);
> + *memcg = NULL;
> + return 0;
> }
> - goto nomem;
> + /* give chance to run */
> + schedule_timeout(1);
> + nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
> }
> }
> if (csize > PAGE_SIZE)
> @@ -2408,8 +2395,7 @@ void mem_cgroup_end_migration(struct mem
>
> /*
> * A call to try to shrink memory usage on charge failure at shmem's swapin.
> - * Calling hierarchical_reclaim is not enough because we should update
> - * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
> + * Calling hierarchical_reclaim is not enough. We may have to call OOM.
> * Moreover considering hierarchy, we should reclaim from the mem_over_limit,
> * not from the memcg which this page would be charged to.
> * try_charge_swapin does all of these works properly.
> Index: mmotm-2.6.33-Feb11/mm/oom_kill.c
> ===================================================================
> --- mmotm-2.6.33-Feb11.orig/mm/oom_kill.c
> +++ mmotm-2.6.33-Feb11/mm/oom_kill.c
> @@ -466,27 +466,39 @@ static int oom_kill_process(struct task_
> }
>
> #ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +/*
> + * When select_bad_process() can't find proper process and we failed to
> + * kill current, returns 0 as faiulre of OOM-kill. Otherwise, returns 1.
> + */
Hmm, which function does this comment describe?
mem_cgroup_out_of_memory() returns void.
Thanks,
Daisuke Nishimura.
> void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
> {
> unsigned long points = 0;
> struct task_struct *p;
> + int not_found = 0;
>
> if (sysctl_panic_on_oom == 2)
> panic("out of memory(memcg). panic_on_oom is selected.\n");
> read_lock(&tasklist_lock);
> retry:
> + not_found = 0;
> p = select_bad_process(&points, mem);
> if (PTR_ERR(p) == -1UL)
> goto out;
> -
> - if (!p)
> + if (!p) {
> + not_found = 1;
> p = current;
> + printk(KERN_ERR "It seems there are no killable processes "
> + "under memcg in OOM. Try to kill current\n");
> + }
>
> if (oom_kill_process(p, gfp_mask, 0, points, mem,
> - "Memory cgroup out of memory"))
> - goto retry;
> + "Memory cgroup out of memory")) {
> + if (!not_found) /* some race with OOM_DISABLE etc ? */
> + goto retry;
> + }
> out:
> read_unlock(&tasklist_lock);
> + /* Even if we don't kill any, give chance to try to recalim more */
> }
> #endif
>
> @@ -601,13 +613,6 @@ void pagefault_out_of_memory(void)
> /* Got some memory back in the last second. */
> return;
>
> - /*
> - * If this is from memcg, oom-killer is already invoked.
> - * and not worth to go system-wide-oom.
> - */
> - if (mem_cgroup_oom_called(current))
> - goto rest_and_return;
> -
> if (sysctl_panic_on_oom)
> panic("out of memory from page fault. panic_on_oom is selected.\n");
>
> @@ -619,7 +624,6 @@ void pagefault_out_of_memory(void)
> * Give "p" a good chance of killing itself before we
> * retry to allocate memory.
> */
> -rest_and_return:
> if (!test_thread_flag(TIF_MEMDIE))
> schedule_timeout_uninterruptible(1);
> }
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to [email protected]. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
>
On Fri, 26 Feb 2010 21:30:15 +0900
Daisuke Nishimura <[email protected]> wrote:
> On Fri, 26 Feb 2010 16:17:52 +0900
> KAMEZAWA Hiroyuki <[email protected]> wrote:
> > Okay. following is a candidate we can have. This will be incomplete until
> > we have oom notifier for memcg but may be better than miss-firing
> > page_fault_out_of_memory. Nishimura-san, how do you think this ?
> > (Added Andrew to CC.)
> >
> Thank you very much for your patch.
> I agree it's enough for quick fix, and it seems to work.
>
> > ==
> >
> > From: KAMEZAWA Hiroyuk <[email protected]>
> >
> > In current page-fault code,
> >
> > handle_mm_fault()
> > -> ...
> > -> mem_cgroup_charge()
> > -> map page or handle error.
> > -> check return code.
> >
> > If page fault's return code is VM_FAULT_OOM, page_fault_out_of_memory()
> > is called. But if it's caused by memcg, OOM should have been already
> > invoked.
> > Then, I added a patch: a636b327f731143ccc544b966cfd8de6cb6d72c6
> >
> > That patch records last_oom_jiffies for memcg's sub-hierarchy and
> > prevents page_fault_out_of_memory from being invoked in near future.
> >
> > But Nishimura-san reported that check by jiffies is not enough
> > when the system is terribly heavy.
> >
> > This patch changes memcg's oom logic as.
> > * If memcg causes OOM-kill, continue to retry.
> > * memcg hangs when there are no task to be killed.
> IIUC, this behavior is the same as current behavior. mem_cgroup_out_of_memory()
> hangs if all of the tasks in the cgroup are OOM_DISABLE'ed.
>
Ah, yes. I'll add that comment.
> > * remove jiffies check which is used now.
> >
> > TODO:
> > * add oom notifier for informing management daemon.
> > * more clever sleep logic for avoiding to use much CPU.
> >
> > Signed-off-by: KAMEZAWA Hiroyuk <[email protected]>
> > ---
> > include/linux/memcontrol.h | 6 ----
> > mm/memcontrol.c | 56 ++++++++++++++++-----------------------------
> > mm/oom_kill.c | 28 ++++++++++++----------
> > 3 files changed, 37 insertions(+), 53 deletions(-)
> >
> > Index: mmotm-2.6.33-Feb11/include/linux/memcontrol.h
> > ===================================================================
> > --- mmotm-2.6.33-Feb11.orig/include/linux/memcontrol.h
> > +++ mmotm-2.6.33-Feb11/include/linux/memcontrol.h
> > @@ -124,7 +124,6 @@ static inline bool mem_cgroup_disabled(v
> > return false;
> > }
> >
> > -extern bool mem_cgroup_oom_called(struct task_struct *task);
> > void mem_cgroup_update_file_mapped(struct page *page, int val);
> > unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
> > gfp_t gfp_mask, int nid,
> > @@ -258,11 +257,6 @@ static inline bool mem_cgroup_disabled(v
> > return true;
> > }
> >
> > -static inline bool mem_cgroup_oom_called(struct task_struct *task)
> > -{
> > - return false;
> > -}
> > -
> > static inline int
> > mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
> > {
> > Index: mmotm-2.6.33-Feb11/mm/memcontrol.c
> > ===================================================================
> > --- mmotm-2.6.33-Feb11.orig/mm/memcontrol.c
> > +++ mmotm-2.6.33-Feb11/mm/memcontrol.c
> > @@ -200,7 +200,6 @@ struct mem_cgroup {
> > * Should the accounting and control be hierarchical, per subtree?
> > */
> > bool use_hierarchy;
> > - unsigned long last_oom_jiffies;
> > atomic_t refcnt;
> >
> > unsigned int swappiness;
> > @@ -1234,34 +1233,6 @@ static int mem_cgroup_hierarchical_recla
> > return total;
> > }
> >
> > -
> > /*
> > * Currently used to update mapped file statistics, but the routine can be
> > * generalized to update other statistics as well.
> > @@ -1549,11 +1520,27 @@ static int __mem_cgroup_try_charge(struc
> > }
> >
> > if (!nr_retries--) {
> > - if (oom) {
> > - mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
> > - record_last_oom(mem_over_limit);
> > + if (!oom)
> > + goto nomem;
> > + mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
> > + /*
> > + * If killed someone, we can retry. If killed myself,
> > + * allow to go ahead in force.
> > + *
> > + * Note: There may be a case we can never kill any
> > + * processes under us.(by OOM_DISABLE) But, in that
> > + * case, if we return -ENOMEM, pagefault_out_of_memory
> > + * will kill someone innocent, out of this memcg.
> > + * So, what we can do is just try harder..
> > + */
> > + if (test_thread_flag(TIF_MEMDIE)) {
> Is "if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))" better
> to accept SIGKILL ?
>
Hmm, ok, I will add that. One problem with TIF_MEMDIE is that it's set on a thread,
not on a process.
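So, with your suggestion, the check above would become roughly this (a rough
sketch, just the condition changed; whether checking only current is enough is
still the question):

	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) {
		css_put(&mem->css);
		*memcg = NULL;
		return 0;
	}
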
> > + css_put(&mem->css);
> > + *memcg = NULL;
> > + return 0;
> > }
> > - goto nomem;
> > + /* give chance to run */
> > + schedule_timeout(1);
> > + nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
> > }
> > }
> > if (csize > PAGE_SIZE)
> > @@ -2408,8 +2395,7 @@ void mem_cgroup_end_migration(struct mem
> >
> > /*
> > * A call to try to shrink memory usage on charge failure at shmem's swapin.
> > - * Calling hierarchical_reclaim is not enough because we should update
> > - * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
> > + * Calling hierarchical_reclaim is not enough. We may have to call OOM.
> > * Moreover considering hierarchy, we should reclaim from the mem_over_limit,
> > * not from the memcg which this page would be charged to.
> > * try_charge_swapin does all of these works properly.
> > Index: mmotm-2.6.33-Feb11/mm/oom_kill.c
> > ===================================================================
> > --- mmotm-2.6.33-Feb11.orig/mm/oom_kill.c
> > +++ mmotm-2.6.33-Feb11/mm/oom_kill.c
> > @@ -466,27 +466,39 @@ static int oom_kill_process(struct task_
> > }
> >
> > #ifdef CONFIG_CGROUP_MEM_RES_CTLR
> > +/*
> > + * When select_bad_process() can't find proper process and we failed to
> > + * kill current, returns 0 as faiulre of OOM-kill. Otherwise, returns 1.
> > + */
> hmm, what function does this comment describe ?
> mem_cgroup_out_of_memory() returns void.
>
Ahhh, I'll remove this. This is garbage.
I'll clean this up and post it as a quick fix.
Thanks,
-Kame
>
> Thanks,
> Daisuke Nishimura.
>
> > void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
> > {
> > unsigned long points = 0;
> > struct task_struct *p;
> > + int not_found = 0;
> >
> > if (sysctl_panic_on_oom == 2)
> > panic("out of memory(memcg). panic_on_oom is selected.\n");
> > read_lock(&tasklist_lock);
> > retry:
> > + not_found = 0;
> > p = select_bad_process(&points, mem);
> > if (PTR_ERR(p) == -1UL)
> > goto out;
> > -
> > - if (!p)
> > + if (!p) {
> > + not_found = 1;
> > p = current;
> > + printk(KERN_ERR "It seems there are no killable processes "
> > + "under memcg in OOM. Try to kill current\n");
> > + }
> >
> > if (oom_kill_process(p, gfp_mask, 0, points, mem,
> > - "Memory cgroup out of memory"))
> > - goto retry;
> > + "Memory cgroup out of memory")) {
> > + if (!not_found) /* some race with OOM_DISABLE etc ? */
> > + goto retry;
> > + }
> > out:
> > read_unlock(&tasklist_lock);
> > + /* Even if we don't kill any, give chance to try to recalim more */
> > }
> > #endif
> >
> > @@ -601,13 +613,6 @@ void pagefault_out_of_memory(void)
> > /* Got some memory back in the last second. */
> > return;
> >
> > - /*
> > - * If this is from memcg, oom-killer is already invoked.
> > - * and not worth to go system-wide-oom.
> > - */
> > - if (mem_cgroup_oom_called(current))
> > - goto rest_and_return;
> > -
> > if (sysctl_panic_on_oom)
> > panic("out of memory from page fault. panic_on_oom is selected.\n");
> >
> > @@ -619,7 +624,6 @@ void pagefault_out_of_memory(void)
> > * Give "p" a good chance of killing itself before we
> > * retry to allocate memory.
> > */
> > -rest_and_return:
> > if (!test_thread_flag(TIF_MEMDIE))
> > schedule_timeout_uninterruptible(1);
> > }
> >