2015-06-18 23:00:21

by David Rientjes

Subject: [patch 1/3] mm, oom: organize oom context into struct

There are essential elements to an oom context that are passed around to
multiple functions.

Organize these elements into a new struct, struct oom_context, that
specifies the context for an oom condition.

This patch introduces no functional change.
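
As a quick illustration (not part of the patch itself), the shape of the
conversion at a call site such as __alloc_pages_may_oom() is:

/* before: five loose parameters threaded through every layer */
out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false);

/* after: one on-stack context, initialized once at the boundary */
struct oom_control oc = {
        .zonelist = ac->zonelist,
        .nodemask = ac->nodemask,
        .gfp_mask = gfp_mask,
        .order = order,
        .force_kill = false,
};
out_of_memory(&oc);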

Signed-off-by: David Rientjes <[email protected]>
---
drivers/tty/sysrq.c |  12 +++++-
include/linux/oom.h |  25 +++++++-----
mm/memcontrol.c     |  16 +++++---
mm/oom_kill.c       | 115 ++++++++++++++++++++++++----------------------------
mm/page_alloc.c     |  10 ++++-
5 files changed, 98 insertions(+), 80 deletions(-)

diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -353,9 +353,17 @@ static struct sysrq_key_op sysrq_term_op = {

static void moom_callback(struct work_struct *ignored)
{
+ const gfp_t gfp_mask = GFP_KERNEL;
+ struct oom_control oc = {
+ .zonelist = node_zonelist(first_memory_node, gfp_mask),
+ .nodemask = NULL,
+ .gfp_mask = gfp_mask,
+ .order = 0,
+ .force_kill = true,
+ };
+
mutex_lock(&oom_lock);
- if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL),
- GFP_KERNEL, 0, NULL, true))
+ if (!out_of_memory(&oc))
pr_info("OOM request ignored because killer is disabled\n");
mutex_unlock(&oom_lock);
}
diff --git a/include/linux/oom.h b/include/linux/oom.h
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -12,6 +12,14 @@ struct notifier_block;
struct mem_cgroup;
struct task_struct;

+struct oom_control {
+ struct zonelist *zonelist;
+ nodemask_t *nodemask;
+ gfp_t gfp_mask;
+ int order;
+ bool force_kill;
+};
+
/*
* Types of limitations to the nodes from which allocations may occur
*/
@@ -57,21 +65,18 @@ extern unsigned long oom_badness(struct task_struct *p,

extern int oom_kills_count(void);
extern void note_oom_kill(void);
-extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+extern void oom_kill_process(struct oom_control *oc, struct task_struct *p,
unsigned int points, unsigned long totalpages,
- struct mem_cgroup *memcg, nodemask_t *nodemask,
- const char *message);
+ struct mem_cgroup *memcg, const char *message);

-extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
- int order, const nodemask_t *nodemask,
+extern void check_panic_on_oom(struct oom_control *oc,
+ enum oom_constraint constraint,
struct mem_cgroup *memcg);

-extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
- unsigned long totalpages, const nodemask_t *nodemask,
- bool force_kill);
+extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
+ struct task_struct *task, unsigned long totalpages);

-extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
- int order, nodemask_t *mask, bool force_kill);
+extern bool out_of_memory(struct oom_control *oc);

extern void exit_oom_victim(void);

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1545,6 +1545,13 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
int order)
{
+ struct oom_control oc = {
+ .zonelist = NULL,
+ .nodemask = NULL,
+ .gfp_mask = gfp_mask,
+ .order = order,
+ .force_kill = false,
+ };
struct mem_cgroup *iter;
unsigned long chosen_points = 0;
unsigned long totalpages;
@@ -1563,7 +1570,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
goto unlock;
}

- check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
+ check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
totalpages = mem_cgroup_get_limit(memcg) ? : 1;
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
@@ -1571,8 +1578,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,

css_task_iter_start(&iter->css, &it);
while ((task = css_task_iter_next(&it))) {
- switch (oom_scan_process_thread(task, totalpages, NULL,
- false)) {
+ switch (oom_scan_process_thread(&oc, task, totalpages)) {
case OOM_SCAN_SELECT:
if (chosen)
put_task_struct(chosen);
@@ -1610,8 +1616,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,

if (chosen) {
points = chosen_points * 1000 / totalpages;
- oom_kill_process(chosen, gfp_mask, order, points, totalpages,
- memcg, NULL, "Memory cgroup out of memory");
+ oom_kill_process(&oc, chosen, points, totalpages, memcg,
+ "Memory cgroup out of memory");
}
unlock:
mutex_unlock(&oom_lock);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -196,27 +196,26 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
* Determine the type of allocation constraint.
*/
#ifdef CONFIG_NUMA
-static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
- gfp_t gfp_mask, nodemask_t *nodemask,
- unsigned long *totalpages)
+static enum oom_constraint constrained_alloc(struct oom_control *oc,
+ unsigned long *totalpages)
{
struct zone *zone;
struct zoneref *z;
- enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
bool cpuset_limited = false;
int nid;

/* Default to all available memory */
*totalpages = totalram_pages + total_swap_pages;

- if (!zonelist)
+ if (!oc->zonelist)
return CONSTRAINT_NONE;
/*
* Reach here only when __GFP_NOFAIL is used. So, we should avoid
* to kill current.We have to random task kill in this case.
* Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
*/
- if (gfp_mask & __GFP_THISNODE)
+ if (oc->gfp_mask & __GFP_THISNODE)
return CONSTRAINT_NONE;

/*
@@ -224,17 +223,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
* the page allocator means a mempolicy is in effect. Cpuset policy
* is enforced in get_page_from_freelist().
*/
- if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) {
+ if (oc->nodemask &&
+ !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
*totalpages = total_swap_pages;
- for_each_node_mask(nid, *nodemask)
+ for_each_node_mask(nid, *oc->nodemask)
*totalpages += node_spanned_pages(nid);
return CONSTRAINT_MEMORY_POLICY;
}

/* Check this allocation failure is caused by cpuset's wall function */
- for_each_zone_zonelist_nodemask(zone, z, zonelist,
- high_zoneidx, nodemask)
- if (!cpuset_zone_allowed(zone, gfp_mask))
+ for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
+ high_zoneidx, oc->nodemask)
+ if (!cpuset_zone_allowed(zone, oc->gfp_mask))
cpuset_limited = true;

if (cpuset_limited) {
@@ -246,20 +246,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
return CONSTRAINT_NONE;
}
#else
-static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
- gfp_t gfp_mask, nodemask_t *nodemask,
- unsigned long *totalpages)
+static enum oom_constraint constrained_alloc(struct oom_control *oc,
+ unsigned long *totalpages)
{
*totalpages = totalram_pages + total_swap_pages;
return CONSTRAINT_NONE;
}
#endif

-enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
- unsigned long totalpages, const nodemask_t *nodemask,
- bool force_kill)
+enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
+ struct task_struct *task, unsigned long totalpages)
{
- if (oom_unkillable_task(task, NULL, nodemask))
+ if (oom_unkillable_task(task, NULL, oc->nodemask))
return OOM_SCAN_CONTINUE;

/*
@@ -267,7 +265,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
* Don't allow any other task to have access to the reserves.
*/
if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
- if (!force_kill)
+ if (!oc->force_kill)
return OOM_SCAN_ABORT;
}
if (!task->mm)
@@ -280,7 +278,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
if (oom_task_origin(task))
return OOM_SCAN_SELECT;

- if (task_will_free_mem(task) && !force_kill)
+ if (task_will_free_mem(task) && !oc->force_kill)
return OOM_SCAN_ABORT;

return OOM_SCAN_OK;
@@ -289,12 +287,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
/*
* Simple selection loop. We chose the process with the highest
* number of 'points'. Returns -1 on scan abort.
- *
- * (not docbooked, we don't want this one cluttering up the manual)
*/
-static struct task_struct *select_bad_process(unsigned int *ppoints,
- unsigned long totalpages, const nodemask_t *nodemask,
- bool force_kill)
+static struct task_struct *select_bad_process(struct oom_control *oc,
+ unsigned int *ppoints, unsigned long totalpages)
{
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
@@ -304,8 +299,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
for_each_process_thread(g, p) {
unsigned int points;

- switch (oom_scan_process_thread(p, totalpages, nodemask,
- force_kill)) {
+ switch (oom_scan_process_thread(oc, p, totalpages)) {
case OOM_SCAN_SELECT:
chosen = p;
chosen_points = ULONG_MAX;
@@ -318,7 +312,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
case OOM_SCAN_OK:
break;
};
- points = oom_badness(p, NULL, nodemask, totalpages);
+ points = oom_badness(p, NULL, oc->nodemask, totalpages);
if (!points || points < chosen_points)
continue;
/* Prefer thread group leaders for display purposes */
@@ -380,13 +374,13 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
rcu_read_unlock();
}

-static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
- struct mem_cgroup *memcg, const nodemask_t *nodemask)
+static void dump_header(struct oom_control *oc, struct task_struct *p,
+ struct mem_cgroup *memcg)
{
task_lock(current);
pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
"oom_score_adj=%hd\n",
- current->comm, gfp_mask, order,
+ current->comm, oc->gfp_mask, oc->order,
current->signal->oom_score_adj);
cpuset_print_task_mems_allowed(current);
task_unlock(current);
@@ -396,7 +390,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
else
show_mem(SHOW_MEM_FILTER_NODES);
if (sysctl_oom_dump_tasks)
- dump_tasks(memcg, nodemask);
+ dump_tasks(memcg, oc->nodemask);
}

/*
@@ -487,10 +481,9 @@ void oom_killer_enable(void)
* Must be called while holding a reference to p, which will be released upon
* returning.
*/
-void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+void oom_kill_process(struct oom_control *oc, struct task_struct *p,
unsigned int points, unsigned long totalpages,
- struct mem_cgroup *memcg, nodemask_t *nodemask,
- const char *message)
+ struct mem_cgroup *memcg, const char *message)
{
struct task_struct *victim = p;
struct task_struct *child;
@@ -514,7 +507,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
task_unlock(p);

if (__ratelimit(&oom_rs))
- dump_header(p, gfp_mask, order, memcg, nodemask);
+ dump_header(oc, p, memcg);

task_lock(p);
pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
@@ -537,7 +530,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
/*
* oom_badness() returns 0 if the thread is unkillable
*/
- child_points = oom_badness(child, memcg, nodemask,
+ child_points = oom_badness(child, memcg, oc->nodemask,
totalpages);
if (child_points > victim_points) {
put_task_struct(victim);
@@ -600,8 +593,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
/*
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
-void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
- int order, const nodemask_t *nodemask,
+void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
struct mem_cgroup *memcg)
{
if (likely(!sysctl_panic_on_oom))
@@ -615,7 +607,7 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
if (constraint != CONSTRAINT_NONE)
return;
}
- dump_header(NULL, gfp_mask, order, memcg, nodemask);
+ dump_header(oc, NULL, memcg);
panic("Out of memory: %s panic_on_oom is enabled\n",
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
@@ -635,22 +627,16 @@ int unregister_oom_notifier(struct notifier_block *nb)
EXPORT_SYMBOL_GPL(unregister_oom_notifier);

/**
- * __out_of_memory - kill the "best" process when we run out of memory
- * @zonelist: zonelist pointer
- * @gfp_mask: memory allocation flags
- * @order: amount of memory being requested as a power of 2
- * @nodemask: nodemask passed to page allocator
- * @force_kill: true if a task must be killed, even if others are exiting
+ * out_of_memory - kill the "best" process when we run out of memory
+ * @oc: pointer to struct oom_control
*
* If we run out of memory, we have the choice between either
* killing a random task (bad), letting the system crash (worse)
* OR try to be smart about which process to kill. Note that we
* don't have to be perfect here, we just have to be good.
*/
-bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
- int order, nodemask_t *nodemask, bool force_kill)
+bool out_of_memory(struct oom_control *oc)
{
- const nodemask_t *mpol_mask;
struct task_struct *p;
unsigned long totalpages;
unsigned long freed = 0;
@@ -684,30 +670,29 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
* Check if there were limitations on the allocation (only relevant for
* NUMA) that may require different handling.
*/
- constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
- &totalpages);
- mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
- check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL);
+ constraint = constrained_alloc(oc, &totalpages);
+ if (constraint != CONSTRAINT_MEMORY_POLICY)
+ oc->nodemask = NULL;
+ check_panic_on_oom(oc, constraint, NULL);

if (sysctl_oom_kill_allocating_task && current->mm &&
- !oom_unkillable_task(current, NULL, nodemask) &&
+ !oom_unkillable_task(current, NULL, oc->nodemask) &&
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
get_task_struct(current);
- oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
- nodemask,
+ oom_kill_process(oc, current, 0, totalpages, NULL,
"Out of memory (oom_kill_allocating_task)");
goto out;
}

- p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
+ p = select_bad_process(oc, &points, totalpages);
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
- dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
+ dump_header(oc, NULL, NULL);
panic("Out of memory and no killable processes...\n");
}
if (p != (void *)-1UL) {
- oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
- nodemask, "Out of memory");
+ oom_kill_process(oc, p, points, totalpages, NULL,
+ "Out of memory");
killed = 1;
}
out:
@@ -728,13 +713,21 @@ out:
*/
void pagefault_out_of_memory(void)
{
+ struct oom_control oc = {
+ .zonelist = NULL,
+ .nodemask = NULL,
+ .gfp_mask = 0,
+ .order = 0,
+ .force_kill = false,
+ };
+
if (mem_cgroup_oom_synchronize(true))
return;

if (!mutex_trylock(&oom_lock))
return;

- if (!out_of_memory(NULL, 0, 0, NULL, false)) {
+ if (!out_of_memory(&oc)) {
/*
* There shouldn't be any user tasks runnable while the
* OOM killer is disabled, so the current task has to
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2680,6 +2680,13 @@ static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac, unsigned long *did_some_progress)
{
+ struct oom_control oc = {
+ .zonelist = ac->zonelist,
+ .nodemask = ac->nodemask,
+ .gfp_mask = gfp_mask,
+ .order = order,
+ .force_kill = false,
+ };
struct page *page;

*did_some_progress = 0;
@@ -2731,8 +2738,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
goto out;
}
/* Exhausted what can be done so it's blamo time */
- if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)
- || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
+ if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
*did_some_progress = 1;
out:
mutex_unlock(&oom_lock);


2015-06-18 23:00:26

by David Rientjes

Subject: [patch 2/3] mm, oom: pass an oom order of -1 when triggered by sysrq

The force_kill member of struct oom_control isn't needed if an order of
-1 is used instead.

This patch introduces no functional change.
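
Illustratively (this just restates the hunks below), the sysrq path now
encodes the forced kill in the order field, and the scan helper tests for
the sentinel instead of a flag:

struct oom_control oc = {
        .zonelist = node_zonelist(first_memory_node, gfp_mask),
        .nodemask = NULL,
        .gfp_mask = gfp_mask,
        .order = -1,    /* was: .order = 0, .force_kill = true */
};

if (task_will_free_mem(task) && oc->order != -1)
        return OOM_SCAN_ABORT;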

Signed-off-by: David Rientjes <[email protected]>
---
drivers/tty/sysrq.c | 3 +--
include/linux/oom.h | 1 -
mm/memcontrol.c     | 1 -
mm/oom_kill.c       | 5 ++---
mm/page_alloc.c     | 1 -
5 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -358,8 +358,7 @@ static void moom_callback(struct work_struct *ignored)
.zonelist = node_zonelist(first_memory_node, gfp_mask),
.nodemask = NULL,
.gfp_mask = gfp_mask,
- .order = 0,
- .force_kill = true,
+ .order = -1,
};

mutex_lock(&oom_lock);
diff --git a/include/linux/oom.h b/include/linux/oom.h
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -17,7 +17,6 @@ struct oom_control {
nodemask_t *nodemask;
gfp_t gfp_mask;
int order;
- bool force_kill;
};

/*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1550,7 +1550,6 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
.nodemask = NULL,
.gfp_mask = gfp_mask,
.order = order,
- .force_kill = false,
};
struct mem_cgroup *iter;
unsigned long chosen_points = 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -265,7 +265,7 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
* Don't allow any other task to have access to the reserves.
*/
if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
- if (!oc->force_kill)
+ if (oc->order != -1)
return OOM_SCAN_ABORT;
}
if (!task->mm)
@@ -278,7 +278,7 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
if (oom_task_origin(task))
return OOM_SCAN_SELECT;

- if (task_will_free_mem(task) && !oc->force_kill)
+ if (task_will_free_mem(task) && oc->order != -1)
return OOM_SCAN_ABORT;

return OOM_SCAN_OK;
@@ -718,7 +718,6 @@ void pagefault_out_of_memory(void)
.nodemask = NULL,
.gfp_mask = 0,
.order = 0,
- .force_kill = false,
};

if (mem_cgroup_oom_synchronize(true))
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2685,7 +2685,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
.nodemask = ac->nodemask,
.gfp_mask = gfp_mask,
.order = order,
- .force_kill = false,
};
struct page *page;

2015-06-18 23:00:31

by David Rientjes

Subject: [patch 3/3] mm, oom: do not panic for oom kills triggered from sysrq

Sysrq+f is used to kill a process either for debugging or when the VM is
otherwise unresponsive.

It is not intended to trigger a panic when no process may be killed.

Avoid panicking the system for sysrq+f when no processes are killed.
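
Concretely (mirroring the hunks below), both panic paths now bail out when
the oom was triggered by sysrq, i.e. when oc->order == -1:

/* check_panic_on_oom(): a sysrq-triggered oom never panics */
if (oc->order == -1)
        return;

/* out_of_memory(): finding no victim is only fatal for a real oom */
if (!p && oc->order != -1) {
        dump_header(oc, NULL, NULL);
        panic("Out of memory and no killable processes...\n");
}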

Suggested-by: Michal Hocko <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
---
Documentation/sysrq.txt | 3 ++-
mm/oom_kill.c           | 7 +++++--
2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
--- a/Documentation/sysrq.txt
+++ b/Documentation/sysrq.txt
@@ -75,7 +75,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.:

'e' - Send a SIGTERM to all processes, except for init.

-'f' - Will call oom_kill to kill a memory hog process.
+'f' - Will call the oom killer to kill a memory hog process, but do not
+ panic if nothing can be killed.

'g' - Used by kgdb (kernel debugger)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -607,6 +607,9 @@ void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
if (constraint != CONSTRAINT_NONE)
return;
}
+ /* Do not panic for oom kills triggered by sysrq */
+ if (oc->order == -1)
+ return;
dump_header(oc, NULL, memcg);
panic("Out of memory: %s panic_on_oom is enabled\n",
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
@@ -686,11 +689,11 @@ bool out_of_memory(struct oom_control *oc)

p = select_bad_process(oc, &points, totalpages);
/* Found nothing?!?! Either we hang forever, or we panic. */
- if (!p) {
+ if (!p && oc->order != -1) {
dump_header(oc, NULL, NULL);
panic("Out of memory and no killable processes...\n");
}
- if (p != (void *)-1UL) {
+ if (p && p != (void *)-1UL) {
oom_kill_process(oc, p, points, totalpages, NULL,
"Out of memory");
killed = 1;

2015-06-19 00:14:09

by Sergey Senozhatsky

Subject: Re: [patch 1/3] mm, oom: organize oom context into struct

Hello,

On (06/18/15 16:00), David Rientjes wrote:
> There are essential elements to an oom context that are passed around to
> multiple functions.
>
> Organize these elements into a new struct, struct oom_context, that
> specifies the context for an oom condition.
>

s/oom_context/oom_control/ ?

[..]
>
> +struct oom_control {
> + struct zonelist *zonelist;
> + nodemask_t *nodemask;
> + gfp_t gfp_mask;
> + int order;
> + bool force_kill;
> +};
> +
> -extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
> - int order, const nodemask_t *nodemask,
> +extern void check_panic_on_oom(struct oom_control *oc,
> + enum oom_constraint constraint,
> struct mem_cgroup *memcg);
>
> -extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
> - unsigned long totalpages, const nodemask_t *nodemask,
> - bool force_kill);
> +extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
> + struct task_struct *task, unsigned long totalpages);
>
> -extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
> - int order, nodemask_t *mask, bool force_kill);
> +extern bool out_of_memory(struct oom_control *oc);
>
> extern void exit_oom_victim(void);
>
[..]

-ss

2015-06-19 07:30:22

by Michal Hocko

Subject: Re: [patch 1/3] mm, oom: organize oom context into struct

On Thu 18-06-15 16:00:02, David Rientjes wrote:
> There are essential elements to an oom context that are passed around to
> multiple functions.
>
> Organize these elements into a new struct, struct oom_context, that
> specifies the context for an oom condition.

Yes this makes sense to me.

> This patch introduces no functional change.
>
> Signed-off-by: David Rientjes <[email protected]>

Acked-by: Michal Hocko <[email protected]>

[..]

--
Michal Hocko
SUSE Labs

2015-06-19 07:32:12

by Michal Hocko

Subject: Re: [patch 2/3] mm, oom: pass an oom order of -1 when triggered by sysrq

On Thu 18-06-15 16:00:07, David Rientjes wrote:
> The force_kill member of struct oom_control isn't needed if an order of
> -1 is used instead.

But this doesn't make much sense to me. It is not like we would _have_
to spare a few bytes here. The meaning of force_kill is clear, while
order with a weird value is a hack. It is harder to follow without any
good reason.
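
To illustrate on the sysrq call site:

        .force_kill = true,     /* says exactly what it means */

vs.

        .order = -1,            /* readable only if you know the convention */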

[..]

--
Michal Hocko
SUSE Labs

2015-06-30 22:47:10

by David Rientjes

Subject: Re: [patch 1/3] mm, oom: organize oom context into struct

On Fri, 19 Jun 2015, Sergey Senozhatsky wrote:

> > There are essential elements to an oom context that are passed around to
> > multiple functions.
> >
> > Organize these elements into a new struct, struct oom_context, that
> > specifies the context for an oom condition.
> >
>
> s/oom_context/oom_control/ ?
>

I think it would be confused with the existing memory.oom_control for
memcg.

2015-06-30 22:50:32

by David Rientjes

Subject: Re: [patch 2/3] mm, oom: pass an oom order of -1 when triggered by sysrq

On Fri, 19 Jun 2015, Michal Hocko wrote:

> > The force_kill member of struct oom_control isn't needed if an order of
> > -1 is used instead.
>
> But this doesn't make much sense to me. It is not like we would _have_
> to spare few bytes here. The meaning of force_kill is clear while order
> with a weird value is a hack. It is harder to follow without any good
> reason.
>

To me, this is the same as treating order == -1 as special in
struct compact_control, meaning that it was triggered from the command
line and we really want to fully compact memory. It seems to have a nice
symmetry.
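
For reference, the existing convention in mm/compaction.c (quoting roughly
from memory, so the exact context may differ) is:

/*
 * order == -1 is expected when compacting via
 * /proc/sys/vm/compact_memory
 */
if (cc->order == -1)
        return COMPACT_CONTINUE;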

2015-07-01 00:11:12

by Sergey Senozhatsky

Subject: Re: [patch 1/3] mm, oom: organize oom context into struct

On (06/30/15 15:46), David Rientjes wrote:
> > > There are essential elements to an oom context that are passed around to
> > > multiple functions.
> > >
> > > Organize these elements into a new struct, struct oom_context, that
> > > specifies the context for an oom condition.
> > >
> >
> > s/oom_context/oom_control/ ?
> >
>
> I think it would be confused with the existing memory.oom_control for
> memcg.
>

Hello David,

Sorry, I meant that in the commit message you say

:Organize these elements into a new struct, struct oom_context, that
:specifies the context for an oom condition.

but you define and use `struct oom_control' (not `struct oom_context')

[..]

+ const gfp_t gfp_mask = GFP_KERNEL;
+ struct oom_control oc = {
+ .zonelist = node_zonelist(first_memory_node, gfp_mask),
+ .nodemask = NULL,
+ .gfp_mask = gfp_mask,
+ .order = 0,
+ .force_kill = true,
+ };
+

[..]

+struct oom_control {
+ struct zonelist *zonelist;
+ nodemask_t *nodemask;
+ gfp_t gfp_mask;
+ int order;
+ bool force_kill;
+};

[..]

etc.

-ss

2015-07-01 21:30:07

by David Rientjes

Subject: Re: [patch 1/3] mm, oom: organize oom context into struct

On Wed, 1 Jul 2015, Sergey Senozhatsky wrote:

> On (06/30/15 15:46), David Rientjes wrote:
> > > > There are essential elements to an oom context that are passed around to
> > > > multiple functions.
> > > >
> > > > Organize these elements into a new struct, struct oom_context, that
> > > > specifies the context for an oom condition.
> > > >
> > >
> > > s/oom_context/oom_control/ ?
> > >
> >
> > I think it would be confused with the existing memory.oom_control for
> > memcg.
> >
>
> Hello David,
>
> Sorry, I meant that in commit message you say
>
> :Organize these elements into a new struct, struct oom_context, that
> :specifies the context for an oom condition.
>
> but define and use `struct oom_control' (not `struct oom_context')
>

Oh, point very well taken, thank you.

2015-07-01 21:37:22

by David Rientjes

Subject: [patch v2 1/3] mm, oom: organize oom context into struct

There are essential elements to an oom context that are passed around to
multiple functions.

Organize these elements into a new struct, struct oom_control, that
specifies the context for an oom condition.

This patch introduces no functional change.

Acked-by: Michal Hocko <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
---
v2: fix changelog typo per Sergey

drivers/tty/sysrq.c |  12 +++++-
include/linux/oom.h |  25 +++++++-----
mm/memcontrol.c     |  16 +++++---
mm/oom_kill.c       | 115 ++++++++++++++++++++++++----------------------------
mm/page_alloc.c     |  10 ++++-
5 files changed, 98 insertions(+), 80 deletions(-)

[.. diff unchanged from v1 ..]
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2680,6 +2680,13 @@ static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac, unsigned long *did_some_progress)
{
+ struct oom_control oc = {
+ .zonelist = ac->zonelist,
+ .nodemask = ac->nodemask,
+ .gfp_mask = gfp_mask,
+ .order = order,
+ .force_kill = false,
+ };
struct page *page;

*did_some_progress = 0;
@@ -2731,8 +2738,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
goto out;
}
/* Exhausted what can be done so it's blamo time */
- if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)
- || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
+ if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
*did_some_progress = 1;
out:
mutex_unlock(&oom_lock);
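
The shape of the refactor is easier to see in a tiny standalone sketch; the stub types and stub bodies below are hypothetical stand-ins rather than the kernel code, and only the struct layout and the single-pointer calling convention mirror the patch:

#include <stdbool.h>
#include <stdio.h>

typedef unsigned int gfp_t;			/* stand-in for the kernel type */
struct zonelist { int unused; };		/* stub */
typedef struct { unsigned long bits; } nodemask_t;	/* stub */

struct oom_control {
	struct zonelist *zonelist;
	nodemask_t *nodemask;
	gfp_t gfp_mask;
	int order;
	bool force_kill;
};

/* Helpers now take the whole context through one pointer. */
static void dump_header(const struct oom_control *oc)
{
	printf("oom: gfp_mask=0x%x, order=%d\n", oc->gfp_mask, oc->order);
}

static bool out_of_memory(const struct oom_control *oc)
{
	dump_header(oc);
	return true;			/* stub: pretend a task was killed */
}

int main(void)
{
	/* Callers build the context once with designated initializers. */
	struct oom_control oc = {
		.zonelist = NULL,
		.nodemask = NULL,
		.gfp_mask = 0,
		.order = 0,
		.force_kill = false,
	};

	return out_of_memory(&oc) ? 0 : 1;
}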

2015-07-01 21:37:32

by David Rientjes

[permalink] [raw]
Subject: [patch v2 2/3] mm, oom: organize oom context into struct

The force_kill member of struct oom_control isn't needed if an order of
-1 is used instead. This is the same as order == -1 in
struct compact_control, which denotes a request for full memory compaction.

This patch introduces no functional change.

Signed-off-by: David Rientjes <[email protected]>
---
v2: fix changelog typo per Sergey

drivers/tty/sysrq.c | 3 +--
include/linux/oom.h | 1 -
mm/memcontrol.c | 1 -
mm/oom_kill.c | 5 ++---
mm/page_alloc.c | 1 -
5 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -358,8 +358,7 @@ static void moom_callback(struct work_struct *ignored)
.zonelist = node_zonelist(first_memory_node, gfp_mask),
.nodemask = NULL,
.gfp_mask = gfp_mask,
- .order = 0,
- .force_kill = true,
+ .order = -1,
};

mutex_lock(&oom_lock);
diff --git a/include/linux/oom.h b/include/linux/oom.h
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -17,7 +17,6 @@ struct oom_control {
nodemask_t *nodemask;
gfp_t gfp_mask;
int order;
- bool force_kill;
};

/*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1550,7 +1550,6 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
.nodemask = NULL,
.gfp_mask = gfp_mask,
.order = order,
- .force_kill = false,
};
struct mem_cgroup *iter;
unsigned long chosen_points = 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -265,7 +265,7 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
* Don't allow any other task to have access to the reserves.
*/
if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
- if (!oc->force_kill)
+ if (oc->order != -1)
return OOM_SCAN_ABORT;
}
if (!task->mm)
@@ -278,7 +278,7 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
if (oom_task_origin(task))
return OOM_SCAN_SELECT;

- if (task_will_free_mem(task) && !oc->force_kill)
+ if (task_will_free_mem(task) && oc->order != -1)
return OOM_SCAN_ABORT;

return OOM_SCAN_OK;
@@ -718,7 +718,6 @@ void pagefault_out_of_memory(void)
.nodemask = NULL,
.gfp_mask = 0,
.order = 0,
- .force_kill = false,
};

if (mem_cgroup_oom_synchronize(true))
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2685,7 +2685,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
.nodemask = ac->nodemask,
.gfp_mask = gfp_mask,
.order = order,
- .force_kill = false,
};
struct page *page;
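
A minimal standalone sketch of the sentinel convention (hypothetical stand-ins; the real struct has more members, and the patch itself adds no such helper):

#include <stdbool.h>

struct oom_control {
	int order;	/* -1 stands in for the dropped force_kill flag */
};

/* True for a forced, sysrq-triggered oom kill. */
static bool is_sysrq_oom(const struct oom_control *oc)
{
	return oc->order == -1;
}

int main(void)
{
	struct oom_control oc = { .order = -1 };	/* sysrq+f path */

	/* Scans abort early only for ordinary, non-forced oom kills. */
	return is_sysrq_oom(&oc) ? 0 : 1;
}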

2015-07-01 21:37:51

by David Rientjes

[permalink] [raw]
Subject: [patch v2 3/3] mm, oom: organize oom context into struct

Sysrq+f is used to kill a process either for debugging or when the VM is
otherwise unresponsive.

It is not intended to trigger a panic when no process may be killed.

Avoid panicking the system for sysrq+f when no processes are killed.

Suggested-by: Michal Hocko <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
---
v2: no change

Documentation/sysrq.txt | 3 ++-
mm/oom_kill.c | 7 +++++--
2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
--- a/Documentation/sysrq.txt
+++ b/Documentation/sysrq.txt
@@ -75,7 +75,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.:

'e' - Send a SIGTERM to all processes, except for init.

-'f' - Will call oom_kill to kill a memory hog process.
+'f' - Will call the oom killer to kill a memory hog process, but do not
+ panic if nothing can be killed.

'g' - Used by kgdb (kernel debugger)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -607,6 +607,9 @@ void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
if (constraint != CONSTRAINT_NONE)
return;
}
+ /* Do not panic for oom kills triggered by sysrq */
+ if (oc->order == -1)
+ return;
dump_header(oc, NULL, memcg);
panic("Out of memory: %s panic_on_oom is enabled\n",
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
@@ -686,11 +689,11 @@ bool out_of_memory(struct oom_control *oc)

p = select_bad_process(oc, &points, totalpages);
/* Found nothing?!?! Either we hang forever, or we panic. */
- if (!p) {
+ if (!p && oc->order != -1) {
dump_header(oc, NULL, NULL);
panic("Out of memory and no killable processes...\n");
}
- if (p != (void *)-1UL) {
+ if (p && p != (void *)-1UL) {
oom_kill_process(oc, p, points, totalpages, NULL,
"Out of memory");
killed = 1;
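
A minimal standalone sketch of the guard being added, with userspace stand-ins where exit() plays the role of panic():

#include <stdio.h>
#include <stdlib.h>

struct oom_control {
	int order;	/* -1 marks a sysrq-triggered oom */
};

static void check_panic_on_oom(const struct oom_control *oc)
{
	/* Do not panic for oom kills triggered by sysrq. */
	if (oc->order == -1)
		return;
	fprintf(stderr, "panic: Out of memory\n");
	exit(1);
}

int main(void)
{
	struct oom_control oc = { .order = -1 };

	check_panic_on_oom(&oc);	/* returns quietly on the sysrq path */
	puts("sysrq oom request ignored, no panic");
	return 0;
}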

2015-07-02 03:38:59

by Hillf Danton

[permalink] [raw]
Subject: Re: [patch v2 1/3] mm, oom: organize oom context into struct

> Subject: [patch v2 1/3] mm, oom: organize oom context into struct
[patch v2 2/3] mm, oom: organize oom context into struct
[patch v2 3/3] mm, oom: organize oom context into struct

I am wondering whether a redelivery is needed, given that all three patches carry the same subject line.

Hillf
> [patch v2 1/3 quoted in full without further comment; snipped]

2015-07-02 06:01:41

by Michal Hocko

[permalink] [raw]
Subject: Re: [patch v2 1/3] mm, oom: organize oom context into struct

JFYI, I have added this patch of yours to my series fixing another
sysrq+f issue; it is waiting for the merge window to close.

On Wed 01-07-15 14:37:07, David Rientjes wrote:
> [patch v2 1/3 quoted in full without further comment; snipped]

--
Michal Hocko
SUSE Labs

2015-07-02 06:02:04

by Michal Hocko

[permalink] [raw]
Subject: Re: [patch v2 2/3] mm, oom: organize oom context into struct

On Wed 01-07-15 14:37:14, David Rientjes wrote:
> The force_kill member of struct oom_control isn't needed if an order of
> -1 is used instead. This is the same as order == -1 in
> struct compact_control, which denotes a request for full memory compaction.
>
> This patch introduces no functional change.

But it obscures the code, and I really dislike this change, as pointed
out previously.

> [remainder of the patch quoted without further comment; snipped]

--
Michal Hocko
SUSE Labs

2015-07-08 23:28:51

by David Rientjes

[permalink] [raw]
Subject: Re: [patch v2 2/3] mm, oom: organize oom context into struct

On Thu, 2 Jul 2015, Michal Hocko wrote:

> On Wed 01-07-15 14:37:14, David Rientjes wrote:
> > The force_kill member of struct oom_control isn't needed if an order of
> > -1 is used instead. This is the same as order == -1 in
> > struct compact_control, which denotes a request for full memory compaction.
> >
> > This patch introduces no functional change.
>
> But it obscures the code and I really dislike this change as pointed out
> previously.
>

The oom killer is often called at the end of a very lengthy call chain,
since memory allocation itself can happen deep in the stack. Thus,
reducing the amount of on-stack state, even by a single bool, is helpful.
This is especially true when another such struct, struct compact_control,
does the exact same thing by using order == -1 to mean explicit compaction.

I'm personally tired of fixing stack overflows, and you're arguing against
an "obscurity" that already occurs in other parts of the mm code.
oc->force_kill has no reason to exist, and it's removed in this patch for
good reason.
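
A standalone illustration of the kind of on-stack saving being argued for; the structs are simplified stand-ins and the exact sizes depend on the ABI:

#include <stdbool.h>
#include <stdio.h>

struct oc_with_flag {
	void *zonelist;
	void *nodemask;
	unsigned int gfp_mask;
	int order;
	bool force_kill;	/* can pad the struct out to another word */
};

struct oc_sentinel {
	void *zonelist;
	void *nodemask;
	unsigned int gfp_mask;
	int order;		/* -1 encodes the forced-kill case */
};

int main(void)
{
	/* On a typical LP64 ABI the bool costs a full alignment slot. */
	printf("with flag: %zu bytes, with sentinel: %zu bytes\n",
	       sizeof(struct oc_with_flag), sizeof(struct oc_sentinel));
	return 0;
}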

2015-07-08 23:42:52

by David Rientjes

[permalink] [raw]
Subject: [patch v3 1/3] mm, oom: organize oom context into struct

There are essential elements to an oom context that are passed around to
multiple functions.

Organize these elements into a new struct, struct oom_control, that
specifies the context for an oom condition.

This patch introduces no functional change.

Acked-by: Michal Hocko <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
---
v2: fix changelog typo per Sergey
v3: no change

drivers/tty/sysrq.c | 12 +++++-
include/linux/oom.h | 25 +++++++-----
mm/memcontrol.c | 16 +++++---
mm/oom_kill.c | 115 ++++++++++++++++++++++++----------------------------
mm/page_alloc.c | 10 ++++-
5 files changed, 98 insertions(+), 80 deletions(-)

diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -353,9 +353,17 @@ static struct sysrq_key_op sysrq_term_op = {

static void moom_callback(struct work_struct *ignored)
{
+ const gfp_t gfp_mask = GFP_KERNEL;
+ struct oom_control oc = {
+ .zonelist = node_zonelist(first_memory_node, gfp_mask),
+ .nodemask = NULL,
+ .gfp_mask = gfp_mask,
+ .order = 0,
+ .force_kill = true,
+ };
+
mutex_lock(&oom_lock);
- if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL),
- GFP_KERNEL, 0, NULL, true))
+ if (!out_of_memory(&oc))
pr_info("OOM request ignored because killer is disabled\n");
mutex_unlock(&oom_lock);
}
diff --git a/include/linux/oom.h b/include/linux/oom.h
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -12,6 +12,14 @@ struct notifier_block;
struct mem_cgroup;
struct task_struct;

+struct oom_control {
+ struct zonelist *zonelist;
+ nodemask_t *nodemask;
+ gfp_t gfp_mask;
+ int order;
+ bool force_kill;
+};
+
/*
* Types of limitations to the nodes from which allocations may occur
*/
@@ -57,21 +65,18 @@ extern unsigned long oom_badness(struct task_struct *p,

extern int oom_kills_count(void);
extern void note_oom_kill(void);
-extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+extern void oom_kill_process(struct oom_control *oc, struct task_struct *p,
unsigned int points, unsigned long totalpages,
- struct mem_cgroup *memcg, nodemask_t *nodemask,
- const char *message);
+ struct mem_cgroup *memcg, const char *message);

-extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
- int order, const nodemask_t *nodemask,
+extern void check_panic_on_oom(struct oom_control *oc,
+ enum oom_constraint constraint,
struct mem_cgroup *memcg);

-extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
- unsigned long totalpages, const nodemask_t *nodemask,
- bool force_kill);
+extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
+ struct task_struct *task, unsigned long totalpages);

-extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
- int order, nodemask_t *mask, bool force_kill);
+extern bool out_of_memory(struct oom_control *oc);

extern void exit_oom_victim(void);

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1545,6 +1545,13 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
int order)
{
+ struct oom_control oc = {
+ .zonelist = NULL,
+ .nodemask = NULL,
+ .gfp_mask = gfp_mask,
+ .order = order,
+ .force_kill = false,
+ };
struct mem_cgroup *iter;
unsigned long chosen_points = 0;
unsigned long totalpages;
@@ -1563,7 +1570,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
goto unlock;
}

- check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
+ check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
totalpages = mem_cgroup_get_limit(memcg) ? : 1;
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
@@ -1571,8 +1578,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,

css_task_iter_start(&iter->css, &it);
while ((task = css_task_iter_next(&it))) {
- switch (oom_scan_process_thread(task, totalpages, NULL,
- false)) {
+ switch (oom_scan_process_thread(&oc, task, totalpages)) {
case OOM_SCAN_SELECT:
if (chosen)
put_task_struct(chosen);
@@ -1610,8 +1616,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,

if (chosen) {
points = chosen_points * 1000 / totalpages;
- oom_kill_process(chosen, gfp_mask, order, points, totalpages,
- memcg, NULL, "Memory cgroup out of memory");
+ oom_kill_process(&oc, chosen, points, totalpages, memcg,
+ "Memory cgroup out of memory");
}
unlock:
mutex_unlock(&oom_lock);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -196,27 +196,26 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
* Determine the type of allocation constraint.
*/
#ifdef CONFIG_NUMA
-static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
- gfp_t gfp_mask, nodemask_t *nodemask,
- unsigned long *totalpages)
+static enum oom_constraint constrained_alloc(struct oom_control *oc,
+ unsigned long *totalpages)
{
struct zone *zone;
struct zoneref *z;
- enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
bool cpuset_limited = false;
int nid;

/* Default to all available memory */
*totalpages = totalram_pages + total_swap_pages;

- if (!zonelist)
+ if (!oc->zonelist)
return CONSTRAINT_NONE;
/*
* Reach here only when __GFP_NOFAIL is used. So, we should avoid
* to kill current.We have to random task kill in this case.
* Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
*/
- if (gfp_mask & __GFP_THISNODE)
+ if (oc->gfp_mask & __GFP_THISNODE)
return CONSTRAINT_NONE;

/*
@@ -224,17 +223,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
* the page allocator means a mempolicy is in effect. Cpuset policy
* is enforced in get_page_from_freelist().
*/
- if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) {
+ if (oc->nodemask &&
+ !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
*totalpages = total_swap_pages;
- for_each_node_mask(nid, *nodemask)
+ for_each_node_mask(nid, *oc->nodemask)
*totalpages += node_spanned_pages(nid);
return CONSTRAINT_MEMORY_POLICY;
}

/* Check this allocation failure is caused by cpuset's wall function */
- for_each_zone_zonelist_nodemask(zone, z, zonelist,
- high_zoneidx, nodemask)
- if (!cpuset_zone_allowed(zone, gfp_mask))
+ for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
+ high_zoneidx, oc->nodemask)
+ if (!cpuset_zone_allowed(zone, oc->gfp_mask))
cpuset_limited = true;

if (cpuset_limited) {
@@ -246,20 +246,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
return CONSTRAINT_NONE;
}
#else
-static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
- gfp_t gfp_mask, nodemask_t *nodemask,
- unsigned long *totalpages)
+static enum oom_constraint constrained_alloc(struct oom_control *oc,
+ unsigned long *totalpages)
{
*totalpages = totalram_pages + total_swap_pages;
return CONSTRAINT_NONE;
}
#endif

-enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
- unsigned long totalpages, const nodemask_t *nodemask,
- bool force_kill)
+enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
+ struct task_struct *task, unsigned long totalpages)
{
- if (oom_unkillable_task(task, NULL, nodemask))
+ if (oom_unkillable_task(task, NULL, oc->nodemask))
return OOM_SCAN_CONTINUE;

/*
@@ -267,7 +265,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
* Don't allow any other task to have access to the reserves.
*/
if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
- if (!force_kill)
+ if (!oc->force_kill)
return OOM_SCAN_ABORT;
}
if (!task->mm)
@@ -280,7 +278,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
if (oom_task_origin(task))
return OOM_SCAN_SELECT;

- if (task_will_free_mem(task) && !force_kill)
+ if (task_will_free_mem(task) && !oc->force_kill)
return OOM_SCAN_ABORT;

return OOM_SCAN_OK;
@@ -289,12 +287,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
/*
* Simple selection loop. We chose the process with the highest
* number of 'points'. Returns -1 on scan abort.
- *
- * (not docbooked, we don't want this one cluttering up the manual)
*/
-static struct task_struct *select_bad_process(unsigned int *ppoints,
- unsigned long totalpages, const nodemask_t *nodemask,
- bool force_kill)
+static struct task_struct *select_bad_process(struct oom_control *oc,
+ unsigned int *ppoints, unsigned long totalpages)
{
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
@@ -304,8 +299,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
for_each_process_thread(g, p) {
unsigned int points;

- switch (oom_scan_process_thread(p, totalpages, nodemask,
- force_kill)) {
+ switch (oom_scan_process_thread(oc, p, totalpages)) {
case OOM_SCAN_SELECT:
chosen = p;
chosen_points = ULONG_MAX;
@@ -318,7 +312,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
case OOM_SCAN_OK:
break;
};
- points = oom_badness(p, NULL, nodemask, totalpages);
+ points = oom_badness(p, NULL, oc->nodemask, totalpages);
if (!points || points < chosen_points)
continue;
/* Prefer thread group leaders for display purposes */
@@ -380,13 +374,13 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
rcu_read_unlock();
}

-static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
- struct mem_cgroup *memcg, const nodemask_t *nodemask)
+static void dump_header(struct oom_control *oc, struct task_struct *p,
+ struct mem_cgroup *memcg)
{
task_lock(current);
pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
"oom_score_adj=%hd\n",
- current->comm, gfp_mask, order,
+ current->comm, oc->gfp_mask, oc->order,
current->signal->oom_score_adj);
cpuset_print_task_mems_allowed(current);
task_unlock(current);
@@ -396,7 +390,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
else
show_mem(SHOW_MEM_FILTER_NODES);
if (sysctl_oom_dump_tasks)
- dump_tasks(memcg, nodemask);
+ dump_tasks(memcg, oc->nodemask);
}

/*
@@ -487,10 +481,9 @@ void oom_killer_enable(void)
* Must be called while holding a reference to p, which will be released upon
* returning.
*/
-void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+void oom_kill_process(struct oom_control *oc, struct task_struct *p,
unsigned int points, unsigned long totalpages,
- struct mem_cgroup *memcg, nodemask_t *nodemask,
- const char *message)
+ struct mem_cgroup *memcg, const char *message)
{
struct task_struct *victim = p;
struct task_struct *child;
@@ -514,7 +507,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
task_unlock(p);

if (__ratelimit(&oom_rs))
- dump_header(p, gfp_mask, order, memcg, nodemask);
+ dump_header(oc, p, memcg);

task_lock(p);
pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
@@ -537,7 +530,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
/*
* oom_badness() returns 0 if the thread is unkillable
*/
- child_points = oom_badness(child, memcg, nodemask,
+ child_points = oom_badness(child, memcg, oc->nodemask,
totalpages);
if (child_points > victim_points) {
put_task_struct(victim);
@@ -600,8 +593,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
/*
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
-void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
- int order, const nodemask_t *nodemask,
+void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
struct mem_cgroup *memcg)
{
if (likely(!sysctl_panic_on_oom))
@@ -615,7 +607,7 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
if (constraint != CONSTRAINT_NONE)
return;
}
- dump_header(NULL, gfp_mask, order, memcg, nodemask);
+ dump_header(oc, NULL, memcg);
panic("Out of memory: %s panic_on_oom is enabled\n",
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
@@ -635,22 +627,16 @@ int unregister_oom_notifier(struct notifier_block *nb)
EXPORT_SYMBOL_GPL(unregister_oom_notifier);

/**
- * __out_of_memory - kill the "best" process when we run out of memory
- * @zonelist: zonelist pointer
- * @gfp_mask: memory allocation flags
- * @order: amount of memory being requested as a power of 2
- * @nodemask: nodemask passed to page allocator
- * @force_kill: true if a task must be killed, even if others are exiting
+ * out_of_memory - kill the "best" process when we run out of memory
+ * @oc: pointer to struct oom_control
*
* If we run out of memory, we have the choice between either
* killing a random task (bad), letting the system crash (worse)
* OR trying to be smart about which process to kill. Note that we
* don't have to be perfect here, we just have to be good.
*/
-bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
- int order, nodemask_t *nodemask, bool force_kill)
+bool out_of_memory(struct oom_control *oc)
{
- const nodemask_t *mpol_mask;
struct task_struct *p;
unsigned long totalpages;
unsigned long freed = 0;
@@ -684,30 +670,29 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
* Check if there were limitations on the allocation (only relevant for
* NUMA) that may require different handling.
*/
- constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
- &totalpages);
- mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
- check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL);
+ constraint = constrained_alloc(oc, &totalpages);
+ if (constraint != CONSTRAINT_MEMORY_POLICY)
+ oc->nodemask = NULL;
+ check_panic_on_oom(oc, constraint, NULL);

if (sysctl_oom_kill_allocating_task && current->mm &&
- !oom_unkillable_task(current, NULL, nodemask) &&
+ !oom_unkillable_task(current, NULL, oc->nodemask) &&
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
get_task_struct(current);
- oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
- nodemask,
+ oom_kill_process(oc, current, 0, totalpages, NULL,
"Out of memory (oom_kill_allocating_task)");
goto out;
}

- p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
+ p = select_bad_process(oc, &points, totalpages);
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
- dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
+ dump_header(oc, NULL, NULL);
panic("Out of memory and no killable processes...\n");
}
if (p != (void *)-1UL) {
- oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
- nodemask, "Out of memory");
+ oom_kill_process(oc, p, points, totalpages, NULL,
+ "Out of memory");
killed = 1;
}
out:
@@ -728,13 +713,21 @@ out:
*/
void pagefault_out_of_memory(void)
{
+ struct oom_control oc = {
+ .zonelist = NULL,
+ .nodemask = NULL,
+ .gfp_mask = 0,
+ .order = 0,
+ .force_kill = false,
+ };
+
if (mem_cgroup_oom_synchronize(true))
return;

if (!mutex_trylock(&oom_lock))
return;

- if (!out_of_memory(NULL, 0, 0, NULL, false)) {
+ if (!out_of_memory(&oc)) {
/*
* There shouldn't be any user tasks runnable while the
* OOM killer is disabled, so the current task has to
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2680,6 +2680,13 @@ static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac, unsigned long *did_some_progress)
{
+ struct oom_control oc = {
+ .zonelist = ac->zonelist,
+ .nodemask = ac->nodemask,
+ .gfp_mask = gfp_mask,
+ .order = order,
+ .force_kill = false,
+ };
struct page *page;

*did_some_progress = 0;
@@ -2731,8 +2738,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
goto out;
}
/* Exhausted what can be done so it's blamo time */
- if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)
- || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
+ if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
*did_some_progress = 1;
out:
mutex_unlock(&oom_lock);

2015-07-08 23:43:06

by David Rientjes

[permalink] [raw]
Subject: [patch v3 2/3] mm, oom: pass an oom order of -1 when triggered by sysrq

The force_kill member of struct oom_control isn't needed if an order of
-1 is used instead. This is the same as order == -1 in
struct compact_control which requires full memory compaction.

This patch introduces no functional change.

Signed-off-by: David Rientjes <[email protected]>
---
v2: fix changelog typo per Sergey
v3: fix title per Hillf

drivers/tty/sysrq.c | 3 +--
include/linux/oom.h | 1 -
mm/memcontrol.c | 1 -
mm/oom_kill.c | 5 ++---
mm/page_alloc.c | 1 -
5 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -358,8 +358,7 @@ static void moom_callback(struct work_struct *ignored)
.zonelist = node_zonelist(first_memory_node, gfp_mask),
.nodemask = NULL,
.gfp_mask = gfp_mask,
- .order = 0,
- .force_kill = true,
+ .order = -1,
};

mutex_lock(&oom_lock);
diff --git a/include/linux/oom.h b/include/linux/oom.h
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -17,7 +17,6 @@ struct oom_control {
nodemask_t *nodemask;
gfp_t gfp_mask;
int order;
- bool force_kill;
};

/*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1550,7 +1550,6 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
.nodemask = NULL,
.gfp_mask = gfp_mask,
.order = order,
- .force_kill = false,
};
struct mem_cgroup *iter;
unsigned long chosen_points = 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -265,7 +265,7 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
* Don't allow any other task to have access to the reserves.
*/
if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
- if (!oc->force_kill)
+ if (oc->order != -1)
return OOM_SCAN_ABORT;
}
if (!task->mm)
@@ -278,7 +278,7 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
if (oom_task_origin(task))
return OOM_SCAN_SELECT;

- if (task_will_free_mem(task) && !oc->force_kill)
+ if (task_will_free_mem(task) && oc->order != -1)
return OOM_SCAN_ABORT;

return OOM_SCAN_OK;
@@ -718,7 +718,6 @@ void pagefault_out_of_memory(void)
.nodemask = NULL,
.gfp_mask = 0,
.order = 0,
- .force_kill = false,
};

if (mem_cgroup_oom_synchronize(true))
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2685,7 +2685,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
.nodemask = ac->nodemask,
.gfp_mask = gfp_mask,
.order = order,
- .force_kill = false,
};
struct page *page;
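
To summarize the convention this patch introduces: an order of -1 in
struct oom_control now marks a kill forced from sysrq+f, while real
allocation paths keep passing whatever order they were asked for. A
condensed sketch, not literal patch code:

	/* sysrq+f (moom_callback): no allocation is waiting, force a kill */
	oc.order = -1;

	/* page allocator (__alloc_pages_may_oom): order of the failed request */
	oc.order = order;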

2015-07-08 23:43:15

by David Rientjes

[permalink] [raw]
Subject: [patch v3 3/3] mm, oom: do not panic for oom kills triggered from sysrq

Sysrq+f is used to kill a process either for debug or when the VM is
otherwise unresponsive.

It is not intended to trigger a panic when no process may be killed.

Avoid panicking the system for sysrq+f when no processes are killed.

Suggested-by: Michal Hocko <[email protected]>
Signed-off-by: David Rientjes <[email protected]>
---
v2: no change
v3: fix title per Hillf

Documentation/sysrq.txt | 3 ++-
mm/oom_kill.c | 7 +++++--
2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
--- a/Documentation/sysrq.txt
+++ b/Documentation/sysrq.txt
@@ -75,7 +75,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.:

'e' - Send a SIGTERM to all processes, except for init.

-'f' - Will call oom_kill to kill a memory hog process.
+'f' - Will call the oom killer to kill a memory hog process, but do not
+ panic if nothing can be killed.

'g' - Used by kgdb (kernel debugger)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -607,6 +607,9 @@ void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
if (constraint != CONSTRAINT_NONE)
return;
}
+ /* Do not panic for oom kills triggered by sysrq */
+ if (oc->order == -1)
+ return;
dump_header(oc, NULL, memcg);
panic("Out of memory: %s panic_on_oom is enabled\n",
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
@@ -686,11 +689,11 @@ bool out_of_memory(struct oom_control *oc)

p = select_bad_process(oc, &points, totalpages);
/* Found nothing?!?! Either we hang forever, or we panic. */
- if (!p) {
+ if (!p && oc->order != -1) {
dump_header(oc, NULL, NULL);
panic("Out of memory and no killable processes...\n");
}
- if (p != (void *)-1UL) {
+ if (p && p != (void *)-1UL) {
oom_kill_process(oc, p, points, totalpages, NULL,
"Out of memory");
killed = 1;

2015-07-09 04:16:13

by Hillf Danton

[permalink] [raw]
Subject: Re: [patch v3 3/3] mm, oom: do not panic for oom kills triggered from sysrq

> Sysrq+f is used to kill a process either for debug or when the VM is
> otherwise unresponsive.
>
> It is not intended to trigger a panic when no process may be killed.
>
> Avoid panicking the system for sysrq+f when no processes are killed.
>
> Suggested-by: Michal Hocko <[email protected]>
> Signed-off-by: David Rientjes <[email protected]>
> ---
> v2: no change
> v3: fix title per Hillf
>
> Documentation/sysrq.txt | 3 ++-
> mm/oom_kill.c | 7 +++++--
> 2 files changed, 7 insertions(+), 3 deletions(-)
>
> diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
> --- a/Documentation/sysrq.txt
> +++ b/Documentation/sysrq.txt
> @@ -75,7 +75,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.:
>
> 'e' - Send a SIGTERM to all processes, except for init.
>
> -'f' - Will call oom_kill to kill a memory hog process.
> +'f' - Will call the oom killer to kill a memory hog process, but do not
> + panic if nothing can be killed.
>
> 'g' - Used by kgdb (kernel debugger)
>
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -607,6 +607,9 @@ void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
> if (constraint != CONSTRAINT_NONE)
> return;
> }
> + /* Do not panic for oom kills triggered by sysrq */
> + if (oc->order == -1)
> + return;
> dump_header(oc, NULL, memcg);
> panic("Out of memory: %s panic_on_oom is enabled\n",
> sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
> @@ -686,11 +689,11 @@ bool out_of_memory(struct oom_control *oc)
>
> p = select_bad_process(oc, &points, totalpages);
> /* Found nothing?!?! Either we hang forever, or we panic. */
> - if (!p) {
> + if (!p && oc->order != -1) {
> dump_header(oc, NULL, NULL);
> panic("Out of memory and no killable processes...\n");
> }

Given that sysctl_panic_on_oom is checked, AFAICU there seems to be
no chance of a panic, whether the order is -1 or not.

> - if (p != (void *)-1UL) {
> + if (p && p != (void *)-1UL) {
> oom_kill_process(oc, p, points, totalpages, NULL,
> "Out of memory");
> killed = 1;
> --

2015-07-09 21:30:59

by David Rientjes

[permalink] [raw]
Subject: Re: [patch v3 3/3] mm, oom: do not panic for oom kills triggered from sysrq

On Thu, 9 Jul 2015, Hillf Danton wrote:

> > diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
> > --- a/Documentation/sysrq.txt
> > +++ b/Documentation/sysrq.txt
> > @@ -75,7 +75,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.:
> >
> > 'e' - Send a SIGTERM to all processes, except for init.
> >
> > -'f' - Will call oom_kill to kill a memory hog process.
> > +'f' - Will call the oom killer to kill a memory hog process, but do not
> > + panic if nothing can be killed.
> >
> > 'g' - Used by kgdb (kernel debugger)
> >
> > diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> > --- a/mm/oom_kill.c
> > +++ b/mm/oom_kill.c
> > @@ -607,6 +607,9 @@ void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
> > if (constraint != CONSTRAINT_NONE)
> > return;
> > }
> > + /* Do not panic for oom kills triggered by sysrq */
> > + if (oc->order == -1)
> > + return;
> > dump_header(oc, NULL, memcg);
> > panic("Out of memory: %s panic_on_oom is enabled\n",
> > sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
> > @@ -686,11 +689,11 @@ bool out_of_memory(struct oom_control *oc)
> >
> > p = select_bad_process(oc, &points, totalpages);
> > /* Found nothing?!?! Either we hang forever, or we panic. */
> > - if (!p) {
> > + if (!p && oc->order != -1) {
> > dump_header(oc, NULL, NULL);
> > panic("Out of memory and no killable processes...\n");
> > }
>
> Given that sysctl_panic_on_oom is checked, AFAICU there seems to be
> no chance of a panic, whether the order is -1 or not.
>

I'm not sure I understand your point.

There are two oom killer panics: when panic_on_oom is enabled and when the
oom killer can't find an eligible process.

The change to the panic_on_oom panic is dealt with in check_panic_on_oom()
and the no eligible process panic is dealt with here.

If the sysctl is disabled, and there are no eligible processes to kill,
the change in behavior here is that we don't panic when triggered from
sysrq. That's the change in the hunk above.
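
Condensed, the two panic gates look roughly like this after the series;
a sketch for clarity, not the literal patched code:

	/* 1) panic_on_oom path, at the end of check_panic_on_oom() */
	if (oc->order == -1)
		return;		/* triggered by sysrq: never panic */
	dump_header(oc, NULL, memcg);
	panic("Out of memory: %s panic_on_oom is enabled\n", ...);

	/* 2) no-eligible-process path, in out_of_memory() */
	p = select_bad_process(oc, &points, totalpages);
	if (!p && oc->order != -1) {	/* triggered by sysrq: fail quietly */
		dump_header(oc, NULL, NULL);
		panic("Out of memory and no killable processes...\n");
	}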

> > - if (p != (void *)-1UL) {
> > + if (p && p != (void *)-1UL) {
> > oom_kill_process(oc, p, points, totalpages, NULL,
> > "Out of memory");
> > killed = 1;

2015-07-10 07:51:23

by Hillf Danton

[permalink] [raw]
Subject: RE: [patch v3 3/3] mm, oom: do not panic for oom kills triggered from sysrq

> > > diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
> > > --- a/Documentation/sysrq.txt
> > > +++ b/Documentation/sysrq.txt
> > > @@ -75,7 +75,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.:
> > >
> > > 'e' - Send a SIGTERM to all processes, except for init.
> > >
> > > -'f' - Will call oom_kill to kill a memory hog process.
> > > +'f' - Will call the oom killer to kill a memory hog process, but do not
> > > + panic if nothing can be killed.
> > >
> > > 'g' - Used by kgdb (kernel debugger)
> > >
> > > diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> > > --- a/mm/oom_kill.c
> > > +++ b/mm/oom_kill.c
> > > @@ -607,6 +607,9 @@ void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
> > > if (constraint != CONSTRAINT_NONE)
> > > return;
> > > }
> > > + /* Do not panic for oom kills triggered by sysrq */
> > > + if (oc->order == -1)
> > > + return;
> > > dump_header(oc, NULL, memcg);
> > > panic("Out of memory: %s panic_on_oom is enabled\n",
> > > sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
> > > @@ -686,11 +689,11 @@ bool out_of_memory(struct oom_control *oc)
> > >
> > > p = select_bad_process(oc, &points, totalpages);
> > > /* Found nothing?!?! Either we hang forever, or we panic. */
> > > - if (!p) {
> > > + if (!p && oc->order != -1) {
> > > dump_header(oc, NULL, NULL);
> > > panic("Out of memory and no killable processes...\n");
> > > }
> >
> > Given that sysctl_panic_on_oom is checked, AFAICU there seems to be
> > no chance of a panic, whether the order is -1 or not.
> >
>
> I'm not sure I understand your point.
>
> There are two oom killer panics: when panic_on_oom is enabled and when the
> oom killer can't find an eligible process.
>
> The change to the panic_on_oom panic is dealt with in check_panic_on_oom()
> and the no eligible process panic is dealt with here.
>
> If the sysctl is disabled, and there are no eligible processes to kill,
> the change in behavior here is that we don't panic when triggered from
> sysrq. That's the change in the hunk above.
>
When no eligible process is selected to kill, we are sure that we skip one
panic in check_panic_on_oom(), and we have no clear reason to panic again.

But we can simply answer the caller that there is no page, and let her
decide what to do.

So I prefer to fold the two panics into one.
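
One possible shape of that folding, purely for illustration (the helper
and its name are hypothetical, not from any posted patch):

	/* illustrative only: a single panic site for both conditions */
	static void maybe_oom_panic(struct oom_control *oc, const char *reason)
	{
		if (oc->order == -1)	/* triggered by sysrq: never panic */
			return;
		dump_header(oc, NULL, NULL);
		panic("Out of memory: %s\n", reason);
	}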

Hillf
> > > - if (p != (void *)-1UL) {
> > > + if (p && p != (void *)-1UL) {
> > > oom_kill_process(oc, p, points, totalpages, NULL,
> > > "Out of memory");
> > > killed = 1;

2015-07-14 21:16:22

by David Rientjes

[permalink] [raw]
Subject: RE: [patch v3 3/3] mm, oom: do not panic for oom kills triggered from sysrq

On Fri, 10 Jul 2015, Hillf Danton wrote:

> > I'm not sure I understand your point.
> >
> > There are two oom killer panics: when panic_on_oom is enabled and when the
> > oom killer can't find an eligible process.
> >
> > The change to the panic_on_oom panic is dealt with in check_panic_on_oom()
> > and the no eligible process panic is dealt with here.
> >
> > If the sysctl is disabled, and there are no eligible processes to kill,
> > the change in behavior here is that we don't panic when triggered from
> > sysrq. That's the change in the hunk above.
> >
> When no eligible process is selected to kill, we are sure that we skip one
> panic in check_panic_on_oom(), and we have no clear reason to panic again.
>
> But we can simply answer the caller that there is no page, and let her
> decide what to do.
>
> So I prefer to fold the two panics into one.
>
> Hillf
> > > > - if (p != (void *)-1UL) {
> > > > + if (p && p != (void *)-1UL) {
> > > > oom_kill_process(oc, p, points, totalpages, NULL,
> > > > "Out of memory");
> > > > killed = 1;
>

I'm still not sure I understand your point, unfortunately. The new check:

if (!p && oc->order != -1) {
dump_header(oc, NULL, NULL);
panic("Out of memory and no killable processes...\n");
}

ensures we never panic when called from sysrq. This is done because
userspace can easily race when there is a single eligible process to kill
that exits or is otherwise killed and the sysrq+f ends up panicking the
machine unexpectedly.

2015-07-14 22:52:55

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch v3 1/3] mm, oom: organize oom context into struct

On Wed, 8 Jul 2015 16:42:42 -0700 (PDT) David Rientjes <[email protected]> wrote:

> There are essential elements to an oom context that are passed around to
> multiple functions.
>
> Organize these elements into a new struct, struct oom_control, that
> specifies the context for an oom condition.
>
> This patch introduces no functional change.
>
> ...
>
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -12,6 +12,14 @@ struct notifier_block;
> struct mem_cgroup;
> struct task_struct;
>
> +struct oom_control {
> + struct zonelist *zonelist;
> + nodemask_t *nodemask;
> + gfp_t gfp_mask;
> + int order;
> + bool force_kill;
> +};

Some docs would be nice.

gfp_mask and order are what the page-allocating caller originally asked
for, I think? They haven't been mucked with?

It's somewhat obvious what force_kill does, but why is it provided, why
is it set? And what does it actually kill? A process which was
selected based on the other fields...

Also, it's a bit odd that zonelist and nodemask are here. They're
low-level implementation details whereas the other three fields are
high-level caller control stuff.

Anyway, please have a think about it. The definition site for a controlling
structure can be a great place to reveal the overall design and operation.

2015-07-14 23:44:30

by David Rientjes

[permalink] [raw]
Subject: Re: [patch v3 1/3] mm, oom: organize oom context into struct

On Tue, 14 Jul 2015, Andrew Morton wrote:

> > --- a/include/linux/oom.h
> > +++ b/include/linux/oom.h
> > @@ -12,6 +12,14 @@ struct notifier_block;
> > struct mem_cgroup;
> > struct task_struct;
> >
> > +struct oom_control {
> > + struct zonelist *zonelist;
> > + nodemask_t *nodemask;
> > + gfp_t gfp_mask;
> > + int order;
> > + bool force_kill;
> > +};
>
> Some docs would be nice.
>

Ok!

> gfp_mask and order are what the page-allocating caller originally asked
> for, I think? They haven't been mucked with?
>

Yes, it's a good opportunity to make them const.

> It's somewhat obvious what force_kill does, but why is it provided, why
> is it set? And what does it actually kill? A process which was
> selected based on the other fields...
>

It's removed in the next patch since it's unneeded, so I'll define what
order == -1 means.

> Also, it's a bit odd that zonelist and nodemask are here. They're
> low-level implementation details whereas the other three fields are
> high-level caller control stuff.
>

Zonelist and nodemask are indeed pretty weird here. We use them to
determine if the oom kill is constrained by cpuset and/or mempolicy,
respectively, so we don't kill things unnecessarily and leave a cpuset
still oom, for example. We could determine that before actually calling
the oom killer and pass the enum oom_constraint in, but its purpose is
for the oom killer, so it's just part of that logical unit.
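
Concretely, a documented version of the struct, with gfp_mask and order
made const as discussed above, might look something like this; the
comments are illustrative, not from a posted patch:

	struct oom_control {
		/* Used to determine the cpuset constraint */
		struct zonelist *zonelist;

		/* Used to determine the mempolicy constraint */
		nodemask_t *nodemask;

		/* The failed allocation request, untouched by the oom killer */
		const gfp_t gfp_mask;

		/* Order of the allocation, or -1 for a sysrq-forced kill */
		const int order;
	};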