Under very heavy multi-process workloads, like AIM7, the VM can
get into trouble in a variety of ways. The trouble starts when
there are hundreds, or even thousands of processes active in the
page reclaim code.
Not only can the system suffer enormous slowdowns because of
lock contention (and conditional reschedules) between thousands
of processes in the page reclaim code, but each process will try
to free up to SWAP_CLUSTER_MAX pages, even when the system already
has lots of memory free.
It should be possible to avoid both of those issues at once, by
simply limiting how many processes are active in the page reclaim
code simultaneously.
If too many processes are active doing page reclaim in one zone,
simply go to sleep in shrink_zone().
On wakeup, check whether enough memory has been freed already
before jumping into the page reclaim code ourselves. We want
to use the same threshold here that is used in the page allocator
for deciding whether or not to call the page reclaim code in the
first place, otherwise some unlucky processes could end up freeing
memory for the rest of the system.
Reported-by: Larry Woodman <[email protected]>
Signed-off-by: Rik van Riel <[email protected]>
---
v2:
- fix typos in sysctl.c and vm.txt
- move the code in sysctl.c out from under the ifdef
- only __GFP_FS|__GFP_IO tasks can wait
Documentation/sysctl/vm.txt | 18 ++++++++++++++
include/linux/mmzone.h | 4 +++
include/linux/swap.h | 1 +
kernel/sysctl.c | 7 +++++
mm/page_alloc.c | 3 ++
mm/vmscan.c | 40 +++++++++++++++++++++++++++++++++
6 files changed, 73 insertions(+), 0 deletions(-)
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index fc5790d..8bd1a96 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -32,6 +32,7 @@ Currently, these files are in /proc/sys/vm:
- legacy_va_layout
- lowmem_reserve_ratio
- max_map_count
+- max_zone_concurrent_reclaimers
- memory_failure_early_kill
- memory_failure_recovery
- min_free_kbytes
@@ -278,6 +279,23 @@ The default value is 65536.
=============================================================
+max_zone_concurrent_reclaimers:
+
+The number of processes that are allowed to simultaneously reclaim
+memory from a particular memory zone.
+
+With certain workloads, hundreds of processes end up in the page
+reclaim code simultaneously. This can cause large slowdowns due
+to lock contention, freeing of way too much memory and occasionally
+false OOM kills.
+
+To avoid these problems, only allow a smaller number of processes
+to reclaim pages from each memory zone simultaneously.
+
+The default value is 8.
+
+=============================================================
+
memory_failure_early_kill:
Control how to kill processes when uncorrected memory error (typically
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 30fe668..ed614b8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -345,6 +345,10 @@ struct zone {
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
+ /* Number of processes running page reclaim code on this zone. */
+ atomic_t concurrent_reclaimers;
+ wait_queue_head_t reclaim_wait;
+
/*
* prev_priority holds the scanning priority for this zone. It is
* defined as the scanning priority at which we achieved our reclaim
diff --git a/include/linux/swap.h b/include/linux/swap.h
index a2602a8..661eec7 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -254,6 +254,7 @@ extern unsigned long shrink_all_memory(unsigned long nr_pages);
extern int vm_swappiness;
extern int remove_mapping(struct address_space *mapping, struct page *page);
extern long vm_total_pages;
+extern int max_zone_concurrent_reclaimers;
#ifdef CONFIG_NUMA
extern int zone_reclaim_mode;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6ff0ae6..4ec17ed 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1271,6 +1271,13 @@ static struct ctl_table vm_table[] = {
.extra2 = &one,
},
#endif
+ {
+ .procname = "max_zone_concurrent_reclaimers",
+ .data = &max_zone_concurrent_reclaimers,
+ .maxlen = sizeof(max_zone_concurrent_reclaimers),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
/*
* NOTE: do not add new entries to this table unless you have read
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 11ae66e..ca9cae1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3852,6 +3852,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
zone->prev_priority = DEF_PRIORITY;
+ atomic_set(&zone->concurrent_reclaimers, 0);
+ init_waitqueue_head(&zone->reclaim_wait);
+
zone_pcp_init(zone);
for_each_lru(l) {
INIT_LIST_HEAD(&zone->lru[l].list);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2bbee91..ecfe28c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -40,6 +40,7 @@
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
+#include <linux/wait.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -129,6 +130,17 @@ struct scan_control {
int vm_swappiness = 60;
long vm_total_pages; /* The total number of pages which the VM controls */
+/*
+ * Maximum number of processes concurrently running the page
+ * reclaim code in a memory zone. Having too many processes
+ * just results in them burning CPU time waiting for locks,
+ * so we're better off limiting page reclaim to a sane number
+ * of processes at a time. We do this per zone so local node
+ * reclaim on one NUMA node will not block other nodes from
+ * making progress.
+ */
+int max_zone_concurrent_reclaimers = 8;
+
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
@@ -1600,6 +1612,31 @@ static void shrink_zone(int priority, struct zone *zone,
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
int noswap = 0;
+ if (!current_is_kswapd() && atomic_read(&zone->concurrent_reclaimers) >
+ max_zone_concurrent_reclaimers &&
+ (sc->gfp_mask & (__GFP_IO|__GFP_FS)) ==
+ (__GFP_IO|__GFP_FS)) {
+ /*
+ * Do not add to the lock contention if this zone has
+ * enough processes doing page reclaim already, since
+ * we would just make things slower.
+ */
+ sleep_on(&zone->reclaim_wait);
+
+ /*
+ * If other processes freed enough memory while we waited,
+ * break out of the loop and go back to the allocator.
+ */
+ if (zone_watermark_ok(zone, sc->order, low_wmark_pages(zone),
+ 0, 0)) {
+ wake_up(&zone->reclaim_wait);
+ sc->nr_reclaimed += nr_to_reclaim;
+ return;
+ }
+ }
+
+ atomic_inc(&zone->concurrent_reclaimers);
+
/* If we have no swap space, do not bother scanning anon pages. */
if (!sc->may_swap || (nr_swap_pages <= 0)) {
noswap = 1;
@@ -1655,6 +1692,9 @@ static void shrink_zone(int priority, struct zone *zone,
shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
throttle_vm_writeout(sc->gfp_mask);
+
+ atomic_dec(&zone->concurrent_reclaimers);
+ wake_up(&zone->reclaim_wait);
}
/*
Hi, Rik.
On Sat, Dec 12, 2009 at 6:46 AM, Rik van Riel <[email protected]> wrote:
> Under very heavy multi-process workloads, like AIM7, the VM can
> get into trouble in a variety of ways. The trouble start when
> there are hundreds, or even thousands of processes active in the
> page reclaim code.
>
> Not only can the system suffer enormous slowdowns because of
> lock contention (and conditional reschedules) between thousands
> of processes in the page reclaim code, but each process will try
> to free up to SWAP_CLUSTER_MAX pages, even when the system already
> has lots of memory free.
>
> It should be possible to avoid both of those issues at once, by
> simply limiting how many processes are active in the page reclaim
> code simultaneously.
>
> If too many processes are active doing page reclaim in one zone,
> simply go to sleep in shrink_zone().
>
> On wakeup, check whether enough memory has been freed already
> before jumping into the page reclaim code ourselves. We want
> to use the same threshold here that is used in the page allocator
> for deciding whether or not to call the page reclaim code in the
> first place, otherwise some unlucky processes could end up freeing
> memory for the rest of the system.
I am worried about one thing.
Now we can put too many processes on reclaim_wait in TASK_UNINTERRUPTIBLE state.
If OOM happens, the OOM killer will kill many innocent processes, since an
uninterruptible task can't handle the kill signal until it is released from
the reclaim_wait list.
I think the time spent on the reclaim_wait list could be long if VM pressure
is heavy. Is this an exaggeration?
If it is a serious problem, how about this: we add a new PF_RECLAIM_BLOCK
flag and don't pick such processes in select_bad_process().
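Roughly what that could look like (a sketch only; PF_RECLAIM_BLOCK does not
exist today, and a genuinely free PF_* bit would have to be chosen for it):

	/* hypothetical task flag: task is parked on a reclaim wait queue */
	current->flags |= PF_RECLAIM_BLOCK;
	sleep_on(&zone->reclaim_wait);
	current->flags &= ~PF_RECLAIM_BLOCK;

and in select_bad_process(), skip such tasks:

	for_each_process(p) {
		if (p->flags & PF_RECLAIM_BLOCK)
			continue;	/* don't OOM-kill a parked reclaimer */
		/* ... existing badness scoring ... */
	}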
--
Kind regards,
Minchan Kim
On 12/13/2009 07:14 PM, Minchan Kim wrote:
> On Sat, Dec 12, 2009 at 6:46 AM, Rik van Riel<[email protected]> wrote:
>> If too many processes are active doing page reclaim in one zone,
>> simply go to sleep in shrink_zone().
> I am worried about one.
>
> Now, we can put too many processes reclaim_wait with NR_UNINTERRUBTIBLE state.
> If OOM happens, OOM will kill many innocent processes since
> uninterruptible task
> can't handle kill signal until the processes free from reclaim_wait list.
>
> I think reclaim_wait list staying time might be long if VM pressure is heavy.
> Is this a exaggeration?
>
> If it is serious problem, how about this?
>
> We add new PF_RECLAIM_BLOCK flag and don't pick the process
> in select_bad_process.
A simpler solution may be to use sleep_on_interruptible, and
simply have the process continue into shrink_zone() if it
gets a signal.
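Something along these lines, perhaps (just a sketch; the follow-up patches
below end up implementing this idea with a killable wait loop):

	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&zone->reclaim_wait, &wait, TASK_KILLABLE);
		if (atomic_read(&zone->concurrent_reclaimers) <=
		    max_zone_concurrent_reclaimers)
			break;
		if (fatal_signal_pending(current))
			break;	/* stop waiting: proceed into reclaim or bail out */
		schedule();
	}
	finish_wait(&zone->reclaim_wait, &wait);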
--
All rights reversed.
On Mon, Dec 14, 2009 at 1:09 PM, Rik van Riel <[email protected]> wrote:
> On 12/13/2009 07:14 PM, Minchan Kim wrote:
>
>> On Sat, Dec 12, 2009 at 6:46 AM, Rik van Riel<[email protected]> wrote:
>
>>> If too many processes are active doing page reclaim in one zone,
>>> simply go to sleep in shrink_zone().
>
>> I am worried about one.
>>
>> Now, we can put too many processes reclaim_wait with NR_UNINTERRUBTIBLE
>> state.
>> If OOM happens, OOM will kill many innocent processes since
>> uninterruptible task
>> can't handle kill signal until the processes free from reclaim_wait list.
>>
>> I think reclaim_wait list staying time might be long if VM pressure is
>> heavy.
>> Is this a exaggeration?
>>
>> If it is serious problem, how about this?
>>
>> We add new PF_RECLAIM_BLOCK flag and don't pick the process
>> in select_bad_process.
>
> A simpler solution may be to use sleep_on_interruptible, and
> simply have the process continue into shrink_zone() if it
> gets a signal.
I thought about it, but I was not sure.
Okay, if it is possible, it's simpler. Could you repost the patch with that?
Sorry, but I have one request.
===
+The default value is 8.
+
+=============================================================
I like this. but why do you select default value as constant 8?
Do you have any reason?
I think it would be better to select the number proportional to NR_CPU.
ex) NR_CPU * 2 or something.
Otherwise looks good to me.
Pessimistically, I assume that the pageout code spends maybe
10% of its time on locking (we have seen far, far worse than
this with thousands of processes in the pageout code). That
means if we have more than 10 threads in the pageout code,
we could end up spending more time on locking and less doing
real work - slowing everybody down.
I rounded it down to the closest power of 2 to come up with
an arbitrary number that looked safe :)
===
We discussed this above.
I want to add your description to the changelog, because a long time from
now we won't remember why we selected '8' as the default value.
Your description in the changelog will explain it to follow-up people. :)
Sorry for bothering you.
> --
> All rights reversed.
>
--
Kind regards,
Minchan Kim
On 12/13/2009 11:19 PM, Minchan Kim wrote:
> On Mon, Dec 14, 2009 at 1:09 PM, Rik van Riel<[email protected]> wrote:
>> A simpler solution may be to use sleep_on_interruptible, and
>> simply have the process continue into shrink_zone() if it
>> gets a signal.
>
> I thought it but I was not sure.
> Okay. If it is possible, It' more simple.
> Could you repost patch with that?
Sure, not a problem.
> +The default value is 8.
> +
> +=============================================================
>
>
> I like this. but why do you select default value as constant 8?
> Do you have any reason?
>
> I think it would be better to select the number proportional to NR_CPU.
> ex) NR_CPU * 2 or something.
>
> Otherwise looks good to me.
>
>
> Pessimistically, I assume that the pageout code spends maybe
> 10% of its time on locking (we have seen far, far worse than
> this with thousands of processes in the pageout code). That
> means if we have more than 10 threads in the pageout code,
> we could end up spending more time on locking and less doing
> real work - slowing everybody down.
>
> I rounded it down to the closest power of 2 to come up with
> an arbitrary number that looked safe :)
> ===
>
> We discussed above.
> I want to add your desciption into changelog.
The thing is, I don't know if 8 is the best value for
the default, which is a reason I made it tunable in
the first place.
There are a lot of assumptions in my reasoning, and
they may be wrong. I suspect that documenting something
wrong is probably worse than letting people wonder about
the default (and maybe find a better one).
--
All rights reversed.
On Mon, Dec 14, 2009 at 1:29 PM, Rik van Riel <[email protected]> wrote:
> On 12/13/2009 11:19 PM, Minchan Kim wrote:
>>
>> On Mon, Dec 14, 2009 at 1:09 PM, Rik van Riel<[email protected]> wrote:
>
>>> A simpler solution may be to use sleep_on_interruptible, and
>>> simply have the process continue into shrink_zone() if it
>>> gets a signal.
>>
>> I thought it but I was not sure.
>> Okay. If it is possible, It' more simple.
>> Could you repost patch with that?
>
> Sure, not a problem.
>
>> +The default value is 8.
>> +
>> +=============================================================
>>
>>
>> I like this. but why do you select default value as constant 8?
>> Do you have any reason?
>>
>> I think it would be better to select the number proportional to
>> NR_CPU.
>> ex) NR_CPU * 2 or something.
>>
>> Otherwise looks good to me.
>>
>>
>> Pessimistically, I assume that the pageout code spends maybe
>> 10% of its time on locking (we have seen far, far worse than
>> this with thousands of processes in the pageout code). That
>> means if we have more than 10 threads in the pageout code,
>> we could end up spending more time on locking and less doing
>> real work - slowing everybody down.
>>
>> I rounded it down to the closest power of 2 to come up with
>> an arbitrary number that looked safe :)
>> ===
>>
>> We discussed above.
>> I want to add your desciption into changelog.
>
> The thing is, I don't know if 8 is the best value for
> the default, which is a reason I made it tunable in
> the first place.
>
> There are a lot of assumptions in my reasoning, and
> they may be wrong. I suspect that documenting something
> wrong is probably worse than letting people wonder out
> the default (and maybe finding a better one).
Indeed. But whenever I see magic values in the kernel, I have questions
about them.
Actually, I used to doubt this value because I guessed it was determined
by server or desktop experiments, so it is probably not appropriate for
small systems.
At least if we write down the reasoning, even if it might be wrong, later
people can judge from our description that the value is no longer
appropriate, or not appropriate for their system (e.g. a small system or
a supercomputer), and improve it.
I don't think it matters whether your reasoning is right or wrong.
The important thing is which logical reasoning determined the value.
I don't want to bother you if you mind my suggestion.
Please consider it just a nitpick. :)
> --
> All rights reversed.
>
--
Kind regards,
Minchan Kim
> Under very heavy multi-process workloads, like AIM7, the VM can
> get into trouble in a variety of ways. The trouble start when
> there are hundreds, or even thousands of processes active in the
> page reclaim code.
>
> Not only can the system suffer enormous slowdowns because of
> lock contention (and conditional reschedules) between thousands
> of processes in the page reclaim code, but each process will try
> to free up to SWAP_CLUSTER_MAX pages, even when the system already
> has lots of memory free.
>
> It should be possible to avoid both of those issues at once, by
> simply limiting how many processes are active in the page reclaim
> code simultaneously.
>
> If too many processes are active doing page reclaim in one zone,
> simply go to sleep in shrink_zone().
>
> On wakeup, check whether enough memory has been freed already
> before jumping into the page reclaim code ourselves. We want
> to use the same threshold here that is used in the page allocator
> for deciding whether or not to call the page reclaim code in the
> first place, otherwise some unlucky processes could end up freeing
> memory for the rest of the system.
This patch seems very similar to my old effort. AFAIK, there are another
two benefits.
1. Improved resource guarantees
If thousands of tasks start vmscan at the same time, they eat all the
memory reserved for PF_MEMALLOC. That might cause another dangerous
problem: some filesystems and I/O devices don't handle allocation
failure properly.
2. Improved behavior under OOM conditions
Currently, vmscan doesn't handle SIGKILL at all, so if the system is
under heavy memory pressure the OOM killer can't kill the target
process quickly. That might cause the OOM killer to kill the next
innocent process.
This patch can fix that.
> @@ -1600,6 +1612,31 @@ static void shrink_zone(int priority, struct zone *zone,
> struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
> int noswap = 0;
>
> + if (!current_is_kswapd() && atomic_read(&zone->concurrent_reclaimers) >
> + max_zone_concurrent_reclaimers &&
> + (sc->gfp_mask & (__GFP_IO|__GFP_FS)) ==
> + (__GFP_IO|__GFP_FS)) {
> + /*
> + * Do not add to the lock contention if this zone has
> + * enough processes doing page reclaim already, since
> + * we would just make things slower.
> + */
> + sleep_on(&zone->reclaim_wait);
Oops, this is a bug: sleep_on() is not SMP safe.
I made a few fix patches today; I'll post them soon.
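To spell out the race: sleep_on() cannot test a condition atomically with
queueing, so the usual check-then-sleep pattern can lose a wakeup. A
simplified illustration (not part of any patch in this thread):

	/* reclaimer A */
	if (atomic_read(&zone->concurrent_reclaimers) >
	    max_zone_concurrent_reclaimers) {
		/*
		 * Reclaimer B may run here on another CPU, decrement
		 * concurrent_reclaimers and call wake_up(&zone->reclaim_wait).
		 * A is not on the wait queue yet, so that wakeup is lost
		 * and A can sleep with nobody left to wake it.
		 */
		sleep_on(&zone->reclaim_wait);
	}

prepare_to_wait() plus re-checking the condition after queueing closes this
window, because the task is already on the wait queue when it tests the
condition again.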
BTW, the following is a measurement result from hackbench.
================
unit: sec
parameter old new
130 (5200 processes) 5.463 4.442
140 (5600 processes) 479.357 7.792
150 (6000 processes) 729.640 20.529
The concurrent_reclaimers limitation code made a mess of shrink_zone().
Introduce helper functions to increase readability.
Signed-off-by: KOSAKI Motohiro <[email protected]>
---
mm/vmscan.c | 58 +++++++++++++++++++++++++++++++++++-----------------------
1 files changed, 35 insertions(+), 23 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ecfe28c..74c36fe 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1597,25 +1597,11 @@ static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
return nr;
}
-/*
- * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
- */
-static void shrink_zone(int priority, struct zone *zone,
- struct scan_control *sc)
+static int shrink_zone_begin(struct zone *zone, struct scan_control *sc)
{
- unsigned long nr[NR_LRU_LISTS];
- unsigned long nr_to_scan;
- unsigned long percent[2]; /* anon @ 0; file @ 1 */
- enum lru_list l;
- unsigned long nr_reclaimed = sc->nr_reclaimed;
- unsigned long nr_to_reclaim = sc->nr_to_reclaim;
- struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
- int noswap = 0;
-
- if (!current_is_kswapd() && atomic_read(&zone->concurrent_reclaimers) >
- max_zone_concurrent_reclaimers &&
- (sc->gfp_mask & (__GFP_IO|__GFP_FS)) ==
- (__GFP_IO|__GFP_FS)) {
+ if (!current_is_kswapd() &&
+ atomic_read(&zone->concurrent_reclaimers) > max_zone_concurrent_reclaimers &&
+ (sc->gfp_mask & (__GFP_IO|__GFP_FS)) == (__GFP_IO|__GFP_FS)) {
/*
* Do not add to the lock contention if this zone has
* enough processes doing page reclaim already, since
@@ -1630,12 +1616,40 @@ static void shrink_zone(int priority, struct zone *zone,
if (zone_watermark_ok(zone, sc->order, low_wmark_pages(zone),
0, 0)) {
wake_up(&zone->reclaim_wait);
- sc->nr_reclaimed += nr_to_reclaim;
- return;
+ sc->nr_reclaimed += sc->nr_to_reclaim;
+ return -ERESTARTSYS;
}
}
atomic_inc(&zone->concurrent_reclaimers);
+ return 0;
+}
+
+static void shrink_zone_end(struct zone *zone, struct scan_control *sc)
+{
+ atomic_dec(&zone->concurrent_reclaimers);
+ wake_up(&zone->reclaim_wait);
+}
+
+/*
+ * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
+ */
+static void shrink_zone(int priority, struct zone *zone,
+ struct scan_control *sc)
+{
+ unsigned long nr[NR_LRU_LISTS];
+ unsigned long nr_to_scan;
+ unsigned long percent[2]; /* anon @ 0; file @ 1 */
+ enum lru_list l;
+ unsigned long nr_reclaimed = sc->nr_reclaimed;
+ unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+ struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+ int noswap = 0;
+ int ret;
+
+ ret = shrink_zone_begin(zone, sc);
+ if (ret)
+ return;
/* If we have no swap space, do not bother scanning anon pages. */
if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1692,9 +1706,7 @@ static void shrink_zone(int priority, struct zone *zone,
shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
throttle_vm_writeout(sc->gfp_mask);
-
- atomic_dec(&zone->concurrent_reclaimers);
- wake_up(&zone->reclaim_wait);
+ shrink_zone_end(zone, sc);
}
/*
--
1.6.5.2
The sleep_on() function is SMP and/or kernel preemption unsafe. We shouldn't
use it in new code.
Signed-off-by: KOSAKI Motohiro <[email protected]>
---
include/linux/wait.h | 8 ++++----
1 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/include/linux/wait.h b/include/linux/wait.h
index a48e16b..bf76627 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -427,11 +427,11 @@ static inline void remove_wait_queue_locked(wait_queue_head_t *q,
* They are racy. DO NOT use them, use the wait_event* interfaces above.
* We plan to remove these interfaces.
*/
-extern void sleep_on(wait_queue_head_t *q);
-extern long sleep_on_timeout(wait_queue_head_t *q,
+extern void __deprecated sleep_on(wait_queue_head_t *q);
+extern long __deprecated sleep_on_timeout(wait_queue_head_t *q,
signed long timeout);
-extern void interruptible_sleep_on(wait_queue_head_t *q);
-extern long interruptible_sleep_on_timeout(wait_queue_head_t *q,
+extern void __deprecated interruptible_sleep_on(wait_queue_head_t *q);
+extern long __deprecated interruptible_sleep_on_timeout(wait_queue_head_t *q,
signed long timeout);
/*
--
1.6.5.2
sleep_on() is SMP and/or kernel preemption unsafe. This patch
replaces it with safe code.
Signed-off-by: KOSAKI Motohiro <[email protected]>
---
mm/vmscan.c | 40 ++++++++++++++++++++++++++++++----------
1 files changed, 30 insertions(+), 10 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 74c36fe..3be5345 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1599,15 +1599,32 @@ static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
static int shrink_zone_begin(struct zone *zone, struct scan_control *sc)
{
- if (!current_is_kswapd() &&
- atomic_read(&zone->concurrent_reclaimers) > max_zone_concurrent_reclaimers &&
- (sc->gfp_mask & (__GFP_IO|__GFP_FS)) == (__GFP_IO|__GFP_FS)) {
- /*
- * Do not add to the lock contention if this zone has
- * enough processes doing page reclaim already, since
- * we would just make things slower.
- */
- sleep_on(&zone->reclaim_wait);
+ DEFINE_WAIT(wait);
+ wait_queue_head_t *wq = &zone->reclaim_wait;
+
+ if (current_is_kswapd())
+ goto out;
+
+ /*
+ * GFP_NOIO and GFP_NOFS mean caller may have some lock implicitly.
+ * Thus, we can't wait here. otherwise it might cause deadlock.
+ */
+ if ((sc->gfp_mask & (__GFP_IO|__GFP_FS)) != (__GFP_IO|__GFP_FS))
+ goto out;
+
+ /*
+ * Do not add to the lock contention if this zone has
+ * enough processes doing page reclaim already, since
+ * we would just make things slower.
+ */
+ for (;;) {
+ prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
+
+ if (atomic_read(&zone->concurrent_reclaimers) <=
+ max_zone_concurrent_reclaimers)
+ break;
+
+ schedule();
/*
* If other processes freed enough memory while we waited,
@@ -1615,12 +1632,15 @@ static int shrink_zone_begin(struct zone *zone, struct scan_control *sc)
*/
if (zone_watermark_ok(zone, sc->order, low_wmark_pages(zone),
0, 0)) {
- wake_up(&zone->reclaim_wait);
+ wake_up(wq);
+ finish_wait(wq, &wait);
sc->nr_reclaimed += sc->nr_to_reclaim;
return -ERESTARTSYS;
}
}
+ finish_wait(wq, &wait);
+ out:
atomic_inc(&zone->concurrent_reclaimers);
return 0;
}
--
1.6.5.2
If we don't use an exclusive wait queue, wake_up() wakes _all_ waiting
tasks. This simply wastes CPU.
Signed-off-by: KOSAKI Motohiro <[email protected]>
---
mm/vmscan.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e0cb834..3562a2d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1618,7 +1618,7 @@ static int shrink_zone_begin(struct zone *zone, struct scan_control *sc)
* we would just make things slower.
*/
for (;;) {
- prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait_exclusive(wq, &wait, TASK_UNINTERRUPTIBLE);
if (atomic_read(&zone->concurrent_reclaimers) <=
max_zone_concurrent_reclaimers)
@@ -1632,7 +1632,7 @@ static int shrink_zone_begin(struct zone *zone, struct scan_control *sc)
*/
if (zone_watermark_ok(zone, sc->order, low_wmark_pages(zone),
0, 0)) {
- wake_up(wq);
+ wake_up_all(wq);
finish_wait(wq, &wait);
sc->nr_reclaimed += sc->nr_to_reclaim;
return -ERESTARTSYS;
--
1.6.5.2
All task sleeping points in vmscan (e.g. congestion_wait) use
io_schedule(), so make shrink_zone_begin() use it too.
Signed-off-by: KOSAKI Motohiro <[email protected]>
---
mm/vmscan.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3562a2d..0880668 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1624,7 +1624,7 @@ static int shrink_zone_begin(struct zone *zone, struct scan_control *sc)
max_zone_concurrent_reclaimers)
break;
- schedule();
+ io_schedule();
/*
* If other processes freed enough memory while we waited,
--
1.6.5.2
From a latency point of view, there isn't any reason for shrink_zones() to
keep reclaiming other zones' pages once the task has reclaimed enough pages.
Signed-off-by: KOSAKI Motohiro <[email protected]>
---
mm/vmscan.c | 16 ++++++++++++----
1 files changed, 12 insertions(+), 4 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0880668..bf229d3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1654,7 +1654,7 @@ static void shrink_zone_end(struct zone *zone, struct scan_control *sc)
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
-static void shrink_zone(int priority, struct zone *zone,
+static int shrink_zone(int priority, struct zone *zone,
struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
@@ -1669,7 +1669,7 @@ static void shrink_zone(int priority, struct zone *zone,
ret = shrink_zone_begin(zone, sc);
if (ret)
- return;
+ return ret;
/* If we have no swap space, do not bother scanning anon pages. */
if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1692,6 +1692,7 @@ static void shrink_zone(int priority, struct zone *zone,
&reclaim_stat->nr_saved_scan[l]);
}
+ ret = 0;
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
for_each_evictable_lru(l) {
@@ -1712,8 +1713,10 @@ static void shrink_zone(int priority, struct zone *zone,
* with multiple processes reclaiming pages, the total
* freeing target can get unreasonably large.
*/
- if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
+ if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) {
+ ret = -ERESTARTSYS;
break;
+ }
}
sc->nr_reclaimed = nr_reclaimed;
@@ -1727,6 +1730,8 @@ static void shrink_zone(int priority, struct zone *zone,
throttle_vm_writeout(sc->gfp_mask);
shrink_zone_end(zone, sc);
+
+ return ret;
}
/*
@@ -1751,6 +1756,7 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
struct zoneref *z;
struct zone *zone;
+ int ret;
sc->all_unreclaimable = 1;
for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
@@ -1780,7 +1786,9 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
priority);
}
- shrink_zone(priority, zone, sc);
+ ret = shrink_zone(priority, zone, sc);
+ if (ret)
+ return;
}
}
--
1.6.5.2
When a fork bomb invokes the OOM killer, almost all tasks may start to
reclaim and sleep in shrink_zone_begin(). If we use TASK_UNINTERRUPTIBLE,
the OOM killer can't kill such tasks, which means we never recover from
the fork bomb.
This patch fixes that.
Signed-off-by: KOSAKI Motohiro <[email protected]>
---
mm/vmscan.c | 20 +++++++++++++-------
1 files changed, 13 insertions(+), 7 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bf229d3..e537361 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1618,7 +1618,10 @@ static int shrink_zone_begin(struct zone *zone, struct scan_control *sc)
* we would just make things slower.
*/
for (;;) {
- prepare_to_wait_exclusive(wq, &wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait_exclusive(wq, &wait, TASK_KILLABLE);
+
+ if (fatal_signal_pending(current))
+ goto stop_reclaim;
if (atomic_read(&zone->concurrent_reclaimers) <=
max_zone_concurrent_reclaimers)
@@ -1631,18 +1634,21 @@ static int shrink_zone_begin(struct zone *zone, struct scan_control *sc)
* break out of the loop and go back to the allocator.
*/
if (zone_watermark_ok(zone, sc->order, low_wmark_pages(zone),
- 0, 0)) {
- wake_up_all(wq);
- finish_wait(wq, &wait);
- sc->nr_reclaimed += sc->nr_to_reclaim;
- return -ERESTARTSYS;
- }
+ 0, 0))
+ goto found_lots_memory;
}
finish_wait(wq, &wait);
out:
atomic_inc(&zone->concurrent_reclaimers);
return 0;
+
+ found_lots_memory:
+ wake_up_all(wq);
+ stop_reclaim:
+ finish_wait(wq, &wait);
+ sc->nr_reclaimed += sc->nr_to_reclaim;
+ return -ERESTARTSYS;
}
static void shrink_zone_end(struct zone *zone, struct scan_control *sc)
--
1.6.5.2
In the OOM case, almost all processes may be in vmscan. There isn't any
reason for the killed process to continue allocating; the exiting process
frees far more pages than greedy vmscan does.
Signed-off-by: KOSAKI Motohiro <[email protected]>
---
mm/page_alloc.c | 8 ++++++++
1 files changed, 8 insertions(+), 0 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ca9cae1..8a9cbaa 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1878,6 +1878,14 @@ rebalance:
goto got_pg;
/*
+ * If the allocation is for userland page and we have fatal signal,
+ * there isn't any reason to continue allocation. instead, the task
+ * should exit soon.
+ */
+ if (fatal_signal_pending(current) && (gfp_mask & __GFP_HIGHMEM))
+ goto nopage;
+
+ /*
* If we failed to make any progress reclaiming, then we are
* running out of options and have to consider going OOM
*/
--
1.6.5.2
> btw, following is mesurement result by hackbench.
> ================
>
> unit: sec
>
> parameter old new
> 130 (5200 processes) 5.463 4.442
> 140 (5600 processes) 479.357 7.792
> 150 (6000 processes) 729.640 20.529
old means mmotm1208
new means mmotm1208 + Rik's patch + my patches
On Mon, Dec 14, 2009 at 09:24:40PM +0900, KOSAKI Motohiro wrote:
>
>
> sleep_on() function is SMP and/or kernel preemption unsafe. we shouldn't
> use it on new code.
And the best way to achieve this is to remove the function.
In Linus' current tree I find:
- 5 instances of sleep_on(), all in old and obscure block drivers
- 2 instances of sleep_on_timeout(), both in old and obscure drivers
- 28 instances of interruptible_sleep_on_timeout(), mostly in obscure
drivers with a high concentration in the old oss core which should be
killed anyway. And unfortunately a few relatively recent additions
like the SGI xpc driver or usbvision driver
- tons of instances of interruptible_sleep_on all over the drivers code
So I think you'd be better off just killing the first two ASAP instead
of just deprecating them.
For the other two, deprecating them and removing some of the horrible
drivers still using them might be best.
On 12/14/2009 07:30 AM, KOSAKI Motohiro wrote:
> if we don't use exclusive queue, wake_up() function wake _all_ waited
> task. This is simply cpu wasting.
>
> Signed-off-by: KOSAKI Motohiro<[email protected]>
> if (zone_watermark_ok(zone, sc->order, low_wmark_pages(zone),
> 0, 0)) {
> - wake_up(wq);
> + wake_up_all(wq);
> finish_wait(wq,&wait);
> sc->nr_reclaimed += sc->nr_to_reclaim;
> return -ERESTARTSYS;
I believe we want to wake the processes up one at a time
here. If the queue of waiting processes is very large
and the amount of excess free memory is fairly low, the
first processes that wake up can take the amount of free
memory back down below the threshold. The rest of the
waiters should stay asleep when this happens.
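For illustration, the pattern that gives this behaviour is an exclusive
wait combined with a plain wake_up(); a simplified sketch of the code in
this series:

	/* waiter: queue exclusively, so wake_up() wakes at most one task */
	prepare_to_wait_exclusive(&zone->reclaim_wait, &wait,
				  TASK_UNINTERRUPTIBLE);
	if (atomic_read(&zone->concurrent_reclaimers) >
	    max_zone_concurrent_reclaimers)
		schedule();
	finish_wait(&zone->reclaim_wait, &wait);

	/* waker: wakes a single exclusive waiter, which re-checks the
	 * watermark itself and only wakes the next waiter if there is
	 * still enough free memory */
	atomic_dec(&zone->concurrent_reclaimers);
	wake_up(&zone->reclaim_wait);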
--
All rights reversed.
On 12/14/2009 07:23 AM, KOSAKI Motohiro wrote:
> concurrent_reclaimers limitation related code made mess to shrink_zone.
> To introduce helper function increase readability.
>
> Signed-off-by: KOSAKI Motohiro<[email protected]>
Reviewed-by: Rik van Riel <[email protected]>
--
All rights reversed.
On 12/14/2009 07:24 AM, KOSAKI Motohiro wrote:
>
>
> sleep_on() function is SMP and/or kernel preemption unsafe. we shouldn't
> use it on new code.
>
> Signed-off-by: KOSAKI Motohiro<[email protected]>
Reviewed-by: Rik van Riel <[email protected]>
--
All rights reversed.
On 12/14/2009 07:29 AM, KOSAKI Motohiro wrote:
> sleep_on() is SMP and/or kernel preemption unsafe. This patch
> replace it with safe code.
>
> Signed-off-by: KOSAKI Motohiro<[email protected]>
Reviewed-by: Rik van Riel <[email protected]>
--
All rights reversed.
On 12/14/2009 07:30 AM, KOSAKI Motohiro wrote:
> All task sleeping point in vmscan (e.g. congestion_wait) use
> io_schedule. then shrink_zone_begin use it too.
I'm not sure we really are in io wait when waiting on this
queue, but there's a fair chance we may be, so this is a
reasonable change.
> Signed-off-by: KOSAKI Motohiro<[email protected]>
Acked-by: Rik van Riel <[email protected]>
--
All rights reversed.
On 12/14/2009 07:31 AM, KOSAKI Motohiro wrote:
>
> From latency view, There isn't any reason shrink_zones() continue to
> reclaim another zone's page if the task reclaimed enough lots pages.
IIRC there is one reason - keeping equal pageout pressure
between zones.
However, it may be enough if just kswapd keeps evening out
the pressure, now that we limit the number of concurrent
direct reclaimers in the system.
Since kswapd does not use shrink_zones ...
> Signed-off-by: KOSAKI Motohiro<[email protected]>
Reviewed-by: Rik van Riel <[email protected]>
--
All rights reversed.
On 12/14/2009 07:32 AM, KOSAKI Motohiro wrote:
> When fork bomb invoke OOM Killer, almost task might start to reclaim and
> sleep on shrink_zone_begin(). if we use TASK_UNINTERRUPTIBLE, OOM killer
> can't kill such task. it mean we never recover from fork bomb.
>
> This patch fixes it.
>
> Signed-off-by: KOSAKI Motohiro<[email protected]>
As with patch 4/8 I am not convinced that wake_up_all is
the correct thing to do.
Other than that:
Reviewed-by: Rik van Riel <[email protected]>
--
All rights reversed.
On 12/14/2009 07:32 AM, KOSAKI Motohiro wrote:
> In OOM case, almost processes may be in vmscan. There isn't any reason
> the killed process continue allocation. process exiting free lots pages
> rather than greedy vmscan.
>
> Signed-off-by: KOSAKI Motohiro<[email protected]>
Reviewed-by: Rik van Riel <[email protected]>
--
All rights reversed.
On Mon, 14 Dec 2009 08:03:02 -0500
Christoph Hellwig <[email protected]> wrote:
> On Mon, Dec 14, 2009 at 09:24:40PM +0900, KOSAKI Motohiro wrote:
> >
> >
> > sleep_on() function is SMP and/or kernel preemption unsafe. we
> > shouldn't use it on new code.
>
> And the best way to archive this is to remove the function.
>
> In Linus' current tree I find:
>
> - 5 instances of sleep_on(), all in old and obscure block drivers
> - 2 instances of sleep_on_timeout(), both in old and obscure drivers
these should both die; the sleep_on() ones by using BROKEN in Kconfig,
since sleep_on() has not worked in the 2.6 series, ever. ;)
>
> - 28 instances of interruptible_sleep_on_timeout(), mostly in obscure
> drivers with a high concentration in the old oss core which should
> be killed anyway. And unfortunately a few relatively recent additions
> like the SGI xpc driver or usbvision driver
can we also make sure that checkpatch.pl catches any new addition?
(not saying checkpatch.pl is the end-all, but the people who do run it
at least now have a chance ;-)
--
Arjan van de Ven Intel Open Source Technology Centre
For development, discussion and tips for power savings,
visit http://www.lesswatts.org
On Fri, 2009-12-11 at 16:46 -0500, Rik van Riel wrote:
Rik, the latest patch appears to have a problem, although I don't know
what the problem is yet. When the system ran out of memory we saw
thousands of runnable processes and 100% system time:
9420 2 29824 79856 62676 19564 0 0 0 0 8054 379 0 100 0 0 0
9420 2 29824 79368 62292 19564 0 0 0 0 8691 413 0 100 0 0 0
9421 1 29824 79780 61780 19820 0 0 0 0 8928 408 0 100 0 0 0
The system would not respond so I don't know what's going on yet. I'll
add debug code to figure out why it's in that state as soon as I get
access to the hardware.
Larry
> Under very heavy multi-process workloads, like AIM7, the VM can
> get into trouble in a variety of ways. The trouble start when
> there are hundreds, or even thousands of processes active in the
> page reclaim code.
>
> Not only can the system suffer enormous slowdowns because of
> lock contention (and conditional reschedules) between thousands
> of processes in the page reclaim code, but each process will try
> to free up to SWAP_CLUSTER_MAX pages, even when the system already
> has lots of memory free.
>
> It should be possible to avoid both of those issues at once, by
> simply limiting how many processes are active in the page reclaim
> code simultaneously.
>
> If too many processes are active doing page reclaim in one zone,
> simply go to sleep in shrink_zone().
>
> On wakeup, check whether enough memory has been freed already
> before jumping into the page reclaim code ourselves. We want
> to use the same threshold here that is used in the page allocator
> for deciding whether or not to call the page reclaim code in the
> first place, otherwise some unlucky processes could end up freeing
> memory for the rest of the system.
>
> Reported-by: Larry Woodman <[email protected]>
> Signed-off-by: Rik van Riel <[email protected]>
>
> ---
> v2:
> - fix typos in sysctl.c and vm.txt
> - move the code in sysctl.c out from under the ifdef
> - only __GFP_FS|__GFP_IO tasks can wait
>
> Documentation/sysctl/vm.txt | 18 ++++++++++++++
> include/linux/mmzone.h | 4 +++
> include/linux/swap.h | 1 +
> kernel/sysctl.c | 7 +++++
> mm/page_alloc.c | 3 ++
> mm/vmscan.c | 40 +++++++++++++++++++++++++++++++++
> 6 files changed, 73 insertions(+), 0 deletions(-)
>
> diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
> index fc5790d..8bd1a96 100644
> --- a/Documentation/sysctl/vm.txt
> +++ b/Documentation/sysctl/vm.txt
> @@ -32,6 +32,7 @@ Currently, these files are in /proc/sys/vm:
> - legacy_va_layout
> - lowmem_reserve_ratio
> - max_map_count
> +- max_zone_concurrent_reclaimers
> - memory_failure_early_kill
> - memory_failure_recovery
> - min_free_kbytes
> @@ -278,6 +279,23 @@ The default value is 65536.
>
> =============================================================
>
> +max_zone_concurrent_reclaimers:
> +
> +The number of processes that are allowed to simultaneously reclaim
> +memory from a particular memory zone.
> +
> +With certain workloads, hundreds of processes end up in the page
> +reclaim code simultaneously. This can cause large slowdowns due
> +to lock contention, freeing of way too much memory and occasionally
> +false OOM kills.
> +
> +To avoid these problems, only allow a smaller number of processes
> +to reclaim pages from each memory zone simultaneously.
> +
> +The default value is 8.
> +
> +=============================================================
> +
> memory_failure_early_kill:
>
> Control how to kill processes when uncorrected memory error (typically
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 30fe668..ed614b8 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -345,6 +345,10 @@ struct zone {
> /* Zone statistics */
> atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
>
> + /* Number of processes running page reclaim code on this zone. */
> + atomic_t concurrent_reclaimers;
> + wait_queue_head_t reclaim_wait;
> +
> /*
> * prev_priority holds the scanning priority for this zone. It is
> * defined as the scanning priority at which we achieved our reclaim
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index a2602a8..661eec7 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -254,6 +254,7 @@ extern unsigned long shrink_all_memory(unsigned long nr_pages);
> extern int vm_swappiness;
> extern int remove_mapping(struct address_space *mapping, struct page *page);
> extern long vm_total_pages;
> +extern int max_zone_concurrent_reclaimers;
>
> #ifdef CONFIG_NUMA
> extern int zone_reclaim_mode;
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 6ff0ae6..4ec17ed 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -1271,6 +1271,13 @@ static struct ctl_table vm_table[] = {
> .extra2 = &one,
> },
> #endif
> + {
> + .procname = "max_zone_concurrent_reclaimers",
> + .data = &max_zone_concurrent_reclaimers,
> + .maxlen = sizeof(max_zone_concurrent_reclaimers),
> + .mode = 0644,
> + .proc_handler = &proc_dointvec,
> + },
>
> /*
> * NOTE: do not add new entries to this table unless you have read
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 11ae66e..ca9cae1 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3852,6 +3852,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
>
> zone->prev_priority = DEF_PRIORITY;
>
> + atomic_set(&zone->concurrent_reclaimers, 0);
> + init_waitqueue_head(&zone->reclaim_wait);
> +
> zone_pcp_init(zone);
> for_each_lru(l) {
> INIT_LIST_HEAD(&zone->lru[l].list);
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 2bbee91..ecfe28c 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -40,6 +40,7 @@
> #include <linux/memcontrol.h>
> #include <linux/delayacct.h>
> #include <linux/sysctl.h>
> +#include <linux/wait.h>
>
> #include <asm/tlbflush.h>
> #include <asm/div64.h>
> @@ -129,6 +130,17 @@ struct scan_control {
> int vm_swappiness = 60;
> long vm_total_pages; /* The total number of pages which the VM controls */
>
> +/*
> + * Maximum number of processes concurrently running the page
> + * reclaim code in a memory zone. Having too many processes
> + * just results in them burning CPU time waiting for locks,
> + * so we're better off limiting page reclaim to a sane number
> + * of processes at a time. We do this per zone so local node
> + * reclaim on one NUMA node will not block other nodes from
> + * making progress.
> + */
> +int max_zone_concurrent_reclaimers = 8;
> +
> static LIST_HEAD(shrinker_list);
> static DECLARE_RWSEM(shrinker_rwsem);
>
> @@ -1600,6 +1612,31 @@ static void shrink_zone(int priority, struct zone *zone,
> struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
> int noswap = 0;
>
> + if (!current_is_kswapd() && atomic_read(&zone->concurrent_reclaimers) >
> + max_zone_concurrent_reclaimers &&
> + (sc->gfp_mask & (__GFP_IO|__GFP_FS)) ==
> + (__GFP_IO|__GFP_FS)) {
> + /*
> + * Do not add to the lock contention if this zone has
> + * enough processes doing page reclaim already, since
> + * we would just make things slower.
> + */
> + sleep_on(&zone->reclaim_wait);
> +
> + /*
> + * If other processes freed enough memory while we waited,
> + * break out of the loop and go back to the allocator.
> + */
> + if (zone_watermark_ok(zone, sc->order, low_wmark_pages(zone),
> + 0, 0)) {
> + wake_up(&zone->reclaim_wait);
> + sc->nr_reclaimed += nr_to_reclaim;
> + return;
> + }
> + }
> +
> + atomic_inc(&zone->concurrent_reclaimers);
> +
> /* If we have no swap space, do not bother scanning anon pages. */
> if (!sc->may_swap || (nr_swap_pages <= 0)) {
> noswap = 1;
> @@ -1655,6 +1692,9 @@ static void shrink_zone(int priority, struct zone *zone,
> shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
>
> throttle_vm_writeout(sc->gfp_mask);
> +
> + atomic_dec(&zone->concurrent_reclaimers);
> + wake_up(&zone->reclaim_wait);
> }
>
> /*
>
On Mon, 14 Dec 2009 21:23:46 +0900 (JST)
KOSAKI Motohiro <[email protected]> wrote:
> concurrent_reclaimers limitation related code made mess to shrink_zone.
> To introduce helper function increase readability.
>
> Signed-off-by: KOSAKI Motohiro <[email protected]>
Reviewed-by: Minchan Kim <[email protected]>
Through looking over patch series, it's nice clean up.
--
Kind regards,
Minchan Kim
On Mon, 14 Dec 2009 21:24:40 +0900 (JST)
KOSAKI Motohiro <[email protected]> wrote:
>
>
> sleep_on() function is SMP and/or kernel preemption unsafe. we shouldn't
> use it on new code.
>
> Signed-off-by: KOSAKI Motohiro <[email protected]>
It would be better to remove this function, but deprecating it is enough
for this patch series. We can remove sleep_on() in another patch series.
Reviewed-by: Minchan Kim <[email protected]>
--
Kind regards,
Minchan Kim
On Mon, 14 Dec 2009 21:29:28 +0900 (JST)
KOSAKI Motohiro <[email protected]> wrote:
> sleep_on() is SMP and/or kernel preemption unsafe. This patch
> replace it with safe code.
>
> Signed-off-by: KOSAKI Motohiro <[email protected]>
Reviewed-by: Minchan Kim <[email protected]>
--
Kind regards,
Minchan Kim
On Mon, 14 Dec 2009 21:30:19 +0900 (JST)
KOSAKI Motohiro <[email protected]> wrote:
> if we don't use exclusive queue, wake_up() function wake _all_ waited
> task. This is simply cpu wasting.
>
> Signed-off-by: KOSAKI Motohiro <[email protected]>
> ---
> mm/vmscan.c | 4 ++--
> 1 files changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index e0cb834..3562a2d 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1618,7 +1618,7 @@ static int shrink_zone_begin(struct zone *zone, struct scan_control *sc)
> * we would just make things slower.
> */
> for (;;) {
> - prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
> + prepare_to_wait_exclusive(wq, &wait, TASK_UNINTERRUPTIBLE);
>
> if (atomic_read(&zone->concurrent_reclaimers) <=
> max_zone_concurrent_reclaimers)
> @@ -1632,7 +1632,7 @@ static int shrink_zone_begin(struct zone *zone, struct scan_control *sc)
> */
> if (zone_watermark_ok(zone, sc->order, low_wmark_pages(zone),
> 0, 0)) {
> - wake_up(wq);
> + wake_up_all(wq);
I think it's a typo: the description in the changelog says we want wake_up().
Otherwise, looks good to me.
Reviewed-by: Minchan Kim <[email protected]>
--
Kind regards,
Minchan Kim
On Mon, 14 Dec 2009 21:30:54 +0900 (JST)
KOSAKI Motohiro <[email protected]> wrote:
> All task sleeping point in vmscan (e.g. congestion_wait) use
> io_schedule. then shrink_zone_begin use it too.
>
> Signed-off-by: KOSAKI Motohiro <[email protected]>
> ---
> mm/vmscan.c | 2 +-
> 1 files changed, 1 insertions(+), 1 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 3562a2d..0880668 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1624,7 +1624,7 @@ static int shrink_zone_begin(struct zone *zone, struct scan_control *sc)
> max_zone_concurrent_reclaimers)
> break;
>
> - schedule();
> + io_schedule();
Hmm. We have many cond_resched() calls in vmscan.c which are not io_schedule().
In addition, if the system has no swap space and runs out of page cache
due to heavy memory pressure, the VM might scan and drop pages until the
priority reaches zero or the zone becomes unreclaimable.
I think that would not be an I/O wait.
>
> /*
> * If other processes freed enough memory while we waited,
> --
> 1.6.5.2
>
>
>
--
Kind regards,
Minchan Kim
> On 12/14/2009 07:31 AM, KOSAKI Motohiro wrote:
> >
> > From latency view, There isn't any reason shrink_zones() continue to
> > reclaim another zone's page if the task reclaimed enough lots pages.
>
> IIRC there is one reason - keeping equal pageout pressure
> between zones.
>
> However, it may be enough if just kswapd keeps evening out
> the pressure, now that we limit the number of concurrent
> direct reclaimers in the system.
>
> Since kswapd does not use shrink_zones ...
Sure. That's exactly my point.
Plus, balance_pgdat() scans only one node, so zone balancing is
meaningful there. But shrink_zones() scans all zones in all nodes; we
don't need inter-node balancing there, it's vmscan's business.
> > Signed-off-by: KOSAKI Motohiro<[email protected]>
>
> Reviewed-by: Rik van Riel <[email protected]>
Thanks.
On Mon, 14 Dec 2009 21:32:18 +0900 (JST)
KOSAKI Motohiro <[email protected]> wrote:
> When fork bomb invoke OOM Killer, almost task might start to reclaim and
> sleep on shrink_zone_begin(). if we use TASK_UNINTERRUPTIBLE, OOM killer
> can't kill such task. it mean we never recover from fork bomb.
>
> This patch fixes it.
>
> Signed-off-by: KOSAKI Motohiro <[email protected]>
I doubt wake_up_all too; I think it's a typo.
Otherwise, looks good to me.
Reviewed-by: Minchan Kim <[email protected]>
--
Kind regards,
Minchan Kim
On Mon, 14 Dec 2009 21:32:58 +0900 (JST)
KOSAKI Motohiro <[email protected]> wrote:
> In OOM case, almost processes may be in vmscan. There isn't any reason
> the killed process continue allocation. process exiting free lots pages
> rather than greedy vmscan.
>
> Signed-off-by: KOSAKI Motohiro <[email protected]>
> ---
> mm/page_alloc.c | 8 ++++++++
> 1 files changed, 8 insertions(+), 0 deletions(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index ca9cae1..8a9cbaa 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1878,6 +1878,14 @@ rebalance:
> goto got_pg;
>
> /*
> + * If the allocation is for userland page and we have fatal signal,
> + * there isn't any reason to continue allocation. instead, the task
> + * should exit soon.
> + */
> + if (fatal_signal_pending(current) && (gfp_mask & __GFP_HIGHMEM))
> + goto nopage;
If we jump to nopage, we meet dump_stack() and show_mem().
We can even meet OOM, which might kill an innocent process.
> +
> + /*
> * If we failed to make any progress reclaiming, then we are
> * running out of options and have to consider going OOM
> */
> --
> 1.6.5.2
>
>
>
--
Kind regards,
Minchan Kim
On Mon, 14 Dec 2009 21:31:36 +0900 (JST)
KOSAKI Motohiro <[email protected]> wrote:
>
> From latency view, There isn't any reason shrink_zones() continue to
> reclaim another zone's page if the task reclaimed enough lots pages.
>
> Signed-off-by: KOSAKI Motohiro <[email protected]>
> ---
> mm/vmscan.c | 16 ++++++++++++----
> 1 files changed, 12 insertions(+), 4 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 0880668..bf229d3 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1654,7 +1654,7 @@ static void shrink_zone_end(struct zone *zone, struct scan_control *sc)
> /*
> * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
> */
> -static void shrink_zone(int priority, struct zone *zone,
> +static int shrink_zone(int priority, struct zone *zone,
> struct scan_control *sc)
> {
> unsigned long nr[NR_LRU_LISTS];
> @@ -1669,7 +1669,7 @@ static void shrink_zone(int priority, struct zone *zone,
>
> ret = shrink_zone_begin(zone, sc);
> if (ret)
> - return;
> + return ret;
>
> /* If we have no swap space, do not bother scanning anon pages. */
> if (!sc->may_swap || (nr_swap_pages <= 0)) {
> @@ -1692,6 +1692,7 @@ static void shrink_zone(int priority, struct zone *zone,
> &reclaim_stat->nr_saved_scan[l]);
> }
>
> + ret = 0;
> while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
> nr[LRU_INACTIVE_FILE]) {
> for_each_evictable_lru(l) {
> @@ -1712,8 +1713,10 @@ static void shrink_zone(int priority, struct zone *zone,
> * with multiple processes reclaiming pages, the total
> * freeing target can get unreasonably large.
> */
> - if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
> + if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) {
> + ret = -ERESTARTSYS;
Just a nitpick:
does shrink_zone()'s return value matter?
shrink_zones() never handles it.
> break;
> + }
> }
>
> sc->nr_reclaimed = nr_reclaimed;
> @@ -1727,6 +1730,8 @@ static void shrink_zone(int priority, struct zone *zone,
>
> throttle_vm_writeout(sc->gfp_mask);
> shrink_zone_end(zone, sc);
> +
> + return ret;
> }
>
> /*
> @@ -1751,6 +1756,7 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
> enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
> struct zoneref *z;
> struct zone *zone;
> + int ret;
>
> sc->all_unreclaimable = 1;
> for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
> @@ -1780,7 +1786,9 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
> priority);
> }
>
> - shrink_zone(priority, zone, sc);
> + ret = shrink_zone(priority, zone, sc);
> + if (ret)
> + return;
> }
> }
> --
> 1.6.5.2
>
>
>
As a matter of fact, I am worried about this patch.
My opinion is that we should put this patch aside until we can solve
Larry's problem; we could apply it in the future.
I don't want to see side effects while we focus on Larry's problem.
But if you mind my suggestion, I won't bother you with this nitpick.
Thanks for the great cleanup and for improving the VM, Kosaki. :)
--
Kind regards,
Minchan Kim
> On Mon, 14 Dec 2009 21:31:36 +0900 (JST)
> KOSAKI Motohiro <[email protected]> wrote:
>
> >
> > From latency view, There isn't any reason shrink_zones() continue to
> > reclaim another zone's page if the task reclaimed enough lots pages.
> >
> > Signed-off-by: KOSAKI Motohiro <[email protected]>
> > ---
> > mm/vmscan.c | 16 ++++++++++++----
> > 1 files changed, 12 insertions(+), 4 deletions(-)
> >
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 0880668..bf229d3 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -1654,7 +1654,7 @@ static void shrink_zone_end(struct zone *zone, struct scan_control *sc)
> > /*
> > * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
> > */
> > -static void shrink_zone(int priority, struct zone *zone,
> > +static int shrink_zone(int priority, struct zone *zone,
> > struct scan_control *sc)
> > {
> > unsigned long nr[NR_LRU_LISTS];
> > @@ -1669,7 +1669,7 @@ static void shrink_zone(int priority, struct zone *zone,
> >
> > ret = shrink_zone_begin(zone, sc);
> > if (ret)
> > - return;
> > + return ret;
> >
> > /* If we have no swap space, do not bother scanning anon pages. */
> > if (!sc->may_swap || (nr_swap_pages <= 0)) {
> > @@ -1692,6 +1692,7 @@ static void shrink_zone(int priority, struct zone *zone,
> > &reclaim_stat->nr_saved_scan[l]);
> > }
> >
> > + ret = 0;
> > while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
> > nr[LRU_INACTIVE_FILE]) {
> > for_each_evictable_lru(l) {
> > @@ -1712,8 +1713,10 @@ static void shrink_zone(int priority, struct zone *zone,
> > * with multiple processes reclaiming pages, the total
> > * freeing target can get unreasonably large.
> > */
> > - if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
> > + if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) {
> > + ret = -ERESTARTSYS;
>
> Just nitpick.
>
> shrink_zone's return value is matter?
> shrink_zones never handle that.
shrink_zones() stops vmscan quickly if ret is non-zero.
If we have already reclaimed more than nr_to_reclaim, we can stop vmscan.
> As a matter of fact, I am worried about this patch.
>
> My opinion is we put aside this patch until we can solve Larry's problem.
> We could apply this patch in future.
>
> I don't want to see the side effect while we focus Larry's problem.
> But If you mind my suggestion, I also will not bother you by this nitpick.
>
> Thanks for great cleanup and improving VM, Kosaki. :)
I agree that Larry's issue is the highest priority.
> On 12/14/2009 07:30 AM, KOSAKI Motohiro wrote:
> > if we don't use exclusive queue, wake_up() function wake _all_ waited
> > task. This is simply cpu wasting.
> >
> > Signed-off-by: KOSAKI Motohiro<[email protected]>
>
> > if (zone_watermark_ok(zone, sc->order, low_wmark_pages(zone),
> > 0, 0)) {
> > - wake_up(wq);
> > + wake_up_all(wq);
> > finish_wait(wq,&wait);
> > sc->nr_reclaimed += sc->nr_to_reclaim;
> > return -ERESTARTSYS;
>
> I believe we want to wake the processes up one at a time
> here. If the queue of waiting processes is very large
> and the amount of excess free memory is fairly low, the
> first processes that wake up can take the amount of free
> memory back down below the threshold. The rest of the
> waiters should stay asleep when this happens.
OK.
Actually, wake_up() and wake_up_all() aren't so different here: even with
wake_up(), each woken task wakes the next task before trying to allocate
memory, so the end result is similar to wake_up_all().
However, there is a small difference. The recent scheduler latency
improvement effort reduced the default scheduler latency target, which
means that with lots of tasks in the running state each task gets a very
small time slice, and too-frequent context switching decreases VM
efficiency.
Thank you, Rik. I hadn't noticed that wake_up() performs better than
wake_up_all() on current kernels.
Subject: [PATCH 9/8] replace wake_up_all with wake_up
Fix typo.
Signed-off-by: KOSAKI Motohiro <[email protected]>
---
mm/vmscan.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e5adb7a..b3b4e77 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1644,7 +1644,7 @@ static int shrink_zone_begin(struct zone *zone, struct scan_control *sc)
return 0;
found_lots_memory:
- wake_up_all(wq);
+ wake_up(wq);
stop_reclaim:
finish_wait(wq, &wait);
sc->nr_reclaimed += sc->nr_to_reclaim;
--
1.6.5.2
> On Fri, 2009-12-11 at 16:46 -0500, Rik van Riel wrote:
>
> Rik, the latest patch appears to have a problem, although I don't know
> what the problem is yet. When the system ran out of memory we see
> thousands of runnable processes and 100% system time:
>
>
> 9420 2 29824 79856 62676 19564 0 0 0 0 8054 379 0 100 0 0 0
> 9420 2 29824 79368 62292 19564 0 0 0 0 8691 413 0 100 0 0 0
> 9421 1 29824 79780 61780 19820 0 0 0 0 8928 408 0 100 0 0 0
>
> The system would not respond, so I don't know what's going on yet. I'll
> add debug code to figure out why it's in that state as soon as I get
> access to the hardware.
>
> Larry
There are 9421 running processes. That means the concurrent-task limitation
doesn't work well. Hmm?
> > /*
> > + * If the allocation is for userland page and we have fatal signal,
> > + * there isn't any reason to continue allocation. instead, the task
> > + * should exit soon.
> > + */
> > + if (fatal_signal_pending(current) && (gfp_mask & __GFP_HIGHMEM))
> > + goto nopage;
>
> If we jump to nopage, we hit dump_stack and show_mem.
> We can even hit the OOM killer, which might kill an innocent process.
Which point do you oppose? Would not printing be better?
> On Mon, 14 Dec 2009 21:30:54 +0900 (JST)
> KOSAKI Motohiro <[email protected]> wrote:
>
> > All task sleeping points in vmscan (e.g. congestion_wait) use
> > io_schedule(), so shrink_zone_begin() should use it too.
> >
> > Signed-off-by: KOSAKI Motohiro <[email protected]>
> > ---
> > mm/vmscan.c | 2 +-
> > 1 files changed, 1 insertions(+), 1 deletions(-)
> >
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 3562a2d..0880668 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -1624,7 +1624,7 @@ static int shrink_zone_begin(struct zone *zone, struct scan_control *sc)
> > max_zone_concurrent_reclaimers)
> > break;
> >
> > - schedule();
> > + io_schedule();
>
> Hmm. We have many cond_resched() calls in vmscan.c, and those are not io_schedule().
cond_resched() doesn't mean sleeping on a wait queue; it's similar to yield().
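To illustrate the distinction, a fragment-style sketch that paraphrases the
sleep in shrink_zone_begin() rather than quoting it; the wait state and the
exact condition are assumptions:

	DEFINE_WAIT(wait);

	/*
	 * cond_resched(): the task stays runnable and only gives up the
	 * CPU if a reschedule is pending; nothing is accounted as iowait.
	 */
	cond_resched();

	/*
	 * The sleep in shrink_zone_begin() (paraphrased): the task is
	 * parked on the zone's wait queue until woken, and io_schedule()
	 * accounts the blocked time as iowait.
	 */
	prepare_to_wait(&zone->reclaim_wait, &wait, TASK_UNINTERRUPTIBLE);
	if (atomic_read(&zone->concurrent_reclaimers) >=
					max_zone_concurrent_reclaimers)
		io_schedule();
	finish_wait(&zone->reclaim_wait, &wait);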
> In addition, if the system has no swap space and runs out of page cache
> due to heavy memory pressure, the VM might scan & drop pages until priority
> reaches zero or the zone becomes unreclaimable.
>
> I think that would not be an I/O wait.
Two points.
1. For a long time, administrators have watched iowait% under heavy memory
   pressure. I don't want to change that without a good reason.
2. iowait gives the scheduler a small bonus, and a vmscan task should get
   more time slice than a memory-consuming task. That improves VM stability.
But I agree the benefit isn't that big. If we have a good reason, I don't
oppose using schedule().
On Tue, 15 Dec 2009 09:50:47 +0900 (JST)
KOSAKI Motohiro <[email protected]> wrote:
> > > /*
> > > + * If the allocation is for userland page and we have fatal signal,
> > > + * there isn't any reason to continue allocation. instead, the task
> > > + * should exit soon.
> > > + */
> > > + if (fatal_signal_pending(current) && (gfp_mask & __GFP_HIGHMEM))
> > > + goto nopage;
> >
> > If we jump nopage, we meets dump_stack and show_mem.
> > Even, we can meet OOM which might kill innocent process.
>
> Which point you oppose? noprint is better?
>
>
Sorry for not being clear.
My point was as follows.
First,
I don't want to print.
Why do we dump the stack and memory info when the process has received SIGKILL?
Second,
1) A process tries to allocate an anon page in do_anonymous_page.
2) The process receives SIGKILL.
3) With your patch, the kernel doesn't allocate a page to the process.
4) do_anonymous_page returns VM_FAULT_OOM.
5) mm_fault_error is called.
6) out_of_memory is called.
7) It might kill an innocent task.
If I missed something, please correct me. :)
--
Kind regards,
Minchan Kim
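For reference, a rough condensation of the chain Minchan lists; this is a
hypothetical helper, not the real fault handler, and the function name and
simplified return values are assumptions:

/*
 * Once the allocator refuses the page because of the pending SIGKILL,
 * the anonymous fault has nothing to map, reports VM_FAULT_OOM, and the
 * arch fault handler escalates that via mm_fault_error() to
 * out_of_memory(), which may pick an unrelated victim.
 */
static int anon_fault_sketch(struct vm_area_struct *vma, unsigned long address)
{
	struct page *page;

	page = alloc_zeroed_user_highpage_movable(vma, address);
	if (!page)			/* step 3: allocation refused */
		return VM_FAULT_OOM;	/* steps 4-7 follow from here */

	/* normal path: the page would be mapped into the process here */
	page_cache_release(page);
	return 0;
}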
On Tue, Dec 15, 2009 at 9:56 AM, KOSAKI Motohiro
<[email protected]> wrote:
>> On Mon, 14 Dec 2009 21:30:54 +0900 (JST)
>> KOSAKI Motohiro <[email protected]> wrote:
>>
>> > All task sleeping point in vmscan (e.g. congestion_wait) use
>> > io_schedule. then shrink_zone_begin use it too.
>> >
>> > Signed-off-by: KOSAKI Motohiro <[email protected]>
>> > ---
>> > mm/vmscan.c | 2 +-
>> > 1 files changed, 1 insertions(+), 1 deletions(-)
>> >
>> > diff --git a/mm/vmscan.c b/mm/vmscan.c
>> > index 3562a2d..0880668 100644
>> > --- a/mm/vmscan.c
>> > +++ b/mm/vmscan.c
>> > @@ -1624,7 +1624,7 @@ static int shrink_zone_begin(struct zone *zone, struct scan_control *sc)
>> > max_zone_concurrent_reclaimers)
>> > break;
>> >
>> > - schedule();
>> > + io_schedule();
>>
>> Hmm. We have many cond_resched which is not io_schedule in vmscan.c.
>
> cond_resched don't mean sleep on wait queue. it's similar to yield.
I had confused the two.
Thanks for correcting me. :)
>
>> In addition, if system doesn't have swap device space and out of page cache
>> due to heavy memory pressue, VM might scan & drop pages until priority is zero
>> or zone is unreclaimable.
>>
>> I think it would be not a IO wait.
>
> two point.
> 1. For long time, Administrator usually watch iowait% at heavy memory pressure. I
> don't hope change this without reasonable reason. 2. iowait makes scheduler
> bonus a bit, vmscan task should have many time slice than memory consume
> task. it improve VM stabilization.
AFAIK, the CFS scheduler doesn't give a bonus to I/O-wait tasks any more.
>
> but I agree the benefit isn't so big. if we have reasonable reason, I
> don't oppose use schedule().
>
>
>
>
--
Kind regards,
Minchan Kim
> On Tue, 15 Dec 2009 09:50:47 +0900 (JST)
> KOSAKI Motohiro <[email protected]> wrote:
>
> > > > /*
> > > > + * If the allocation is for userland page and we have fatal signal,
> > > > + * there isn't any reason to continue allocation. instead, the task
> > > > + * should exit soon.
> > > > + */
> > > > + if (fatal_signal_pending(current) && (gfp_mask & __GFP_HIGHMEM))
> > > > + goto nopage;
> > >
> > > If we jump nopage, we meets dump_stack and show_mem.
> > > Even, we can meet OOM which might kill innocent process.
> >
> > Which point you oppose? noprint is better?
> >
> >
>
> Sorry fot not clarity.
> My point was following as.
>
> First,
> I don't want to print.
> Why do we print stack and mem when the process receives the SIGKILL?
>
> Second,
> 1) A process try to allocate anon page in do_anonymous_page.
> 2) A process receives SIGKILL.
> 3) kernel doesn't allocate page to A process by your patch.
> 4) do_anonymous_page returns VF_FAULT_OOM.
> 5) call mm_fault_error
> 6) call out_of_memory
> 7) It migth kill innocent task.
>
> If I missed something, Pz, corret me. :)
Doh, you are completely right. I had forgotten the recent change in the meaning of VM_FAULT_OOM.
Yes, this patch is crap. I need to redo it.
On Tue, 2009-12-15 at 09:45 +0900, KOSAKI Motohiro wrote:
> > On 12/14/2009 07:30 AM, KOSAKI Motohiro wrote:
> > > if we don't use exclusive queue, wake_up() function wake _all_ waited
> > > task. This is simply cpu wasting.
> > >
> > > Signed-off-by: KOSAKI Motohiro<[email protected]>
> >
> > > if (zone_watermark_ok(zone, sc->order, low_wmark_pages(zone),
> > > 0, 0)) {
> > > - wake_up(wq);
> > > + wake_up_all(wq);
> > > finish_wait(wq,&wait);
> > > sc->nr_reclaimed += sc->nr_to_reclaim;
> > > return -ERESTARTSYS;
> >
> > I believe we want to wake the processes up one at a time
> > here. If the queue of waiting processes is very large
> > and the amount of excess free memory is fairly low, the
> > first processes that wake up can take the amount of free
> > memory back down below the threshold. The rest of the
> > waiters should stay asleep when this happens.
>
> OK.
>
> Actually, wake_up() and wake_up_all() aren't different so much.
> Although we use wake_up(), the task wake up next task before
> try to alloate memory. then, it's similar to wake_up_all().
What happens to waiters should running tasks not allocate for a while?
> However, there are few difference. recent scheduler latency improvement
> effort reduce default scheduler latency target. it mean, if we have
> lots tasks of running state, the task have very few time slice. too
> frequently context switch decrease VM efficiency.
> Thank you, Rik. I didn't notice wake_up() makes better performance than
> wake_up_all() on current kernel.
Perhaps this is a spot where an explicit wake_up_all_nopreempt() would
be handy. Excessive wakeup preemption from wake_up_all() has long been
annoying when there are many waiters, but converting it to only have the
first wakeup be preemptive proved harmful to performance. Recent tweaks
will have aggravated the problem somewhat, but it's not new.
-Mike
On Tue, 2009-12-15 at 06:32 +0100, Mike Galbraith wrote:
> On Tue, 2009-12-15 at 09:45 +0900, KOSAKI Motohiro wrote:
> > > On 12/14/2009 07:30 AM, KOSAKI Motohiro wrote:
> > > > if we don't use exclusive queue, wake_up() function wake _all_ waited
> > > > task. This is simply cpu wasting.
> > > >
> > > > Signed-off-by: KOSAKI Motohiro<[email protected]>
> > >
> > > > if (zone_watermark_ok(zone, sc->order, low_wmark_pages(zone),
> > > > 0, 0)) {
> > > > - wake_up(wq);
> > > > + wake_up_all(wq);
> > > > finish_wait(wq,&wait);
> > > > sc->nr_reclaimed += sc->nr_to_reclaim;
> > > > return -ERESTARTSYS;
> > >
> > > I believe we want to wake the processes up one at a time
> > > here. If the queue of waiting processes is very large
> > > and the amount of excess free memory is fairly low, the
> > > first processes that wake up can take the amount of free
> > > memory back down below the threshold. The rest of the
> > > waiters should stay asleep when this happens.
> >
> > OK.
> >
> > Actually, wake_up() and wake_up_all() aren't different so much.
> > Although we use wake_up(), the task wake up next task before
> > try to alloate memory. then, it's similar to wake_up_all().
>
> What happens to waiters should running tasks not allocate for a while?
>
> > However, there are few difference. recent scheduler latency improvement
> > effort reduce default scheduler latency target. it mean, if we have
> > lots tasks of running state, the task have very few time slice. too
> > frequently context switch decrease VM efficiency.
> > Thank you, Rik. I didn't notice wake_up() makes better performance than
> > wake_up_all() on current kernel.
>
> Perhaps this is a spot where an explicit wake_up_all_nopreempt() would
> be handy....
Maybe something like below. I can also imagine that under _heavy_ vm
pressure, it'd likely be good for throughput to not provide for sleeper
fairness for these wakeups as well, as that increases vruntime spread,
and thus increases preemption with no benefit in sight.
---
include/linux/sched.h | 1 +
include/linux/wait.h | 3 +++
kernel/sched.c | 21 +++++++++++++++++++++
kernel/sched_fair.c | 2 +-
4 files changed, 26 insertions(+), 1 deletion(-)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1065,6 +1065,7 @@ struct sched_domain;
*/
#define WF_SYNC 0x01 /* waker goes to sleep after wakup */
#define WF_FORK 0x02 /* child wakeup after fork */
+#define WF_NOPREEMPT 0x04 /* wakeup is not preemptive */
struct sched_class {
const struct sched_class *next;
Index: linux-2.6/include/linux/wait.h
===================================================================
--- linux-2.6.orig/include/linux/wait.h
+++ linux-2.6/include/linux/wait.h
@@ -140,6 +140,7 @@ static inline void __remove_wait_queue(w
}
void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
+void __wake_up_nopreempt(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr,
void *key);
@@ -154,8 +155,10 @@ int out_of_line_wait_on_bit_lock(void *,
wait_queue_head_t *bit_waitqueue(void *, int);
#define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL)
+#define wake_up_nopreempt(x) __wake_up_nopreempt(x, TASK_NORMAL, 1, NULL)
#define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL)
#define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL)
+#define wake_up_all_nopreempt(x) __wake_up_nopreempt(x, TASK_NORMAL, 0, NULL)
#define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL)
#define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -5682,6 +5682,27 @@ void __wake_up(wait_queue_head_t *q, uns
}
EXPORT_SYMBOL(__wake_up);
+/**
+ * __wake_up_nopreempt - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void __wake_up_nopreempt(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, void *key)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_common(q, mode, nr_exclusive, WF_NOPREEMPT, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(__wake_up_nopreempt);
+
/*
* Same as __wake_up but called with the spinlock in wait_queue_head_t held.
*/
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -1709,7 +1709,7 @@ static void check_preempt_wakeup(struct
pse->avg_overlap < sysctl_sched_migration_cost)
goto preempt;
- if (!sched_feat(WAKEUP_PREEMPT))
+ if (!sched_feat(WAKEUP_PREEMPT) || (wake_flags & WF_NOPREEMPT))
return;
update_curr(cfs_rq);
On Tue, 2009-12-15 at 09:29 +0100, Mike Galbraith wrote:
> On Tue, 2009-12-15 at 06:32 +0100, Mike Galbraith wrote:
> > On Tue, 2009-12-15 at 09:45 +0900, KOSAKI Motohiro wrote:
> > > > On 12/14/2009 07:30 AM, KOSAKI Motohiro wrote:
> > > > > if we don't use exclusive queue, wake_up() function wake _all_ waited
> > > > > task. This is simply cpu wasting.
> > > > >
> > > > > Signed-off-by: KOSAKI Motohiro<[email protected]>
> > > >
> > > > > if (zone_watermark_ok(zone, sc->order, low_wmark_pages(zone),
> > > > > 0, 0)) {
> > > > > - wake_up(wq);
> > > > > + wake_up_all(wq);
> > > > > finish_wait(wq,&wait);
> > > > > sc->nr_reclaimed += sc->nr_to_reclaim;
> > > > > return -ERESTARTSYS;
> > > >
> > > > I believe we want to wake the processes up one at a time
> > > > here. If the queue of waiting processes is very large
> > > > and the amount of excess free memory is fairly low, the
> > > > first processes that wake up can take the amount of free
> > > > memory back down below the threshold. The rest of the
> > > > waiters should stay asleep when this happens.
> > >
> > > OK.
> > >
> > > Actually, wake_up() and wake_up_all() aren't different so much.
> > > Although we use wake_up(), the task wake up next task before
> > > try to alloate memory. then, it's similar to wake_up_all().
> >
> > What happens to waiters should running tasks not allocate for a while?
> >
> > > However, there are few difference. recent scheduler latency improvement
> > > effort reduce default scheduler latency target. it mean, if we have
> > > lots tasks of running state, the task have very few time slice. too
> > > frequently context switch decrease VM efficiency.
> > > Thank you, Rik. I didn't notice wake_up() makes better performance than
> > > wake_up_all() on current kernel.
> >
> > Perhaps this is a spot where an explicit wake_up_all_nopreempt() would
> > be handy....
>
> Maybe something like below. I can also imagine that under _heavy_ vm
> pressure, it'd likely be good for throughput to not provide for sleeper
> fairness for these wakeups as well, as that increases vruntime spread,
> and thus increases preemption with no benefit in sight.
Copy/pasting some methods, and hardcoding futexes, where I know vmark
loads to the point of ridiculous on my little box, it's good for ~17%
throughput boost. Used prudently, something along these lines could
save some thrashing when core code knows it's handling a surge. It
would have a very negative effect at low to modest load though.
Hohum.
---
include/linux/completion.h | 2
include/linux/sched.h | 10 ++-
include/linux/wait.h | 3
kernel/sched.c | 140 ++++++++++++++++++++++++++++++++++++---------
kernel/sched_fair.c | 32 +++++-----
kernel/sched_idletask.c | 2
kernel/sched_rt.c | 6 -
7 files changed, 146 insertions(+), 49 deletions(-)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1065,12 +1065,16 @@ struct sched_domain;
*/
#define WF_SYNC 0x01 /* waker goes to sleep after wakup */
#define WF_FORK 0x02 /* child wakeup after fork */
+#define WF_BATCH 0x04 /* batch wakeup, not preemptive */
+#define WF_REQUEUE 0x00 /* task requeue */
+#define WF_WAKE 0x10 /* task waking */
+#define WF_SLEEP 0x20 /* task going to sleep */
struct sched_class {
const struct sched_class *next;
- void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
- void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
+ void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
+ void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*yield_task) (struct rq *rq);
void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
@@ -2028,6 +2032,8 @@ extern void do_timer(unsigned long ticks
extern int wake_up_state(struct task_struct *tsk, unsigned int state);
extern int wake_up_process(struct task_struct *tsk);
+extern int wake_up_state_batch(struct task_struct *tsk, unsigned int state);
+extern int wake_up_process_batch(struct task_struct *tsk);
extern void wake_up_new_task(struct task_struct *tsk,
unsigned long clone_flags);
#ifdef CONFIG_SMP
Index: linux-2.6/include/linux/wait.h
===================================================================
--- linux-2.6.orig/include/linux/wait.h
+++ linux-2.6/include/linux/wait.h
@@ -140,6 +140,7 @@ static inline void __remove_wait_queue(w
}
void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
+void __wake_up_batch(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr,
void *key);
@@ -154,8 +155,10 @@ int out_of_line_wait_on_bit_lock(void *,
wait_queue_head_t *bit_waitqueue(void *, int);
#define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL)
+#define wake_up_batch(x) __wake_up_batch(x, TASK_NORMAL, 1, NULL)
#define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL)
#define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL)
+#define wake_up_all_batch(x) __wake_up_batch(x, TASK_NORMAL, 0, NULL)
#define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL)
#define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -1392,7 +1392,7 @@ static const u32 prio_to_wmult[40] = {
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
-static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
+static void activate_task(struct rq *rq, struct task_struct *p, int flags);
/*
* runqueue iterator, to support SMP load-balancing between different
@@ -1962,24 +1962,24 @@ static int effective_prio(struct task_st
/*
* activate_task - move a task to the runqueue.
*/
-static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
+static void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_contributes_to_load(p))
rq->nr_uninterruptible--;
- enqueue_task(rq, p, wakeup);
+ enqueue_task(rq, p, flags);
inc_nr_running(rq);
}
/*
* deactivate_task - remove a task from the runqueue.
*/
-static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
+static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_contributes_to_load(p))
rq->nr_uninterruptible++;
- dequeue_task(rq, p, sleep);
+ dequeue_task(rq, p, flags);
dec_nr_running(rq);
}
@@ -2415,7 +2415,7 @@ out_activate:
schedstat_inc(p, se.nr_wakeups_local);
else
schedstat_inc(p, se.nr_wakeups_remote);
- activate_task(rq, p, 1);
+ activate_task(rq, p, wake_flags);
success = 1;
/*
@@ -2474,13 +2474,35 @@ out:
*/
int wake_up_process(struct task_struct *p)
{
- return try_to_wake_up(p, TASK_ALL, 0);
+ return try_to_wake_up(p, TASK_ALL, WF_WAKE);
}
EXPORT_SYMBOL(wake_up_process);
+/**
+ * wake_up_process_batch - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes. Returns 1 if the process was woken up, 0 if it was already
+ * running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+int wake_up_process_batch(struct task_struct *p)
+{
+ return try_to_wake_up(p, TASK_ALL, WF_WAKE|WF_BATCH);
+}
+EXPORT_SYMBOL(wake_up_process_batch);
+
int wake_up_state(struct task_struct *p, unsigned int state)
{
- return try_to_wake_up(p, state, 0);
+ return try_to_wake_up(p, state, WF_WAKE);
+}
+
+int wake_up_state_batch(struct task_struct *p, unsigned int state)
+{
+ return try_to_wake_up(p, state, WF_WAKE|WF_BATCH);
}
/*
@@ -2628,7 +2650,7 @@ void wake_up_new_task(struct task_struct
rq = task_rq_lock(p, &flags);
BUG_ON(p->state != TASK_RUNNING);
update_rq_clock(rq);
- activate_task(rq, p, 0);
+ activate_task(rq, p, WF_WAKE|WF_FORK);
trace_sched_wakeup_new(rq, p, 1);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
@@ -3156,9 +3178,9 @@ void sched_exec(void)
static void pull_task(struct rq *src_rq, struct task_struct *p,
struct rq *this_rq, int this_cpu)
{
- deactivate_task(src_rq, p, 0);
+ deactivate_task(src_rq, p, WF_REQUEUE);
set_task_cpu(p, this_cpu);
- activate_task(this_rq, p, 0);
+ activate_task(this_rq, p, WF_REQUEUE);
check_preempt_curr(this_rq, p, 0);
}
@@ -5468,7 +5490,7 @@ need_resched_nonpreemptible:
if (unlikely(signal_pending_state(prev->state, prev)))
prev->state = TASK_RUNNING;
else
- deactivate_task(rq, prev, 1);
+ deactivate_task(rq, prev, WF_SLEEP);
switch_count = &prev->nvcsw;
}
@@ -5634,7 +5656,7 @@ asmlinkage void __sched preempt_schedule
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
void *key)
{
- return try_to_wake_up(curr->private, mode, wake_flags);
+ return try_to_wake_up(curr->private, mode, wake_flags|WF_WAKE);
}
EXPORT_SYMBOL(default_wake_function);
@@ -5677,22 +5699,43 @@ void __wake_up(wait_queue_head_t *q, uns
unsigned long flags;
spin_lock_irqsave(&q->lock, flags);
- __wake_up_common(q, mode, nr_exclusive, 0, key);
+ __wake_up_common(q, mode, nr_exclusive, WF_WAKE, key);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(__wake_up);
+/**
+ * __wake_up_batch - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void __wake_up_batch(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, void *key)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->lock, flags);
+ __wake_up_common(q, mode, nr_exclusive, WF_WAKE|WF_BATCH, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(__wake_up_batch);
+
/*
* Same as __wake_up but called with the spinlock in wait_queue_head_t held.
*/
void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
{
- __wake_up_common(q, mode, 1, 0, NULL);
+ __wake_up_common(q, mode, 1, WF_WAKE, NULL);
}
void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
{
- __wake_up_common(q, mode, 1, 0, key);
+ __wake_up_common(q, mode, 1, WF_WAKE, key);
}
/**
@@ -5716,7 +5759,7 @@ void __wake_up_sync_key(wait_queue_head_
int nr_exclusive, void *key)
{
unsigned long flags;
- int wake_flags = WF_SYNC;
+ int wake_flags = WF_WAKE|WF_SYNC;
if (unlikely(!q))
return;
@@ -5757,12 +5800,35 @@ void complete(struct completion *x)
spin_lock_irqsave(&x->wait.lock, flags);
x->done++;
- __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
+ __wake_up_common(&x->wait, TASK_NORMAL, 1, WF_WAKE, NULL);
spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete);
/**
+ * complete_batch: - signals a single thread waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void complete_batch(struct completion *x)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ x->done++;
+ __wake_up_common(&x->wait, TASK_NORMAL, 1, WF_WAKE|WF_BATCH, NULL);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete_batch);
+
+/**
* complete_all: - signals all threads waiting on this completion
* @x: holds the state of this particular completion
*
@@ -5777,11 +5843,31 @@ void complete_all(struct completion *x)
spin_lock_irqsave(&x->wait.lock, flags);
x->done += UINT_MAX/2;
- __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
+ __wake_up_common(&x->wait, TASK_NORMAL, 0, WF_WAKE, NULL);
spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete_all);
+/**
+ * complete_all_batch: - signals all threads waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void complete_all_batch(struct completion *x)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&x->wait.lock, flags);
+ x->done += UINT_MAX/2;
+ __wake_up_common(&x->wait, TASK_NORMAL, 0, WF_WAKE|WF_BATCH, NULL);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete_all_batch);
+
static inline long __sched
do_wait_for_common(struct completion *x, long timeout, int state)
{
@@ -6344,7 +6430,7 @@ recheck:
on_rq = p->se.on_rq;
running = task_current(rq, p);
if (on_rq)
- deactivate_task(rq, p, 0);
+ deactivate_task(rq, p, WF_REQUEUE);
if (running)
p->sched_class->put_prev_task(rq, p);
@@ -6356,7 +6442,7 @@ recheck:
if (running)
p->sched_class->set_curr_task(rq);
if (on_rq) {
- activate_task(rq, p, 0);
+ activate_task(rq, p, WF_REQUEUE);
check_class_changed(rq, p, prev_class, oldprio, running);
}
@@ -7172,11 +7258,11 @@ static int __migrate_task(struct task_st
on_rq = p->se.on_rq;
if (on_rq)
- deactivate_task(rq_src, p, 0);
+ deactivate_task(rq_src, p, WF_REQUEUE);
set_task_cpu(p, dest_cpu);
if (on_rq) {
- activate_task(rq_dest, p, 0);
+ activate_task(rq_dest, p, WF_REQUEUE);
check_preempt_curr(rq_dest, p, 0);
}
done:
@@ -7368,7 +7454,7 @@ void sched_idle_next(void)
__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
update_rq_clock(rq);
- activate_task(rq, p, 0);
+ activate_task(rq, p, WF_REQUEUE);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -7707,7 +7793,7 @@ migration_call(struct notifier_block *nf
/* Idle task back to normal (off runqueue, low prio) */
raw_spin_lock_irq(&rq->lock);
update_rq_clock(rq);
- deactivate_task(rq, rq->idle, 0);
+ deactivate_task(rq, rq->idle, WF_REQUEUE);
__setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
rq->idle->sched_class = &idle_sched_class;
migrate_dead_tasks(cpu);
@@ -9698,10 +9784,10 @@ static void normalize_task(struct rq *rq
update_rq_clock(rq);
on_rq = p->se.on_rq;
if (on_rq)
- deactivate_task(rq, p, 0);
+ deactivate_task(rq, p, WF_REQUEUE);
__setscheduler(rq, p, SCHED_NORMAL, 0);
if (on_rq) {
- activate_task(rq, p, 0);
+ activate_task(rq, p, WF_REQUEUE);
resched_task(rq->curr);
}
}
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -722,7 +722,7 @@ static void check_spread(struct cfs_rq *
}
static void
-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
u64 vruntime = cfs_rq->min_vruntime;
@@ -732,11 +732,11 @@ place_entity(struct cfs_rq *cfs_rq, stru
* little, place the new task so that it fits in the slot that
* stays open at the end.
*/
- if (initial && sched_feat(START_DEBIT))
+ if (flags & WF_FORK && sched_feat(START_DEBIT))
vruntime += sched_vslice(cfs_rq, se);
/* sleeps up to a single latency don't count. */
- if (!initial && sched_feat(FAIR_SLEEPERS)) {
+ if (!(flags & (WF_FORK|WF_BATCH)) && sched_feat(FAIR_SLEEPERS)) {
unsigned long thresh = sysctl_sched_latency;
/*
@@ -766,7 +766,7 @@ place_entity(struct cfs_rq *cfs_rq, stru
}
static void
-enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
+enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Update run-time statistics of the 'current'.
@@ -774,8 +774,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
update_curr(cfs_rq);
account_entity_enqueue(cfs_rq, se);
- if (wakeup) {
- place_entity(cfs_rq, se, 0);
+ if (flags & WF_WAKE) {
+ place_entity(cfs_rq, se, flags);
enqueue_sleeper(cfs_rq, se);
}
@@ -801,7 +801,7 @@ static void clear_buddies(struct cfs_rq
}
static void
-dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
+dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Update run-time statistics of the 'current'.
@@ -809,7 +809,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
update_curr(cfs_rq);
update_stats_dequeue(cfs_rq, se);
- if (sleep) {
+ if (flags & WF_SLEEP) {
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
@@ -1034,7 +1034,7 @@ static inline void hrtick_update(struct
* increased. Here we update the fair scheduling stats and
* then put the task into the rbtree:
*/
-static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
+static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
@@ -1043,8 +1043,8 @@ static void enqueue_task_fair(struct rq
if (se->on_rq)
break;
cfs_rq = cfs_rq_of(se);
- enqueue_entity(cfs_rq, se, wakeup);
- wakeup = 1;
+ enqueue_entity(cfs_rq, se, flags);
+ flags |= WF_WAKE;
}
hrtick_update(rq);
@@ -1055,18 +1055,18 @@ static void enqueue_task_fair(struct rq
* decreased. We remove the task from the rbtree and
* update the fair scheduling stats:
*/
-static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
+static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- dequeue_entity(cfs_rq, se, sleep);
+ dequeue_entity(cfs_rq, se, flags);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight)
break;
- sleep = 1;
+ flags |= WF_SLEEP;
}
hrtick_update(rq);
@@ -1709,7 +1709,7 @@ static void check_preempt_wakeup(struct
pse->avg_overlap < sysctl_sched_migration_cost)
goto preempt;
- if (!sched_feat(WAKEUP_PREEMPT))
+ if (!sched_feat(WAKEUP_PREEMPT) || (wake_flags & WF_BATCH))
return;
update_curr(cfs_rq);
@@ -1964,7 +1964,7 @@ static void task_fork_fair(struct task_s
if (curr)
se->vruntime = curr->vruntime;
- place_entity(cfs_rq, se, 1);
+ place_entity(cfs_rq, se, WF_FORK);
if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
/*
Index: linux-2.6/include/linux/completion.h
===================================================================
--- linux-2.6.orig/include/linux/completion.h
+++ linux-2.6/include/linux/completion.h
@@ -88,6 +88,8 @@ extern bool completion_done(struct compl
extern void complete(struct completion *);
extern void complete_all(struct completion *);
+extern void complete_batch(struct completion *);
+extern void complete_all_batch(struct completion *);
/**
* INIT_COMPLETION: - reinitialize a completion structure
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -878,11 +878,11 @@ static void dequeue_rt_entity(struct sch
/*
* Adding/removing a task to/from a priority array:
*/
-static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
+static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
struct sched_rt_entity *rt_se = &p->rt;
- if (wakeup)
+ if (flags & WF_WAKE)
rt_se->timeout = 0;
enqueue_rt_entity(rt_se);
@@ -891,7 +891,7 @@ static void enqueue_task_rt(struct rq *r
enqueue_pushable_task(rq, p);
}
-static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
+static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
struct sched_rt_entity *rt_se = &p->rt;
Index: linux-2.6/kernel/sched_idletask.c
===================================================================
--- linux-2.6.orig/kernel/sched_idletask.c
+++ linux-2.6/kernel/sched_idletask.c
@@ -32,7 +32,7 @@ static struct task_struct *pick_next_tas
* message if some code attempts to do it:
*/
static void
-dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep)
+dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
{
raw_spin_unlock_irq(&rq->lock);
pr_err("bad: scheduling from the idle thread!\n");
On 12/15/2009 12:32 AM, Mike Galbraith wrote:
> On Tue, 2009-12-15 at 09:45 +0900, KOSAKI Motohiro wrote:
>>> On 12/14/2009 07:30 AM, KOSAKI Motohiro wrote:
>>>> if we don't use exclusive queue, wake_up() function wake _all_ waited
>>>> task. This is simply cpu wasting.
>>>>
>>>> Signed-off-by: KOSAKI Motohiro<[email protected]>
>>>
>>>> if (zone_watermark_ok(zone, sc->order, low_wmark_pages(zone),
>>>> 0, 0)) {
>>>> - wake_up(wq);
>>>> + wake_up_all(wq);
>>>> finish_wait(wq,&wait);
>>>> sc->nr_reclaimed += sc->nr_to_reclaim;
>>>> return -ERESTARTSYS;
>>>
>>> I believe we want to wake the processes up one at a time
>>> here.
>> Actually, wake_up() and wake_up_all() aren't different so much.
>> Although we use wake_up(), the task wake up next task before
>> try to alloate memory. then, it's similar to wake_up_all().
That is a good point. Maybe processes need to wait a little
in this if() condition, before the wake_up(). That would give
the previous process a chance to allocate memory and we can
avoid waking up too many processes.
> What happens to waiters should running tasks not allocate for a while?
When a waiter is woken up, it will either:
1) see that there is enough free memory and wake up the next guy, or
2) run shrink_zone and wake up the next guy
Either way, the processes that just got woken up will ensure that
the sleepers behind them in the queue will get woken up.
--
All rights reversed.
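A minimal sketch of the hand-off Rik describes, paraphrasing shrink_zone_begin();
the exact control flow in the patch may differ, and the placement of the
atomic_inc() is an assumption:

	/* Woken up (or never slept): decide whether reclaim is still needed. */
	if (zone_watermark_ok(zone, sc->order, low_wmark_pages(zone), 0, 0)) {
		/* Case 1: enough memory is already free, pass the baton on. */
		wake_up(wq);
		finish_wait(wq, &wait);
		sc->nr_reclaimed += sc->nr_to_reclaim;
		return -ERESTARTSYS;
	}

	/*
	 * Case 2: become one of the active reclaimers and run shrink_zone();
	 * the next sleeper is then woken when this reclaimer finishes
	 * (presumably from shrink_zone_end()).
	 */
	finish_wait(wq, &wait);
	atomic_inc(&zone->concurrent_reclaimers);
	return 0;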
On Tue, 2009-12-15 at 09:58 -0500, Rik van Riel wrote:
> On 12/15/2009 12:32 AM, Mike Galbraith wrote:
> > On Tue, 2009-12-15 at 09:45 +0900, KOSAKI Motohiro wrote:
> >>> On 12/14/2009 07:30 AM, KOSAKI Motohiro wrote:
> >>>> if we don't use exclusive queue, wake_up() function wake _all_ waited
> >>>> task. This is simply cpu wasting.
> >>>>
> >>>> Signed-off-by: KOSAKI Motohiro<[email protected]>
> >>>
> >>>> if (zone_watermark_ok(zone, sc->order, low_wmark_pages(zone),
> >>>> 0, 0)) {
> >>>> - wake_up(wq);
> >>>> + wake_up_all(wq);
> >>>> finish_wait(wq,&wait);
> >>>> sc->nr_reclaimed += sc->nr_to_reclaim;
> >>>> return -ERESTARTSYS;
> >>>
> >>> I believe we want to wake the processes up one at a time
> >>> here.
>
> >> Actually, wake_up() and wake_up_all() aren't different so much.
> >> Although we use wake_up(), the task wake up next task before
> >> try to alloate memory. then, it's similar to wake_up_all().
>
> That is a good point. Maybe processes need to wait a little
> in this if() condition, before the wake_up(). That would give
> the previous process a chance to allocate memory and we can
> avoid waking up too many processes.
>
> > What happens to waiters should running tasks not allocate for a while?
>
> When a waiter is woken up, it will either:
> 1) see that there is enough free memory and wake up the next guy, or
> 2) run shrink_zone and wake up the next guy
>
> Either way, the processes that just got woken up will ensure that
> the sleepers behind them in the queue will get woken up.
OK, that more or less covers my worry. From the scheduler standpoint
though, you're better off turning them all loose and letting them race,
_with_ the caveat that thundering herds do indeed make thunder (reason
for patch). Turning them loose piecemeal spreads things out over time,
which prolongs surge operations, possibly much longer than necessary.
We had the same problem long ago with everyone waiting for kswapd to do all the
work. Sticky problem, this roll-down to inevitable wait.
-Mike
On Tue, 2009-12-15 at 09:58 -0500, Rik van Riel wrote:
> On 12/15/2009 12:32 AM, Mike Galbraith wrote:
> > On Tue, 2009-12-15 at 09:45 +0900, KOSAKI Motohiro wrote:
> >>> On 12/14/2009 07:30 AM, KOSAKI Motohiro wrote:
> >>>> if we don't use exclusive queue, wake_up() function wake _all_ waited
> >>>> task. This is simply cpu wasting.
> >>>>
> >>>> Signed-off-by: KOSAKI Motohiro<[email protected]>
> >>>
> >>>> if (zone_watermark_ok(zone, sc->order, low_wmark_pages(zone),
> >>>> 0, 0)) {
> >>>> - wake_up(wq);
> >>>> + wake_up_all(wq);
> >>>> finish_wait(wq,&wait);
> >>>> sc->nr_reclaimed += sc->nr_to_reclaim;
> >>>> return -ERESTARTSYS;
> >>>
> >>> I believe we want to wake the processes up one at a time
> >>> here.
>
> >> Actually, wake_up() and wake_up_all() aren't different so much.
> >> Although we use wake_up(), the task wake up next task before
> >> try to alloate memory. then, it's similar to wake_up_all().
>
> That is a good point. Maybe processes need to wait a little
> in this if() condition, before the wake_up(). That would give
> the previous process a chance to allocate memory and we can
> avoid waking up too many processes.
Pondering, I think I'd at least wake NR_CPUS. If there's not enough to
go round, oh darn, but if there is, you have full utilization quicker.
$.02.
-Mike
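If someone wanted to try Mike's suggestion, the wake_up_nr() helper visible in
the wait.h context above would be the obvious tool; a sketch, not a tested
change:

	/*
	 * Instead of waking a single exclusive waiter, release roughly one
	 * waiter per online CPU and let them race for the free memory.
	 */
	wake_up_nr(wq, num_online_cpus());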
On 12/15/2009 01:43 PM, Mike Galbraith wrote:
> On Tue, 2009-12-15 at 09:58 -0500, Rik van Riel wrote:
>> On 12/15/2009 12:32 AM, Mike Galbraith wrote:
>>> On Tue, 2009-12-15 at 09:45 +0900, KOSAKI Motohiro wrote:
>>>>> On 12/14/2009 07:30 AM, KOSAKI Motohiro wrote:
>>>>>> if we don't use exclusive queue, wake_up() function wake _all_ waited
>>>>>> task. This is simply cpu wasting.
>>>>>>
>>>>>> Signed-off-by: KOSAKI Motohiro<[email protected]>
>>>>>
>>>>>> if (zone_watermark_ok(zone, sc->order, low_wmark_pages(zone),
>>>>>> 0, 0)) {
>>>>>> - wake_up(wq);
>>>>>> + wake_up_all(wq);
>>>>>> finish_wait(wq,&wait);
>>>>>> sc->nr_reclaimed += sc->nr_to_reclaim;
>>>>>> return -ERESTARTSYS;
>>>>>
>>>>> I believe we want to wake the processes up one at a time
>>>>> here.
>>
>>>> Actually, wake_up() and wake_up_all() aren't different so much.
>>>> Although we use wake_up(), the task wake up next task before
>>>> try to alloate memory. then, it's similar to wake_up_all().
>>
>> That is a good point. Maybe processes need to wait a little
>> in this if() condition, before the wake_up(). That would give
>> the previous process a chance to allocate memory and we can
>> avoid waking up too many processes.
>
> Pondering, I think I'd at least wake NR_CPUS. If there's not enough to
> go round, oh darn, but if there is, you have full utilization quicker.
That depends on what the other CPUs in the system are doing.
If they were doing work, you've just wasted some resources.
--
All rights reversed.
> On 12/15/2009 12:32 AM, Mike Galbraith wrote:
> > On Tue, 2009-12-15 at 09:45 +0900, KOSAKI Motohiro wrote:
> >>> On 12/14/2009 07:30 AM, KOSAKI Motohiro wrote:
> >>>> if we don't use exclusive queue, wake_up() function wake _all_ waited
> >>>> task. This is simply cpu wasting.
> >>>>
> >>>> Signed-off-by: KOSAKI Motohiro<[email protected]>
> >>>
> >>>> if (zone_watermark_ok(zone, sc->order, low_wmark_pages(zone),
> >>>> 0, 0)) {
> >>>> - wake_up(wq);
> >>>> + wake_up_all(wq);
> >>>> finish_wait(wq,&wait);
> >>>> sc->nr_reclaimed += sc->nr_to_reclaim;
> >>>> return -ERESTARTSYS;
> >>>
> >>> I believe we want to wake the processes up one at a time
> >>> here.
>
> >> Actually, wake_up() and wake_up_all() aren't different so much.
> >> Although we use wake_up(), the task wake up next task before
> >> try to alloate memory. then, it's similar to wake_up_all().
>
> That is a good point. Maybe processes need to wait a little
> in this if() condition, before the wake_up(). That would give
> the previous process a chance to allocate memory and we can
> avoid waking up too many processes.
If we really need to wait a bit, Mike's wake_up_batch is best, I think.
It means:
 - if another CPU is idle, one process wakes up soon; IOW, we don't
   leave a CPU needlessly idle.
 - if another CPU is busy, the woken process doesn't start running for a
   while, so zone_watermark_ok() can calculate a correct value.
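Concretely, with Mike's proposed helpers this would be a one-line substitution
at the wakeup site in shrink_zone_begin(); a sketch against the API from Mike's
patch, not a posted change:

found_lots_memory:
	/*
	 * Same hand-off as before, but with the batch variant the woken
	 * reclaimer does not preempt whoever is currently running: an idle
	 * CPU picks it up immediately, a busy CPU keeps its current task
	 * until it yields.
	 */
	wake_up_batch(wq);
stop_reclaim:
	finish_wait(wq, &wait);
	sc->nr_reclaimed += sc->nr_to_reclaim;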
> > What happens to waiters should running tasks not allocate for a while?
>
> When a waiter is woken up, it will either:
> 1) see that there is enough free memory and wake up the next guy, or
> 2) run shrink_zone and wake up the next guy
>
> Either way, the processes that just got woken up will ensure that
> the sleepers behind them in the queue will get woken up.
>
> --
> All rights reversed.
On 12/15/2009 07:48 PM, KOSAKI Motohiro wrote:
> if we really need wait a bit, Mike's wake_up_batch is best, I think.
> It mean
> - if another CPU is idle, wake up one process soon. iow, it don't
> make meaningless idle.
> - if another CPU is busy, woken process don't start to run awhile.
> then, zone_watermark_ok() can calculate correct value.
Agreed, that should work great.
--
All rights reversed.
On Wed, 2009-12-16 at 09:48 +0900, KOSAKI Motohiro wrote:
> > On 12/15/2009 12:32 AM, Mike Galbraith wrote:
> > > On Tue, 2009-12-15 at 09:45 +0900, KOSAKI Motohiro wrote:
> > >>> On 12/14/2009 07:30 AM, KOSAKI Motohiro wrote:
> > >>>> if we don't use exclusive queue, wake_up() function wake _all_ waited
> > >>>> task. This is simply cpu wasting.
> > >>>>
> > >>>> Signed-off-by: KOSAKI Motohiro<[email protected]>
> > >>>
> > >>>> if (zone_watermark_ok(zone, sc->order, low_wmark_pages(zone),
> > >>>> 0, 0)) {
> > >>>> - wake_up(wq);
> > >>>> + wake_up_all(wq);
> > >>>> finish_wait(wq,&wait);
> > >>>> sc->nr_reclaimed += sc->nr_to_reclaim;
> > >>>> return -ERESTARTSYS;
> > >>>
> > >>> I believe we want to wake the processes up one at a time
> > >>> here.
> >
> > >> Actually, wake_up() and wake_up_all() aren't different so much.
> > >> Although we use wake_up(), the task wake up next task before
> > >> try to alloate memory. then, it's similar to wake_up_all().
> >
> > That is a good point. Maybe processes need to wait a little
> > in this if() condition, before the wake_up(). That would give
> > the previous process a chance to allocate memory and we can
> > avoid waking up too many processes.
>
> if we really need wait a bit, Mike's wake_up_batch is best, I think.
> It mean
> - if another CPU is idle, wake up one process soon. iow, it don't
> make meaningless idle.
Along those lines, there are also NEWIDLE balancing considerations. That
idle may result in a task being pulled, which may or may not hurt a bit.
'course, if you're jamming up on memory allocation, that's the least of
your worries, but every idle avoided is potentially a pull avoided.
Just a thought.
-Mike
On 12/11/2009 11:46 PM, Rik van Riel wrote:
> Under very heavy multi-process workloads, like AIM7, the VM can
> get into trouble in a variety of ways. The trouble start when
> there are hundreds, or even thousands of processes active in the
> page reclaim code.
>
> Not only can the system suffer enormous slowdowns because of
> lock contention (and conditional reschedules) between thousands
> of processes in the page reclaim code, but each process will try
> to free up to SWAP_CLUSTER_MAX pages, even when the system already
> has lots of memory free.
>
> It should be possible to avoid both of those issues at once, by
> simply limiting how many processes are active in the page reclaim
> code simultaneously.
>
> If too many processes are active doing page reclaim in one zone,
> simply go to sleep in shrink_zone().
>
> On wakeup, check whether enough memory has been freed already
> before jumping into the page reclaim code ourselves. We want
> to use the same threshold here that is used in the page allocator
> for deciding whether or not to call the page reclaim code in the
> first place, otherwise some unlucky processes could end up freeing
> memory for the rest of the system.
>
>
>
> Control how to kill processes when uncorrected memory error (typically
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 30fe668..ed614b8 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -345,6 +345,10 @@ struct zone {
> /* Zone statistics */
> atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
>
> + /* Number of processes running page reclaim code on this zone. */
> + atomic_t concurrent_reclaimers;
> + wait_queue_head_t reclaim_wait;
> +
>
Counting semaphore?
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
On 12/18/2009 08:38 AM, Avi Kivity wrote:
>> + /* Number of processes running page reclaim code on this zone. */
>> + atomic_t concurrent_reclaimers;
>> + wait_queue_head_t reclaim_wait;
>> +
>
> Counting semaphore?
I don't see a safe way to adjust a counting semaphore
through /proc/sys.
--
All rights reversed.
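For comparison, the reason the atomic_t + waitqueue pair is easy to tune at
run time: the limit is a plain integer, so a write through the usual integer
sysctl handler takes effect on the next reclaim attempt, whereas a counting
semaphore initialised to the old limit would need its count adjusted
explicitly. reclaim_slot_available() below is a hypothetical helper for
illustration only:

int max_zone_concurrent_reclaimers = 8;		/* updated via /proc/sys */

static bool reclaim_slot_available(struct zone *zone)
{
	/* Re-read the sysctl value on every check, so changes apply at once. */
	return atomic_read(&zone->concurrent_reclaimers) <
				max_zone_concurrent_reclaimers;
}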
On 12/18/2009 04:12 PM, Rik van Riel wrote:
> On 12/18/2009 08:38 AM, Avi Kivity wrote:
>
>>> + /* Number of processes running page reclaim code on this zone. */
>>> + atomic_t concurrent_reclaimers;
>>> + wait_queue_head_t reclaim_wait;
>>> +
>>
>> Counting semaphore?
>
> I don't see a safe way to adjust a counting semaphore
> through /proc/sys.
True. Maybe it's worthwhile to add such a way if this pattern has more
uses.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.