LinuxLists.cc - [PATCH 0/6] Reduce writeback from page reclaim context V6

[permalink] [raw]

Subject: [PATCH 2/6] vmscan: tracing: Update trace event to track if page reclaim IO is for anon or file pages

It is useful to distinguish between IO for anon and file pages. This
patch updates
vmscan-tracing-add-trace-event-when-a-page-is-written.patch to include
that information. The patches can be merged together.

Signed-off-by: Mel Gorman <[email protected]>
---
include/trace/events/vmscan.h | 30 ++++++++++++++++++++++++------
mm/vmscan.c | 2 +-
2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index f2da66a..69789dc 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -8,6 +8,24 @@
#include <linux/tracepoint.h>
#include "gfpflags.h"

+#define RECLAIM_WB_ANON 0x0001u
+#define RECLAIM_WB_FILE 0x0002u
+#define RECLAIM_WB_SYNC 0x0004u
+#define RECLAIM_WB_ASYNC 0x0008u
+
+#define show_reclaim_flags(flags) \
+ (flags) ? __print_flags(flags, "|", \
+ {RECLAIM_WB_ANON, "RECLAIM_WB_ANON"}, \
+ {RECLAIM_WB_FILE, "RECLAIM_WB_FILE"}, \
+ {RECLAIM_WB_SYNC, "RECLAIM_WB_SYNC"}, \
+ {RECLAIM_WB_ASYNC, "RECLAIM_WB_ASYNC"} \
+ ) : "RECLAIM_WB_NONE"
+
+#define trace_reclaim_flags(page, sync) ( \
+ (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
+ (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \
+ )
+
TRACE_EVENT(mm_vmscan_kswapd_sleep,

TP_PROTO(int nid),
@@ -158,24 +176,24 @@ TRACE_EVENT(mm_vmscan_lru_isolate,
TRACE_EVENT(mm_vmscan_writepage,

TP_PROTO(struct page *page,
- int sync_io),
+ int reclaim_flags),

- TP_ARGS(page, sync_io),
+ TP_ARGS(page, reclaim_flags),

TP_STRUCT__entry(
__field(struct page *, page)
- __field(int, sync_io)
+ __field(int, reclaim_flags)
),

TP_fast_assign(
__entry->page = page;
- __entry->sync_io = sync_io;
+ __entry->reclaim_flags = reclaim_flags;
),

- TP_printk("page=%p pfn=%lu sync_io=%d",
+ TP_printk("page=%p pfn=%lu flags=%s",
__entry->page,
page_to_pfn(__entry->page),
- __entry->sync_io)
+ show_reclaim_flags(__entry->reclaim_flags))
);

#endif /* _TRACE_VMSCAN_H */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 63447ff..d83812a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -402,7 +402,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
ClearPageReclaim(page);
}
trace_mm_vmscan_writepage(page,
- sync_writeback == PAGEOUT_IO_SYNC);
+ trace_reclaim_flags(page, sync_writeback));
inc_zone_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
--
1.7.1

2010-07-30 13:37:34

[permalink] [raw]

Subject: [PATCH 6/6] vmscan: Kick flusher threads to clean pages when reclaim is encountering dirty pages

There are a number of cases where pages get cleaned but two of concern
to this patch are;
o When dirtying pages, processes may be throttled to clean pages if
dirty_ratio is not met.
o Pages belonging to inodes dirtied longer than
dirty_writeback_centisecs get cleaned.

The problem for reclaim is that dirty pages can reach the end of the LRU if
pages are being dirtied slowly so that neither the throttling or a flusher
thread waking periodically cleans them.

Background flush is already cleaning old or expired inodes first but the
expire time is too far in the future at the time of page reclaim. To mitigate
future problems, this patch wakes flusher threads to clean 4M of data -
an amount that should be manageable without causing congestion in many cases.

Ideally, the background flushers would only be cleaning pages belonging
to the zone being scanned but it's not clear if this would be of benefit
(less IO) or not (potentially less efficient IO if an inode is scattered
across multiple zones).

Signed-off-by: Mel Gorman <[email protected]>
---
mm/vmscan.c | 33 +++++++++++++++++++++++++++++++--
1 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2d2b588..c4c81bc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -142,6 +142,18 @@ static DECLARE_RWSEM(shrinker_rwsem);
/* Direct lumpy reclaim waits up to five seconds for background cleaning */
#define MAX_SWAP_CLEAN_WAIT 50

+/*
+ * When reclaim encounters dirty data, wakeup flusher threads to clean
+ * a maximum of 4M of data.
+ */
+#define MAX_WRITEBACK (4194304UL >> PAGE_SHIFT)
+#define WRITEBACK_FACTOR (MAX_WRITEBACK / SWAP_CLUSTER_MAX)
+static inline long nr_writeback_pages(unsigned long nr_dirty)
+{
+ return laptop_mode ? 0 :
+ min(MAX_WRITEBACK, (nr_dirty * WRITEBACK_FACTOR));
+}
+
static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
struct scan_control *sc)
{
@@ -649,12 +661,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
static unsigned long shrink_page_list(struct list_head *page_list,
struct scan_control *sc,
enum pageout_io sync_writeback,
+ int file,
unsigned long *nr_still_dirty)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
int pgactivate = 0;
unsigned long nr_dirty = 0;
+ unsigned long nr_dirty_seen = 0;
unsigned long nr_reclaimed = 0;

cond_resched();
@@ -748,6 +762,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}

if (PageDirty(page)) {
+ nr_dirty_seen++;
+
/*
* Only kswapd can writeback filesystem pages to
* avoid risk of stack overflow
@@ -875,6 +891,18 @@ keep:

list_splice(&ret_pages, page_list);

+ /*
+ * If reclaim is encountering dirty pages, it may be because
+ * dirty pages are reaching the end of the LRU even though the
+ * dirty_ratio may be satisified. In this case, wake flusher
+ * threads to pro-actively clean up to a maximum of
+ * 4 * SWAP_CLUSTER_MAX amount of data (usually 1/2MB) unless
+ * !may_writepage indicates that this is a direct reclaimer in
+ * laptop mode avoiding disk spin-ups
+ */
+ if (file && nr_dirty_seen && sc->may_writepage)
+ wakeup_flusher_threads(nr_writeback_pages(nr_dirty));
+
*nr_still_dirty = nr_dirty;
count_vm_events(PGACTIVATE, pgactivate);
return nr_reclaimed;
@@ -1315,7 +1343,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
spin_unlock_irq(&zone->lru_lock);

nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC,
- &nr_dirty);
+ file, &nr_dirty);

/*
* If specific pages are needed such as with direct reclaiming
@@ -1351,7 +1379,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
count_vm_events(PGDEACTIVATE, nr_active);

nr_reclaimed += shrink_page_list(&page_list, sc,
- PAGEOUT_IO_SYNC, &nr_dirty);
+ PAGEOUT_IO_SYNC, file,
+ &nr_dirty);
}
}

--
1.7.1

2010-07-30 13:37:36

[permalink] [raw]

Subject: [PATCH 5/6] vmscan: Do not writeback filesystem pages in direct reclaim

When memory is under enough pressure, a process may enter direct
reclaim to free pages in the same manner kswapd does. If a dirty page is
encountered during the scan, this page is written to backing storage using
mapping->writepage. This can result in very deep call stacks, particularly
if the target storage or filesystem are complex. It has already been observed
on XFS that the stack overflows but the problem is not XFS-specific.

This patch prevents direct reclaim writing back filesystem pages by checking
if current is kswapd or the page is anonymous before writing back. If the
dirty pages cannot be written back, they are placed back on the LRU lists
for either background writing by the BDI threads or kswapd. If in direct
lumpy reclaim and dirty pages are encountered, the process will stall for
the background flusher before trying to reclaim the pages again.

As the call-chain for writing anonymous pages is not expected to be deep
and they are not cleaned by flusher threads, anonymous pages are still
written back in direct reclaim.

Signed-off-by: Mel Gorman <[email protected]>
Acked-by: Rik van Riel <[email protected]>
Reviewed-by: Johannes Weiner <[email protected]>
---
mm/vmscan.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++------------
1 files changed, 54 insertions(+), 15 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index d83812a..2d2b588 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -139,6 +139,9 @@ static DECLARE_RWSEM(shrinker_rwsem);
#define scanning_global_lru(sc) (1)
#endif

+/* Direct lumpy reclaim waits up to five seconds for background cleaning */
+#define MAX_SWAP_CLEAN_WAIT 50
+
static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
struct scan_control *sc)
{
@@ -645,11 +648,13 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
*/
static unsigned long shrink_page_list(struct list_head *page_list,
struct scan_control *sc,
- enum pageout_io sync_writeback)
+ enum pageout_io sync_writeback,
+ unsigned long *nr_still_dirty)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
int pgactivate = 0;
+ unsigned long nr_dirty = 0;
unsigned long nr_reclaimed = 0;

cond_resched();
@@ -743,6 +748,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}

if (PageDirty(page)) {
+ /*
+ * Only kswapd can writeback filesystem pages to
+ * avoid risk of stack overflow
+ */
+ if (page_is_file_cache(page) && !current_is_kswapd()) {
+ nr_dirty++;
+ goto keep_locked;
+ }
+
if (references == PAGEREF_RECLAIM_CLEAN)
goto keep_locked;
if (!may_enter_fs)
@@ -860,6 +874,8 @@ keep:
free_page_list(&free_pages);

list_splice(&ret_pages, page_list);
+
+ *nr_still_dirty = nr_dirty;
count_vm_events(PGACTIVATE, pgactivate);
return nr_reclaimed;
}
@@ -1242,12 +1258,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
struct scan_control *sc, int priority, int file)
{
LIST_HEAD(page_list);
+ LIST_HEAD(putback_list);
unsigned long nr_scanned;
unsigned long nr_reclaimed = 0;
unsigned long nr_taken;
unsigned long nr_active;
unsigned long nr_anon;
unsigned long nr_file;
+ unsigned long nr_dirty;

while (unlikely(too_many_isolated(zone, file, sc))) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1296,28 +1314,49 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,

spin_unlock_irq(&zone->lru_lock);

- nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+ nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC,
+ &nr_dirty);

/*
- * If we are direct reclaiming for contiguous pages and we do
+ * If specific pages are needed such as with direct reclaiming
+ * for contiguous pages or for memory containers and we do
* not reclaim everything in the list, try again and wait
- * for IO to complete. This will stall high-order allocations
- * but that should be acceptable to the caller
+ * for IO to complete. This will stall callers that require
+ * specific pages but it should be acceptable to the caller
*/
- if (nr_reclaimed < nr_taken && !current_is_kswapd() &&
- sc->lumpy_reclaim_mode) {
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ if (sc->may_writepage && !current_is_kswapd() &&
+ (sc->lumpy_reclaim_mode || sc->mem_cgroup)) {
+ int dirty_retry = MAX_SWAP_CLEAN_WAIT;

- /*
- * The attempt at page out may have made some
- * of the pages active, mark them inactive again.
- */
- nr_active = clear_active_flags(&page_list, NULL);
- count_vm_events(PGDEACTIVATE, nr_active);
+ while (nr_reclaimed < nr_taken && nr_dirty && dirty_retry--) {
+ struct page *page, *tmp;
+
+ /* Take off the clean pages marked for activation */
+ list_for_each_entry_safe(page, tmp, &page_list, lru) {
+ if (PageDirty(page) || PageWriteback(page))
+ continue;
+
+ list_del(&page->lru);
+ list_add(&page->lru, &putback_list);
+ }
+
+ wakeup_flusher_threads(laptop_mode ? 0 : nr_dirty);
+ congestion_wait(BLK_RW_ASYNC, HZ/10);

- nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC);
+ /*
+ * The attempt at page out may have made some
+ * of the pages active, mark them inactive again.
+ */
+ nr_active = clear_active_flags(&page_list, NULL);
+ count_vm_events(PGDEACTIVATE, nr_active);
+
+ nr_reclaimed += shrink_page_list(&page_list, sc,
+ PAGEOUT_IO_SYNC, &nr_dirty);
+ }
}

+ list_splice(&putback_list, &page_list);
+
local_irq_disable();
if (current_is_kswapd())
__count_vm_events(KSWAPD_STEAL, nr_reclaimed);
--
1.7.1

2010-07-30 13:38:05

[permalink] [raw]

Subject: [PATCH 4/6] vmscan: tracing: Correct units in post-processing script

The post-processing script is reporting the wrong units. Correct it. This
patch updates vmscan-tracing-add-trace-event-when-a-page-is-written.patch
to include that information. The patches can be merged together.

Signed-off-by: Mel Gorman <[email protected]>
---
.../trace/postprocess/trace-vmscan-postprocess.pl | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
index f87f56e..f1b70a8 100644
--- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
+++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
@@ -584,7 +584,7 @@ sub dump_stats {
print "Direct reclaim write file async I/O: $total_direct_writepage_file_async\n";
print "Direct reclaim write anon async I/O: $total_direct_writepage_anon_async\n";
print "Wake kswapd requests: $total_wakeup_kswapd\n";
- printf "Time stalled direct reclaim: %-1.2f ms\n", $total_direct_latency;
+ printf "Time stalled direct reclaim: %-1.2f seconds\n", $total_direct_latency;
print "\n";
print "Kswapd wakeups: $total_kswapd_wake\n";
print "Kswapd pages scanned: $total_kswapd_nr_scanned\n";
@@ -592,7 +592,7 @@ sub dump_stats {
print "Kswapd reclaim write anon sync I/O: $total_kswapd_writepage_anon_sync\n";
print "Kswapd reclaim write file async I/O: $total_kswapd_writepage_file_async\n";
print "Kswapd reclaim write anon async I/O: $total_kswapd_writepage_anon_async\n";
- printf "Time kswapd awake: %-1.2f ms\n", $total_kswapd_latency;
+ printf "Time kswapd awake: %-1.2f seconds\n", $total_kswapd_latency;
}

sub aggregate_perprocesspid() {
--
1.7.1

2010-07-30 13:38:24

[permalink] [raw]

Subject: [PATCH 3/6] vmscan: tracing: Update post-processing script to distinguish between anon and file IO from page reclaim

It is useful to distinguish between IO for anon and file pages. This patch
updates
vmscan-tracing-add-a-postprocessing-script-for-reclaim-related-ftrace-events.patch
so the post-processing script can handle the additional information.

Signed-off-by: Mel Gorman <[email protected]>
---
.../trace/postprocess/trace-vmscan-postprocess.pl | 96 +++++++++++++-------
1 files changed, 64 insertions(+), 32 deletions(-)

diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
index d1ddc33..f87f56e 100644
--- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
+++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
@@ -21,9 +21,12 @@ use constant MM_VMSCAN_KSWAPD_SLEEP => 4;
use constant MM_VMSCAN_LRU_SHRINK_ACTIVE => 5;
use constant MM_VMSCAN_LRU_SHRINK_INACTIVE => 6;
use constant MM_VMSCAN_LRU_ISOLATE => 7;
-use constant MM_VMSCAN_WRITEPAGE_SYNC => 8;
-use constant MM_VMSCAN_WRITEPAGE_ASYNC => 9;
-use constant EVENT_UNKNOWN => 10;
+use constant MM_VMSCAN_WRITEPAGE_FILE_SYNC => 8;
+use constant MM_VMSCAN_WRITEPAGE_ANON_SYNC => 9;
+use constant MM_VMSCAN_WRITEPAGE_FILE_ASYNC => 10;
+use constant MM_VMSCAN_WRITEPAGE_ANON_ASYNC => 11;
+use constant MM_VMSCAN_WRITEPAGE_ASYNC => 12;
+use constant EVENT_UNKNOWN => 13;

# Per-order events
use constant MM_VMSCAN_DIRECT_RECLAIM_BEGIN_PERORDER => 11;
@@ -55,9 +58,11 @@ my $opt_read_procstat;
my $total_wakeup_kswapd;
my ($total_direct_reclaim, $total_direct_nr_scanned);
my ($total_direct_latency, $total_kswapd_latency);
-my ($total_direct_writepage_sync, $total_direct_writepage_async);
+my ($total_direct_writepage_file_sync, $total_direct_writepage_file_async);
+my ($total_direct_writepage_anon_sync, $total_direct_writepage_anon_async);
my ($total_kswapd_nr_scanned, $total_kswapd_wake);
-my ($total_kswapd_writepage_sync, $total_kswapd_writepage_async);
+my ($total_kswapd_writepage_file_sync, $total_kswapd_writepage_file_async);
+my ($total_kswapd_writepage_anon_sync, $total_kswapd_writepage_anon_async);

# Catch sigint and exit on request
my $sigint_report = 0;
@@ -101,7 +106,7 @@ my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*)';
my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_taken=([0-9]*) contig_taken=([0-9]*) contig_dirty=([0-9]*) contig_failed=([0-9]*)';
my $regex_lru_shrink_inactive_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) priority=([0-9]*)';
my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)';
-my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) sync_io=([0-9]*)';
+my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) flags=([A-Z_|]*)';

# Dyanically discovered regex
my $regex_direct_begin;
@@ -209,7 +214,7 @@ $regex_lru_shrink_active = generate_traceevent_regex(
$regex_writepage = generate_traceevent_regex(
"vmscan/mm_vmscan_writepage",
$regex_writepage_default,
- "page", "pfn", "sync_io");
+ "page", "pfn", "flags");

sub read_statline($) {
my $pid = $_[0];
@@ -379,11 +384,27 @@ EVENT_PROCESS:
next;
}

- my $sync_io = $3;
+ my $flags = $3;
+ my $file = 0;
+ my $sync_io = 0;
+ if ($flags =~ /RECLAIM_WB_FILE/) {
+ $file = 1;
+ }
+ if ($flags =~ /RECLAIM_WB_SYNC/) {
+ $sync_io = 1;
+ }
if ($sync_io) {
- $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_SYNC}++;
+ if ($file) {
+ $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC}++;
+ } else {
+ $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}++;
+ }
} else {
- $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_ASYNC}++;
+ if ($file) {
+ $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC}++;
+ } else {
+ $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC}++;
+ }
}
} else {
$perprocesspid{$process_pid}->{EVENT_UNKNOWN}++;
@@ -427,7 +448,7 @@ sub dump_stats {
while (defined $stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index] ||
defined $stats{$process_pid}->{HIGH_KSWAPD_LATENCY}[$index]) {

- if ($stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index]) {
+ if ($stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index]) {
printf("%s ", $stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index]) if !$opt_ignorepid;
my ($dummy, $latency) = split(/-/, $stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index]);
$total_direct_latency += $latency;
@@ -454,8 +475,11 @@ sub dump_stats {
$total_direct_reclaim += $stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN};
$total_wakeup_kswapd += $stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD};
$total_direct_nr_scanned += $stats{$process_pid}->{HIGH_NR_SCANNED};
- $total_direct_writepage_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_SYNC};
- $total_direct_writepage_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ASYNC};
+ $total_direct_writepage_file_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC};
+ $total_direct_writepage_anon_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC};
+ $total_direct_writepage_file_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC};
+
+ $total_direct_writepage_anon_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC};

my $index = 0;
my $this_reclaim_delay = 0;
@@ -470,8 +494,8 @@ sub dump_stats {
$stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN},
$stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD},
$stats{$process_pid}->{HIGH_NR_SCANNED},
- $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_SYNC},
- $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ASYNC},
+ $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC},
+ $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC},
$this_reclaim_delay / 1000);

if ($stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}) {
@@ -515,16 +539,18 @@ sub dump_stats {

$total_kswapd_wake += $stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE};
$total_kswapd_nr_scanned += $stats{$process_pid}->{HIGH_NR_SCANNED};
- $total_kswapd_writepage_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_SYNC};
- $total_kswapd_writepage_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ASYNC};
+ $total_kswapd_writepage_file_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC};
+ $total_kswapd_writepage_anon_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC};
+ $total_kswapd_writepage_file_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC};
+ $total_kswapd_writepage_anon_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC};

printf("%-" . $max_strlen . "s %8d %10d %8u %8i %8u",
$process_pid,
$stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE},
$stats{$process_pid}->{HIGH_KSWAPD_REWAKEUP},
$stats{$process_pid}->{HIGH_NR_SCANNED},
- $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_SYNC},
- $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ASYNC});
+ $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC},
+ $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC});

if ($stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}) {
print " ";
@@ -551,18 +577,22 @@ sub dump_stats {
$total_direct_latency /= 1000;
$total_kswapd_latency /= 1000;
print "\nSummary\n";
- print "Direct reclaims: $total_direct_reclaim\n";
- print "Direct reclaim pages scanned: $total_direct_nr_scanned\n";
- print "Direct reclaim write sync I/O: $total_direct_writepage_sync\n";
- print "Direct reclaim write async I/O: $total_direct_writepage_async\n";
- print "Wake kswapd requests: $total_wakeup_kswapd\n";
- printf "Time stalled direct reclaim: %-1.2f ms\n", $total_direct_latency;
+ print "Direct reclaims: $total_direct_reclaim\n";
+ print "Direct reclaim pages scanned: $total_direct_nr_scanned\n";
+ print "Direct reclaim write file sync I/O: $total_direct_writepage_file_sync\n";
+ print "Direct reclaim write anon sync I/O: $total_direct_writepage_anon_sync\n";
+ print "Direct reclaim write file async I/O: $total_direct_writepage_file_async\n";
+ print "Direct reclaim write anon async I/O: $total_direct_writepage_anon_async\n";
+ print "Wake kswapd requests: $total_wakeup_kswapd\n";
+ printf "Time stalled direct reclaim: %-1.2f ms\n", $total_direct_latency;
print "\n";
- print "Kswapd wakeups: $total_kswapd_wake\n";
- print "Kswapd pages scanned: $total_kswapd_nr_scanned\n";
- print "Kswapd reclaim write sync I/O: $total_kswapd_writepage_sync\n";
- print "Kswapd reclaim write async I/O: $total_kswapd_writepage_async\n";
- printf "Time kswapd awake: %-1.2f ms\n", $total_kswapd_latency;
+ print "Kswapd wakeups: $total_kswapd_wake\n";
+ print "Kswapd pages scanned: $total_kswapd_nr_scanned\n";
+ print "Kswapd reclaim write file sync I/O: $total_kswapd_writepage_file_sync\n";
+ print "Kswapd reclaim write anon sync I/O: $total_kswapd_writepage_anon_sync\n";
+ print "Kswapd reclaim write file async I/O: $total_kswapd_writepage_file_async\n";
+ print "Kswapd reclaim write anon async I/O: $total_kswapd_writepage_anon_async\n";
+ printf "Time kswapd awake: %-1.2f ms\n", $total_kswapd_latency;
}

sub aggregate_perprocesspid() {
@@ -582,8 +612,10 @@ sub aggregate_perprocesspid() {
$perprocess{$process}->{MM_VMSCAN_WAKEUP_KSWAPD} += $perprocesspid{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD};
$perprocess{$process}->{HIGH_KSWAPD_REWAKEUP} += $perprocesspid{$process_pid}->{HIGH_KSWAPD_REWAKEUP};
$perprocess{$process}->{HIGH_NR_SCANNED} += $perprocesspid{$process_pid}->{HIGH_NR_SCANNED};
- $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_SYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_SYNC};
- $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_ASYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_ASYNC};
+ $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC};
+ $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC};
+ $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC};
+ $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC};

for (my $order = 0; $order < 20; $order++) {
$perprocess{$process}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN_PERORDER}[$order] += $perprocesspid{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN_PERORDER}[$order];
--
1.7.1

2010-07-30 13:38:31

[permalink] [raw]

Subject: [PATCH 1/6] vmscan: tracing: Roll up of patches currently in mmotm

This is a roll-up of patches currently in mmotm related to stack reduction and
tracing reclaim. It is based on 2.6.35-rc6 and included for the convenience
of testing.

No signed off required.
---
.../trace/postprocess/trace-vmscan-postprocess.pl | 654 ++++++++++++++++++++
include/linux/memcontrol.h | 5 -
include/linux/mmzone.h | 15 -
include/trace/events/gfpflags.h | 37 ++
include/trace/events/kmem.h | 38 +--
include/trace/events/vmscan.h | 184 ++++++
mm/memcontrol.c | 31 -
mm/page_alloc.c | 2 -
mm/vmscan.c | 429 +++++++-------
mm/vmstat.c | 2 -
10 files changed, 1095 insertions(+), 302 deletions(-)
create mode 100644 Documentation/trace/postprocess/trace-vmscan-postprocess.pl
create mode 100644 include/trace/events/gfpflags.h
create mode 100644 include/trace/events/vmscan.h

diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
new file mode 100644
index 0000000..d1ddc33
--- /dev/null
+++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
@@ -0,0 +1,654 @@
+#!/usr/bin/perl
+# This is a POC for reading the text representation of trace output related to
+# page reclaim. It makes an attempt to extract some high-level information on
+# what is going on. The accuracy of the parser may vary
+#
+# Example usage: trace-vmscan-postprocess.pl < /sys/kernel/debug/tracing/trace_pipe
+# other options
+# --read-procstat If the trace lacks process info, get it from /proc
+# --ignore-pid Aggregate processes of the same name together
+#
+# Copyright (c) IBM Corporation 2009
+# Author: Mel Gorman <[email protected]>
+use strict;
+use Getopt::Long;
+
+# Tracepoint events
+use constant MM_VMSCAN_DIRECT_RECLAIM_BEGIN => 1;
+use constant MM_VMSCAN_DIRECT_RECLAIM_END => 2;
+use constant MM_VMSCAN_KSWAPD_WAKE => 3;
+use constant MM_VMSCAN_KSWAPD_SLEEP => 4;
+use constant MM_VMSCAN_LRU_SHRINK_ACTIVE => 5;
+use constant MM_VMSCAN_LRU_SHRINK_INACTIVE => 6;
+use constant MM_VMSCAN_LRU_ISOLATE => 7;
+use constant MM_VMSCAN_WRITEPAGE_SYNC => 8;
+use constant MM_VMSCAN_WRITEPAGE_ASYNC => 9;
+use constant EVENT_UNKNOWN => 10;
+
+# Per-order events
+use constant MM_VMSCAN_DIRECT_RECLAIM_BEGIN_PERORDER => 11;
+use constant MM_VMSCAN_WAKEUP_KSWAPD_PERORDER => 12;
+use constant MM_VMSCAN_KSWAPD_WAKE_PERORDER => 13;
+use constant HIGH_KSWAPD_REWAKEUP_PERORDER => 14;
+
+# Constants used to track state
+use constant STATE_DIRECT_BEGIN => 15;
+use constant STATE_DIRECT_ORDER => 16;
+use constant STATE_KSWAPD_BEGIN => 17;
+use constant STATE_KSWAPD_ORDER => 18;
+
+# High-level events extrapolated from tracepoints
+use constant HIGH_DIRECT_RECLAIM_LATENCY => 19;
+use constant HIGH_KSWAPD_LATENCY => 20;
+use constant HIGH_KSWAPD_REWAKEUP => 21;
+use constant HIGH_NR_SCANNED => 22;
+use constant HIGH_NR_TAKEN => 23;
+use constant HIGH_NR_RECLAIM => 24;
+use constant HIGH_NR_CONTIG_DIRTY => 25;
+
+my %perprocesspid;
+my %perprocess;
+my %last_procmap;
+my $opt_ignorepid;
+my $opt_read_procstat;
+
+my $total_wakeup_kswapd;
+my ($total_direct_reclaim, $total_direct_nr_scanned);
+my ($total_direct_latency, $total_kswapd_latency);
+my ($total_direct_writepage_sync, $total_direct_writepage_async);
+my ($total_kswapd_nr_scanned, $total_kswapd_wake);
+my ($total_kswapd_writepage_sync, $total_kswapd_writepage_async);
+
+# Catch sigint and exit on request
+my $sigint_report = 0;
+my $sigint_exit = 0;
+my $sigint_pending = 0;
+my $sigint_received = 0;
+sub sigint_handler {
+ my $current_time = time;
+ if ($current_time - 2 > $sigint_received) {
+ print "SIGINT received, report pending. Hit ctrl-c again to exit\n";
+ $sigint_report = 1;
+ } else {
+ if (!$sigint_exit) {
+ print "Second SIGINT received quickly, exiting\n";
+ }
+ $sigint_exit++;
+ }
+
+ if ($sigint_exit > 3) {
+ print "Many SIGINTs received, exiting now without report\n";
+ exit;
+ }
+
+ $sigint_received = $current_time;
+ $sigint_pending = 1;
+}
+$SIG{INT} = "sigint_handler";
+
+# Parse command line options
+GetOptions(
+ 'ignore-pid' => \$opt_ignorepid,
+ 'read-procstat' => \$opt_read_procstat,
+);
+
+# Defaults for dynamically discovered regex's
+my $regex_direct_begin_default = 'order=([0-9]*) may_writepage=([0-9]*) gfp_flags=([A-Z_|]*)';
+my $regex_direct_end_default = 'nr_reclaimed=([0-9]*)';
+my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)';
+my $regex_kswapd_sleep_default = 'nid=([0-9]*)';
+my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*)';
+my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_taken=([0-9]*) contig_taken=([0-9]*) contig_dirty=([0-9]*) contig_failed=([0-9]*)';
+my $regex_lru_shrink_inactive_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) priority=([0-9]*)';
+my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)';
+my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) sync_io=([0-9]*)';
+
+# Dyanically discovered regex
+my $regex_direct_begin;
+my $regex_direct_end;
+my $regex_kswapd_wake;
+my $regex_kswapd_sleep;
+my $regex_wakeup_kswapd;
+my $regex_lru_isolate;
+my $regex_lru_shrink_inactive;
+my $regex_lru_shrink_active;
+my $regex_writepage;
+
+# Static regex used. Specified like this for readability and for use with /o
+# (process_pid) (cpus ) ( time ) (tpoint ) (details)
+my $regex_traceevent = '\s*([a-zA-Z0-9-]*)\s*(\[[0-9]*\])\s*([0-9.]*):\s*([a-zA-Z_]*):\s*(.*)';
+my $regex_statname = '[-0-9]*\s$(.*)$.*';
+my $regex_statppid = '[-0-9]*\s$.*$\s[A-Za-z]\s([0-9]*).*';
+
+sub generate_traceevent_regex {
+ my $event = shift;
+ my $default = shift;
+ my $regex;
+
+ # Read the event format or use the default
+ if (!open (FORMAT, "/sys/kernel/debug/tracing/events/$event/format")) {
+ print("WARNING: Event $event format string not found\n");
+ return $default;
+ } else {
+ my $line;
+ while (!eof(FORMAT)) {
+ $line = <FORMAT>;
+ $line =~ s/, REC->.*//;
+ if ($line =~ /^print fmt:\s"(.*)".*/) {
+ $regex = $1;
+ $regex =~ s/%s/$[0-9a-zA-Z|_]*$/g;
+ $regex =~ s/%p/$[0-9a-f]*$/g;
+ $regex =~ s/%d/$[-0-9]*$/g;
+ $regex =~ s/%ld/$[-0-9]*$/g;
+ $regex =~ s/%lu/$[0-9]*$/g;
+ }
+ }
+ }
+
+ # Can't handle the print_flags stuff but in the context of this
+ # script, it really doesn't matter
+ $regex =~ s/$REC.*$ \? __print_flags.*//;
+
+ # Verify fields are in the right order
+ my $tuple;
+ foreach $tuple (split /\s/, $regex) {
+ my ($key, $value) = split(/=/, $tuple);
+ my $expected = shift;
+ if ($key ne $expected) {
+ print("WARNING: Format not as expected for event $event '$key' != '$expected'\n");
+ $regex =~ s/$key=$(.*)$/$key=$1/;
+ }
+ }
+
+ if (defined shift) {
+ die("Fewer fields than expected in format");
+ }
+
+ return $regex;
+}
+
+$regex_direct_begin = generate_traceevent_regex(
+ "vmscan/mm_vmscan_direct_reclaim_begin",
+ $regex_direct_begin_default,
+ "order", "may_writepage",
+ "gfp_flags");
+$regex_direct_end = generate_traceevent_regex(
+ "vmscan/mm_vmscan_direct_reclaim_end",
+ $regex_direct_end_default,
+ "nr_reclaimed");
+$regex_kswapd_wake = generate_traceevent_regex(
+ "vmscan/mm_vmscan_kswapd_wake",
+ $regex_kswapd_wake_default,
+ "nid", "order");
+$regex_kswapd_sleep = generate_traceevent_regex(
+ "vmscan/mm_vmscan_kswapd_sleep",
+ $regex_kswapd_sleep_default,
+ "nid");
+$regex_wakeup_kswapd = generate_traceevent_regex(
+ "vmscan/mm_vmscan_wakeup_kswapd",
+ $regex_wakeup_kswapd_default,
+ "nid", "zid", "order");
+$regex_lru_isolate = generate_traceevent_regex(
+ "vmscan/mm_vmscan_lru_isolate",
+ $regex_lru_isolate_default,
+ "isolate_mode", "order",
+ "nr_requested", "nr_scanned", "nr_taken",
+ "contig_taken", "contig_dirty", "contig_failed");
+$regex_lru_shrink_inactive = generate_traceevent_regex(
+ "vmscan/mm_vmscan_lru_shrink_inactive",
+ $regex_lru_shrink_inactive_default,
+ "nid", "zid",
+ "lru",
+ "nr_scanned", "nr_reclaimed", "priority");
+$regex_lru_shrink_active = generate_traceevent_regex(
+ "vmscan/mm_vmscan_lru_shrink_active",
+ $regex_lru_shrink_active_default,
+ "nid", "zid",
+ "lru",
+ "nr_scanned", "nr_rotated", "priority");
+$regex_writepage = generate_traceevent_regex(
+ "vmscan/mm_vmscan_writepage",
+ $regex_writepage_default,
+ "page", "pfn", "sync_io");
+
+sub read_statline($) {
+ my $pid = $_[0];
+ my $statline;
+
+ if (open(STAT, "/proc/$pid/stat")) {
+ $statline = <STAT>;
+ close(STAT);
+ }
+
+ if ($statline eq '') {
+ $statline = "-1 (UNKNOWN_PROCESS_NAME) R 0";
+ }
+
+ return $statline;
+}
+
+sub guess_process_pid($$) {
+ my $pid = $_[0];
+ my $statline = $_[1];
+
+ if ($pid == 0) {
+ return "swapper-0";
+ }
+
+ if ($statline !~ /$regex_statname/o) {
+ die("Failed to math stat line for process name :: $statline");
+ }
+ return "$1-$pid";
+}
+
+# Convert sec.usec timestamp format
+sub timestamp_to_ms($) {
+ my $timestamp = $_[0];
+
+ my ($sec, $usec) = split (/\./, $timestamp);
+ return ($sec * 1000) + ($usec / 1000);
+}
+
+sub process_events {
+ my $traceevent;
+ my $process_pid;
+ my $cpus;
+ my $timestamp;
+ my $tracepoint;
+ my $details;
+ my $statline;
+
+ # Read each line of the event log
+EVENT_PROCESS:
+ while ($traceevent = <STDIN>) {
+ if ($traceevent =~ /$regex_traceevent/o) {
+ $process_pid = $1;
+ $timestamp = $3;
+ $tracepoint = $4;
+
+ $process_pid =~ /(.*)-([0-9]*)$/;
+ my $process = $1;
+ my $pid = $2;
+
+ if ($process eq "") {
+ $process = $last_procmap{$pid};
+ $process_pid = "$process-$pid";
+ }
+ $last_procmap{$pid} = $process;
+
+ if ($opt_read_procstat) {
+ $statline = read_statline($pid);
+ if ($opt_read_procstat && $process eq '') {
+ $process_pid = guess_process_pid($pid, $statline);
+ }
+ }
+ } else {
+ next;
+ }
+
+ # Perl Switch() sucks majorly
+ if ($tracepoint eq "mm_vmscan_direct_reclaim_begin") {
+ $timestamp = timestamp_to_ms($timestamp);
+ $perprocesspid{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}++;
+ $perprocesspid{$process_pid}->{STATE_DIRECT_BEGIN} = $timestamp;
+
+ $details = $5;
+ if ($details !~ /$regex_direct_begin/o) {
+ print "WARNING: Failed to parse mm_vmscan_direct_reclaim_begin as expected\n";
+ print " $details\n";
+ print " $regex_direct_begin\n";
+ next;
+ }
+ my $order = $1;
+ $perprocesspid{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN_PERORDER}[$order]++;
+ $perprocesspid{$process_pid}->{STATE_DIRECT_ORDER} = $order;
+ } elsif ($tracepoint eq "mm_vmscan_direct_reclaim_end") {
+ # Count the event itself
+ my $index = $perprocesspid{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_END};
+ $perprocesspid{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_END}++;
+
+ # Record how long direct reclaim took this time
+ if (defined $perprocesspid{$process_pid}->{STATE_DIRECT_BEGIN}) {
+ $timestamp = timestamp_to_ms($timestamp);
+ my $order = $perprocesspid{$process_pid}->{STATE_DIRECT_ORDER};
+ my $latency = ($timestamp - $perprocesspid{$process_pid}->{STATE_DIRECT_BEGIN});
+ $perprocesspid{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index] = "$order-$latency";
+ }
+ } elsif ($tracepoint eq "mm_vmscan_kswapd_wake") {
+ $details = $5;
+ if ($details !~ /$regex_kswapd_wake/o) {
+ print "WARNING: Failed to parse mm_vmscan_kswapd_wake as expected\n";
+ print " $details\n";
+ print " $regex_kswapd_wake\n";
+ next;
+ }
+
+ my $order = $2;
+ $perprocesspid{$process_pid}->{STATE_KSWAPD_ORDER} = $order;
+ if (!$perprocesspid{$process_pid}->{STATE_KSWAPD_BEGIN}) {
+ $timestamp = timestamp_to_ms($timestamp);
+ $perprocesspid{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}++;
+ $perprocesspid{$process_pid}->{STATE_KSWAPD_BEGIN} = $timestamp;
+ $perprocesspid{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE_PERORDER}[$order]++;
+ } else {
+ $perprocesspid{$process_pid}->{HIGH_KSWAPD_REWAKEUP}++;
+ $perprocesspid{$process_pid}->{HIGH_KSWAPD_REWAKEUP_PERORDER}[$order]++;
+ }
+ } elsif ($tracepoint eq "mm_vmscan_kswapd_sleep") {
+
+ # Count the event itself
+ my $index = $perprocesspid{$process_pid}->{MM_VMSCAN_KSWAPD_SLEEP};
+ $perprocesspid{$process_pid}->{MM_VMSCAN_KSWAPD_SLEEP}++;
+
+ # Record how long kswapd was awake
+ $timestamp = timestamp_to_ms($timestamp);
+ my $order = $perprocesspid{$process_pid}->{STATE_KSWAPD_ORDER};
+ my $latency = ($timestamp - $perprocesspid{$process_pid}->{STATE_KSWAPD_BEGIN});
+ $perprocesspid{$process_pid}->{HIGH_KSWAPD_LATENCY}[$index] = "$order-$latency";
+ $perprocesspid{$process_pid}->{STATE_KSWAPD_BEGIN} = 0;
+ } elsif ($tracepoint eq "mm_vmscan_wakeup_kswapd") {
+ $perprocesspid{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}++;
+
+ $details = $5;
+ if ($details !~ /$regex_wakeup_kswapd/o) {
+ print "WARNING: Failed to parse mm_vmscan_wakeup_kswapd as expected\n";
+ print " $details\n";
+ print " $regex_wakeup_kswapd\n";
+ next;
+ }
+ my $order = $3;
+ $perprocesspid{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD_PERORDER}[$order]++;
+ } elsif ($tracepoint eq "mm_vmscan_lru_isolate") {
+ $details = $5;
+ if ($details !~ /$regex_lru_isolate/o) {
+ print "WARNING: Failed to parse mm_vmscan_lru_isolate as expected\n";
+ print " $details\n";
+ print " $regex_lru_isolate/o\n";
+ next;
+ }
+ my $nr_scanned = $4;
+ my $nr_contig_dirty = $7;
+ $perprocesspid{$process_pid}->{HIGH_NR_SCANNED} += $nr_scanned;
+ $perprocesspid{$process_pid}->{HIGH_NR_CONTIG_DIRTY} += $nr_contig_dirty;
+ } elsif ($tracepoint eq "mm_vmscan_writepage") {
+ $details = $5;
+ if ($details !~ /$regex_writepage/o) {
+ print "WARNING: Failed to parse mm_vmscan_writepage as expected\n";
+ print " $details\n";
+ print " $regex_writepage\n";
+ next;
+ }
+
+ my $sync_io = $3;
+ if ($sync_io) {
+ $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_SYNC}++;
+ } else {
+ $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_ASYNC}++;
+ }
+ } else {
+ $perprocesspid{$process_pid}->{EVENT_UNKNOWN}++;
+ }
+
+ if ($sigint_pending) {
+ last EVENT_PROCESS;
+ }
+ }
+}
+
+sub dump_stats {
+ my $hashref = shift;
+ my %stats = %$hashref;
+
+ # Dump per-process stats
+ my $process_pid;
+ my $max_strlen = 0;
+
+ # Get the maximum process name
+ foreach $process_pid (keys %perprocesspid) {
+ my $len = length($process_pid);
+ if ($len > $max_strlen) {
+ $max_strlen = $len;
+ }
+ }
+ $max_strlen += 2;
+
+ # Work out latencies
+ printf("\n") if !$opt_ignorepid;
+ printf("Reclaim latencies expressed as order-latency_in_ms\n") if !$opt_ignorepid;
+ foreach $process_pid (keys %stats) {
+
+ if (!$stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[0] &&
+ !$stats{$process_pid}->{HIGH_KSWAPD_LATENCY}[0]) {
+ next;
+ }
+
+ printf "%-" . $max_strlen . "s ", $process_pid if !$opt_ignorepid;
+ my $index = 0;
+ while (defined $stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index] ||
+ defined $stats{$process_pid}->{HIGH_KSWAPD_LATENCY}[$index]) {
+
+ if ($stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index]) {
+ printf("%s ", $stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index]) if !$opt_ignorepid;
+ my ($dummy, $latency) = split(/-/, $stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index]);
+ $total_direct_latency += $latency;
+ } else {
+ printf("%s ", $stats{$process_pid}->{HIGH_KSWAPD_LATENCY}[$index]) if !$opt_ignorepid;
+ my ($dummy, $latency) = split(/-/, $stats{$process_pid}->{HIGH_KSWAPD_LATENCY}[$index]);
+ $total_kswapd_latency += $latency;
+ }
+ $index++;
+ }
+ print "\n" if !$opt_ignorepid;
+ }
+
+ # Print out process activity
+ printf("\n");
+ printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s\n", "Process", "Direct", "Wokeup", "Pages", "Pages", "Pages", "Time");
+ printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s\n", "details", "Rclms", "Kswapd", "Scanned", "Sync-IO", "ASync-IO", "Stalled");
+ foreach $process_pid (keys %stats) {
+
+ if (!$stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}) {
+ next;
+ }
+
+ $total_direct_reclaim += $stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN};
+ $total_wakeup_kswapd += $stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD};
+ $total_direct_nr_scanned += $stats{$process_pid}->{HIGH_NR_SCANNED};
+ $total_direct_writepage_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_SYNC};
+ $total_direct_writepage_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ASYNC};
+
+ my $index = 0;
+ my $this_reclaim_delay = 0;
+ while (defined $stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index]) {
+ my ($dummy, $latency) = split(/-/, $stats{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$index]);
+ $this_reclaim_delay += $latency;
+ $index++;
+ }
+
+ printf("%-" . $max_strlen . "s %8d %10d %8u %8u %8u %8.3f",
+ $process_pid,
+ $stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN},
+ $stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD},
+ $stats{$process_pid}->{HIGH_NR_SCANNED},
+ $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_SYNC},
+ $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ASYNC},
+ $this_reclaim_delay / 1000);
+
+ if ($stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}) {
+ print " ";
+ for (my $order = 0; $order < 20; $order++) {
+ my $count = $stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN_PERORDER}[$order];
+ if ($count != 0) {
+ print "direct-$order=$count ";
+ }
+ }
+ }
+ if ($stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}) {
+ print " ";
+ for (my $order = 0; $order < 20; $order++) {
+ my $count = $stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD_PERORDER}[$order];
+ if ($count != 0) {
+ print "wakeup-$order=$count ";
+ }
+ }
+ }
+ if ($stats{$process_pid}->{HIGH_NR_CONTIG_DIRTY}) {
+ print " ";
+ my $count = $stats{$process_pid}->{HIGH_NR_CONTIG_DIRTY};
+ if ($count != 0) {
+ print "contig-dirty=$count ";
+ }
+ }
+
+ print "\n";
+ }
+
+ # Print out kswapd activity
+ printf("\n");
+ printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s\n", "Kswapd", "Kswapd", "Order", "Pages", "Pages", "Pages");
+ printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s\n", "Instance", "Wakeups", "Re-wakeup", "Scanned", "Sync-IO", "ASync-IO");
+ foreach $process_pid (keys %stats) {
+
+ if (!$stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}) {
+ next;
+ }
+
+ $total_kswapd_wake += $stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE};
+ $total_kswapd_nr_scanned += $stats{$process_pid}->{HIGH_NR_SCANNED};
+ $total_kswapd_writepage_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_SYNC};
+ $total_kswapd_writepage_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ASYNC};
+
+ printf("%-" . $max_strlen . "s %8d %10d %8u %8i %8u",
+ $process_pid,
+ $stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE},
+ $stats{$process_pid}->{HIGH_KSWAPD_REWAKEUP},
+ $stats{$process_pid}->{HIGH_NR_SCANNED},
+ $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_SYNC},
+ $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ASYNC});
+
+ if ($stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}) {
+ print " ";
+ for (my $order = 0; $order < 20; $order++) {
+ my $count = $stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE_PERORDER}[$order];
+ if ($count != 0) {
+ print "wake-$order=$count ";
+ }
+ }
+ }
+ if ($stats{$process_pid}->{HIGH_KSWAPD_REWAKEUP}) {
+ print " ";
+ for (my $order = 0; $order < 20; $order++) {
+ my $count = $stats{$process_pid}->{HIGH_KSWAPD_REWAKEUP_PERORDER}[$order];
+ if ($count != 0) {
+ print "rewake-$order=$count ";
+ }
+ }
+ }
+ printf("\n");
+ }
+
+ # Print out summaries
+ $total_direct_latency /= 1000;
+ $total_kswapd_latency /= 1000;
+ print "\nSummary\n";
+ print "Direct reclaims: $total_direct_reclaim\n";
+ print "Direct reclaim pages scanned: $total_direct_nr_scanned\n";
+ print "Direct reclaim write sync I/O: $total_direct_writepage_sync\n";
+ print "Direct reclaim write async I/O: $total_direct_writepage_async\n";
+ print "Wake kswapd requests: $total_wakeup_kswapd\n";
+ printf "Time stalled direct reclaim: %-1.2f ms\n", $total_direct_latency;
+ print "\n";
+ print "Kswapd wakeups: $total_kswapd_wake\n";
+ print "Kswapd pages scanned: $total_kswapd_nr_scanned\n";
+ print "Kswapd reclaim write sync I/O: $total_kswapd_writepage_sync\n";
+ print "Kswapd reclaim write async I/O: $total_kswapd_writepage_async\n";
+ printf "Time kswapd awake: %-1.2f ms\n", $total_kswapd_latency;
+}
+
+sub aggregate_perprocesspid() {
+ my $process_pid;
+ my $process;
+ undef %perprocess;
+
+ foreach $process_pid (keys %perprocesspid) {
+ $process = $process_pid;
+ $process =~ s/-([0-9])*$//;
+ if ($process eq '') {
+ $process = "NO_PROCESS_NAME";
+ }
+
+ $perprocess{$process}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN} += $perprocesspid{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN};
+ $perprocess{$process}->{MM_VMSCAN_KSWAPD_WAKE} += $perprocesspid{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE};
+ $perprocess{$process}->{MM_VMSCAN_WAKEUP_KSWAPD} += $perprocesspid{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD};
+ $perprocess{$process}->{HIGH_KSWAPD_REWAKEUP} += $perprocesspid{$process_pid}->{HIGH_KSWAPD_REWAKEUP};
+ $perprocess{$process}->{HIGH_NR_SCANNED} += $perprocesspid{$process_pid}->{HIGH_NR_SCANNED};
+ $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_SYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_SYNC};
+ $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_ASYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_ASYNC};
+
+ for (my $order = 0; $order < 20; $order++) {
+ $perprocess{$process}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN_PERORDER}[$order] += $perprocesspid{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN_PERORDER}[$order];
+ $perprocess{$process}->{MM_VMSCAN_WAKEUP_KSWAPD_PERORDER}[$order] += $perprocesspid{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD_PERORDER}[$order];
+ $perprocess{$process}->{MM_VMSCAN_KSWAPD_WAKE_PERORDER}[$order] += $perprocesspid{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE_PERORDER}[$order];
+
+ }
+
+ # Aggregate direct reclaim latencies
+ my $wr_index = $perprocess{$process}->{MM_VMSCAN_DIRECT_RECLAIM_END};
+ my $rd_index = 0;
+ while (defined $perprocesspid{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$rd_index]) {
+ $perprocess{$process}->{HIGH_DIRECT_RECLAIM_LATENCY}[$wr_index] = $perprocesspid{$process_pid}->{HIGH_DIRECT_RECLAIM_LATENCY}[$rd_index];
+ $rd_index++;
+ $wr_index++;
+ }
+ $perprocess{$process}->{MM_VMSCAN_DIRECT_RECLAIM_END} = $wr_index;
+
+ # Aggregate kswapd latencies
+ my $wr_index = $perprocess{$process}->{MM_VMSCAN_KSWAPD_SLEEP};
+ my $rd_index = 0;
+ while (defined $perprocesspid{$process_pid}->{HIGH_KSWAPD_LATENCY}[$rd_index]) {
+ $perprocess{$process}->{HIGH_KSWAPD_LATENCY}[$wr_index] = $perprocesspid{$process_pid}->{HIGH_KSWAPD_LATENCY}[$rd_index];
+ $rd_index++;
+ $wr_index++;
+ }
+ $perprocess{$process}->{MM_VMSCAN_DIRECT_RECLAIM_END} = $wr_index;
+ }
+}
+
+sub report() {
+ if (!$opt_ignorepid) {
+ dump_stats(\%perprocesspid);
+ } else {
+ aggregate_perprocesspid();
+ dump_stats(\%perprocess);
+ }
+}
+
+# Process events or signals until neither is available
+sub signal_loop() {
+ my $sigint_processed;
+ do {
+ $sigint_processed = 0;
+ process_events();
+
+ # Handle pending signals if any
+ if ($sigint_pending) {
+ my $current_time = time;
+
+ if ($sigint_exit) {
+ print "Received exit signal\n";
+ $sigint_pending = 0;
+ }
+ if ($sigint_report) {
+ if ($current_time >= $sigint_received + 2) {
+ report();
+ $sigint_report = 0;
+ $sigint_pending = 0;
+ $sigint_processed = 1;
+ }
+ }
+ }
+ } while ($sigint_pending || $sigint_processed);
+}
+
+signal_loop();
+report();
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9411d32..9f1afd3 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -98,11 +98,6 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *mem,
/*
* For memory reclaim.
*/
-extern int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem);
-extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem,
- int priority);
-extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem,
- int priority);
int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b4d109e..b578eee 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -348,21 +348,6 @@ struct zone {
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];

/*
- * prev_priority holds the scanning priority for this zone. It is
- * defined as the scanning priority at which we achieved our reclaim
- * target at the previous try_to_free_pages() or balance_pgdat()
- * invocation.
- *
- * We use prev_priority as a measure of how much stress page reclaim is
- * under - it drives the swappiness decision: whether to unmap mapped
- * pages.
- *
- * Access to both this field is quite racy even on uniprocessor. But
- * it is expected to average out OK.
- */
- int prev_priority;
-
- /*
* The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
* this zone's LRU. Maintained by the pageout code.
*/
diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h
new file mode 100644
index 0000000..e3615c0
--- /dev/null
+++ b/include/trace/events/gfpflags.h
@@ -0,0 +1,37 @@
+/*
+ * The order of these masks is important. Matching masks will be seen
+ * first and the left over flags will end up showing by themselves.
+ *
+ * For example, if we have GFP_KERNEL before GFP_USER we wil get:
+ *
+ * GFP_KERNEL|GFP_HARDWALL
+ *
+ * Thus most bits set go first.
+ */
+#define show_gfp_flags(flags) \
+ (flags) ? __print_flags(flags, "|", \
+ {(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"}, \
+ {(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \
+ {(unsigned long)GFP_USER, "GFP_USER"}, \
+ {(unsigned long)GFP_TEMPORARY, "GFP_TEMPORARY"}, \
+ {(unsigned long)GFP_KERNEL, "GFP_KERNEL"}, \
+ {(unsigned long)GFP_NOFS, "GFP_NOFS"}, \
+ {(unsigned long)GFP_ATOMIC, "GFP_ATOMIC"}, \
+ {(unsigned long)GFP_NOIO, "GFP_NOIO"}, \
+ {(unsigned long)__GFP_HIGH, "GFP_HIGH"}, \
+ {(unsigned long)__GFP_WAIT, "GFP_WAIT"}, \
+ {(unsigned long)__GFP_IO, "GFP_IO"}, \
+ {(unsigned long)__GFP_COLD, "GFP_COLD"}, \
+ {(unsigned long)__GFP_NOWARN, "GFP_NOWARN"}, \
+ {(unsigned long)__GFP_REPEAT, "GFP_REPEAT"}, \
+ {(unsigned long)__GFP_NOFAIL, "GFP_NOFAIL"}, \
+ {(unsigned long)__GFP_NORETRY, "GFP_NORETRY"}, \
+ {(unsigned long)__GFP_COMP, "GFP_COMP"}, \
+ {(unsigned long)__GFP_ZERO, "GFP_ZERO"}, \
+ {(unsigned long)__GFP_NOMEMALLOC, "GFP_NOMEMALLOC"}, \
+ {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \
+ {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \
+ {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \
+ {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"} \
+ ) : "GFP_NOWAIT"
+
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index 3adca0c..a9c87ad 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -6,43 +6,7 @@

#include <linux/types.h>
#include <linux/tracepoint.h>
-
-/*
- * The order of these masks is important. Matching masks will be seen
- * first and the left over flags will end up showing by themselves.
- *
- * For example, if we have GFP_KERNEL before GFP_USER we wil get:
- *
- * GFP_KERNEL|GFP_HARDWALL
- *
- * Thus most bits set go first.
- */
-#define show_gfp_flags(flags) \
- (flags) ? __print_flags(flags, "|", \
- {(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"}, \
- {(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \
- {(unsigned long)GFP_USER, "GFP_USER"}, \
- {(unsigned long)GFP_TEMPORARY, "GFP_TEMPORARY"}, \
- {(unsigned long)GFP_KERNEL, "GFP_KERNEL"}, \
- {(unsigned long)GFP_NOFS, "GFP_NOFS"}, \
- {(unsigned long)GFP_ATOMIC, "GFP_ATOMIC"}, \
- {(unsigned long)GFP_NOIO, "GFP_NOIO"}, \
- {(unsigned long)__GFP_HIGH, "GFP_HIGH"}, \
- {(unsigned long)__GFP_WAIT, "GFP_WAIT"}, \
- {(unsigned long)__GFP_IO, "GFP_IO"}, \
- {(unsigned long)__GFP_COLD, "GFP_COLD"}, \
- {(unsigned long)__GFP_NOWARN, "GFP_NOWARN"}, \
- {(unsigned long)__GFP_REPEAT, "GFP_REPEAT"}, \
- {(unsigned long)__GFP_NOFAIL, "GFP_NOFAIL"}, \
- {(unsigned long)__GFP_NORETRY, "GFP_NORETRY"}, \
- {(unsigned long)__GFP_COMP, "GFP_COMP"}, \
- {(unsigned long)__GFP_ZERO, "GFP_ZERO"}, \
- {(unsigned long)__GFP_NOMEMALLOC, "GFP_NOMEMALLOC"}, \
- {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \
- {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \
- {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \
- {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"} \
- ) : "GFP_NOWAIT"
+#include "gfpflags.h"

DECLARE_EVENT_CLASS(kmem_alloc,

diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
new file mode 100644
index 0000000..f2da66a
--- /dev/null
+++ b/include/trace/events/vmscan.h
@@ -0,0 +1,184 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM vmscan
+
+#if !defined(_TRACE_VMSCAN_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_VMSCAN_H
+
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+#include "gfpflags.h"
+
+TRACE_EVENT(mm_vmscan_kswapd_sleep,
+
+ TP_PROTO(int nid),
+
+ TP_ARGS(nid),
+
+ TP_STRUCT__entry(
+ __field( int, nid )
+ ),
+
+ TP_fast_assign(
+ __entry->nid = nid;
+ ),
+
+ TP_printk("nid=%d", __entry->nid)
+);
+
+TRACE_EVENT(mm_vmscan_kswapd_wake,
+
+ TP_PROTO(int nid, int order),
+
+ TP_ARGS(nid, order),
+
+ TP_STRUCT__entry(
+ __field( int, nid )
+ __field( int, order )
+ ),
+
+ TP_fast_assign(
+ __entry->nid = nid;
+ __entry->order = order;
+ ),
+
+ TP_printk("nid=%d order=%d", __entry->nid, __entry->order)
+);
+
+TRACE_EVENT(mm_vmscan_wakeup_kswapd,
+
+ TP_PROTO(int nid, int zid, int order),
+
+ TP_ARGS(nid, zid, order),
+
+ TP_STRUCT__entry(
+ __field( int, nid )
+ __field( int, zid )
+ __field( int, order )
+ ),
+
+ TP_fast_assign(
+ __entry->nid = nid;
+ __entry->zid = zid;
+ __entry->order = order;
+ ),
+
+ TP_printk("nid=%d zid=%d order=%d",
+ __entry->nid,
+ __entry->zid,
+ __entry->order)
+);
+
+TRACE_EVENT(mm_vmscan_direct_reclaim_begin,
+
+ TP_PROTO(int order, int may_writepage, gfp_t gfp_flags),
+
+ TP_ARGS(order, may_writepage, gfp_flags),
+
+ TP_STRUCT__entry(
+ __field( int, order )
+ __field( int, may_writepage )
+ __field( gfp_t, gfp_flags )
+ ),
+
+ TP_fast_assign(
+ __entry->order = order;
+ __entry->may_writepage = may_writepage;
+ __entry->gfp_flags = gfp_flags;
+ ),
+
+ TP_printk("order=%d may_writepage=%d gfp_flags=%s",
+ __entry->order,
+ __entry->may_writepage,
+ show_gfp_flags(__entry->gfp_flags))
+);
+
+TRACE_EVENT(mm_vmscan_direct_reclaim_end,
+
+ TP_PROTO(unsigned long nr_reclaimed),
+
+ TP_ARGS(nr_reclaimed),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, nr_reclaimed )
+ ),
+
+ TP_fast_assign(
+ __entry->nr_reclaimed = nr_reclaimed;
+ ),
+
+ TP_printk("nr_reclaimed=%lu", __entry->nr_reclaimed)
+);
+
+TRACE_EVENT(mm_vmscan_lru_isolate,
+
+ TP_PROTO(int order,
+ unsigned long nr_requested,
+ unsigned long nr_scanned,
+ unsigned long nr_taken,
+ unsigned long nr_lumpy_taken,
+ unsigned long nr_lumpy_dirty,
+ unsigned long nr_lumpy_failed,
+ int isolate_mode),
+
+ TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode),
+
+ TP_STRUCT__entry(
+ __field(int, order)
+ __field(unsigned long, nr_requested)
+ __field(unsigned long, nr_scanned)
+ __field(unsigned long, nr_taken)
+ __field(unsigned long, nr_lumpy_taken)
+ __field(unsigned long, nr_lumpy_dirty)
+ __field(unsigned long, nr_lumpy_failed)
+ __field(int, isolate_mode)
+ ),
+
+ TP_fast_assign(
+ __entry->order = order;
+ __entry->nr_requested = nr_requested;
+ __entry->nr_scanned = nr_scanned;
+ __entry->nr_taken = nr_taken;
+ __entry->nr_lumpy_taken = nr_lumpy_taken;
+ __entry->nr_lumpy_dirty = nr_lumpy_dirty;
+ __entry->nr_lumpy_failed = nr_lumpy_failed;
+ __entry->isolate_mode = isolate_mode;
+ ),
+
+ TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu contig_taken=%lu contig_dirty=%lu contig_failed=%lu",
+ __entry->isolate_mode,
+ __entry->order,
+ __entry->nr_requested,
+ __entry->nr_scanned,
+ __entry->nr_taken,
+ __entry->nr_lumpy_taken,
+ __entry->nr_lumpy_dirty,
+ __entry->nr_lumpy_failed)
+);
+
+TRACE_EVENT(mm_vmscan_writepage,
+
+ TP_PROTO(struct page *page,
+ int sync_io),
+
+ TP_ARGS(page, sync_io),
+
+ TP_STRUCT__entry(
+ __field(struct page *, page)
+ __field(int, sync_io)
+ ),
+
+ TP_fast_assign(
+ __entry->page = page;
+ __entry->sync_io = sync_io;
+ ),
+
+ TP_printk("page=%p pfn=%lu sync_io=%d",
+ __entry->page,
+ page_to_pfn(__entry->page),
+ __entry->sync_io)
+);
+
+#endif /* _TRACE_VMSCAN_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 20a8193..31abd1c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -211,8 +211,6 @@ struct mem_cgroup {
*/
spinlock_t reclaim_param_lock;

- int prev_priority; /* for recording reclaim priority */
-
/*
* While reclaiming in a hierarchy, we cache the last child we
* reclaimed from.
@@ -858,35 +856,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
return ret;
}

-/*
- * prev_priority control...this will be used in memory reclaim path.
- */
-int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
-{
- int prev_priority;
-
- spin_lock(&mem->reclaim_param_lock);
- prev_priority = mem->prev_priority;
- spin_unlock(&mem->reclaim_param_lock);
-
- return prev_priority;
-}
-
-void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
-{
- spin_lock(&mem->reclaim_param_lock);
- if (priority < mem->prev_priority)
- mem->prev_priority = priority;
- spin_unlock(&mem->reclaim_param_lock);
-}
-
-void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
-{
- spin_lock(&mem->reclaim_param_lock);
- mem->prev_priority = priority;
- spin_unlock(&mem->reclaim_param_lock);
-}
-
static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
{
unsigned long active;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9bd339e..eefc8b5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4089,8 +4089,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
zone_seqlock_init(zone);
zone->zone_pgdat = pgdat;

- zone->prev_priority = DEF_PRIORITY;
-
zone_pcp_init(zone);
for_each_lru(l) {
INIT_LIST_HEAD(&zone->lru[l].list);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b94fe1b..63447ff 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -48,6 +48,9 @@

#include "internal.h"

+#define CREATE_TRACE_POINTS
+#include <trace/events/vmscan.h>
+
struct scan_control {
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
@@ -398,6 +401,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
}
+ trace_mm_vmscan_writepage(page,
+ sync_writeback == PAGEOUT_IO_SYNC);
inc_zone_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
@@ -617,6 +622,24 @@ static enum page_references page_check_references(struct page *page,
return PAGEREF_RECLAIM;
}

+static noinline_for_stack void free_page_list(struct list_head *free_pages)
+{
+ struct pagevec freed_pvec;
+ struct page *page, *tmp;
+
+ pagevec_init(&freed_pvec, 1);
+
+ list_for_each_entry_safe(page, tmp, free_pages, lru) {
+ list_del(&page->lru);
+ if (!pagevec_add(&freed_pvec, page)) {
+ __pagevec_free(&freed_pvec);
+ pagevec_reinit(&freed_pvec);
+ }
+ }
+
+ pagevec_free(&freed_pvec);
+}
+
/*
* shrink_page_list() returns the number of reclaimed pages
*/
@@ -625,13 +648,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
enum pageout_io sync_writeback)
{
LIST_HEAD(ret_pages);
- struct pagevec freed_pvec;
+ LIST_HEAD(free_pages);
int pgactivate = 0;
unsigned long nr_reclaimed = 0;

cond_resched();

- pagevec_init(&freed_pvec, 1);
while (!list_empty(page_list)) {
enum page_references references;
struct address_space *mapping;
@@ -806,10 +828,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
__clear_page_locked(page);
free_it:
nr_reclaimed++;
- if (!pagevec_add(&freed_pvec, page)) {
- __pagevec_free(&freed_pvec);
- pagevec_reinit(&freed_pvec);
- }
+
+ /*
+ * Is there need to periodically free_page_list? It would
+ * appear not as the counts should be low
+ */
+ list_add(&page->lru, &free_pages);
continue;

cull_mlocked:
@@ -832,9 +856,10 @@ keep:
list_add(&page->lru, &ret_pages);
VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
}
+
+ free_page_list(&free_pages);
+
list_splice(&ret_pages, page_list);
- if (pagevec_count(&freed_pvec))
- __pagevec_free(&freed_pvec);
count_vm_events(PGACTIVATE, pgactivate);
return nr_reclaimed;
}
@@ -916,6 +941,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
unsigned long *scanned, int order, int mode, int file)
{
unsigned long nr_taken = 0;
+ unsigned long nr_lumpy_taken = 0;
+ unsigned long nr_lumpy_dirty = 0;
+ unsigned long nr_lumpy_failed = 0;
unsigned long scan;

for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
@@ -993,12 +1021,25 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
list_move(&cursor_page->lru, dst);
mem_cgroup_del_lru(cursor_page);
nr_taken++;
+ nr_lumpy_taken++;
+ if (PageDirty(cursor_page))
+ nr_lumpy_dirty++;
scan++;
+ } else {
+ if (mode == ISOLATE_BOTH &&
+ page_count(cursor_page))
+ nr_lumpy_failed++;
}
}
}

*scanned = scan;
+
+ trace_mm_vmscan_lru_isolate(order,
+ nr_to_scan, scan,
+ nr_taken,
+ nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
+ mode);
return nr_taken;
}

@@ -1035,7 +1076,8 @@ static unsigned long clear_active_flags(struct list_head *page_list,
ClearPageActive(page);
nr_active++;
}
- count[lru]++;
+ if (count)
+ count[lru]++;
}

return nr_active;
@@ -1112,174 +1154,177 @@ static int too_many_isolated(struct zone *zone, int file,
}

/*
- * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
- * of reclaimed pages
+ * TODO: Try merging with migrations version of putback_lru_pages
*/
-static unsigned long shrink_inactive_list(unsigned long max_scan,
- struct zone *zone, struct scan_control *sc,
- int priority, int file)
+static noinline_for_stack void
+putback_lru_pages(struct zone *zone, struct scan_control *sc,
+ unsigned long nr_anon, unsigned long nr_file,
+ struct list_head *page_list)
{
- LIST_HEAD(page_list);
+ struct page *page;
struct pagevec pvec;
- unsigned long nr_scanned = 0;
- unsigned long nr_reclaimed = 0;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);

- while (unlikely(too_many_isolated(zone, file, sc))) {
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ pagevec_init(&pvec, 1);

- /* We are about to die and free our memory. Return now. */
- if (fatal_signal_pending(current))
- return SWAP_CLUSTER_MAX;
+ /*
+ * Put back any unfreeable pages.
+ */
+ spin_lock(&zone->lru_lock);
+ while (!list_empty(page_list)) {
+ int lru;
+ page = lru_to_page(page_list);
+ VM_BUG_ON(PageLRU(page));
+ list_del(&page->lru);
+ if (unlikely(!page_evictable(page, NULL))) {
+ spin_unlock_irq(&zone->lru_lock);
+ putback_lru_page(page);
+ spin_lock_irq(&zone->lru_lock);
+ continue;
+ }
+ SetPageLRU(page);
+ lru = page_lru(page);
+ add_page_to_lru_list(zone, page, lru);
+ if (is_active_lru(lru)) {
+ int file = is_file_lru(lru);
+ reclaim_stat->recent_rotated[file]++;
+ }
+ if (!pagevec_add(&pvec, page)) {
+ spin_unlock_irq(&zone->lru_lock);
+ __pagevec_release(&pvec);
+ spin_lock_irq(&zone->lru_lock);
+ }
}
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);

+ spin_unlock_irq(&zone->lru_lock);
+ pagevec_release(&pvec);
+}

- pagevec_init(&pvec, 1);
+static noinline_for_stack void update_isolated_counts(struct zone *zone,
+ struct scan_control *sc,
+ unsigned long *nr_anon,
+ unsigned long *nr_file,
+ struct list_head *isolated_list)
+{
+ unsigned long nr_active;
+ unsigned int count[NR_LRU_LISTS] = { 0, };
+ struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);

- lru_add_drain();
- spin_lock_irq(&zone->lru_lock);
- do {
- struct page *page;
- unsigned long nr_taken;
- unsigned long nr_scan;
- unsigned long nr_freed;
- unsigned long nr_active;
- unsigned int count[NR_LRU_LISTS] = { 0, };
- int mode = sc->lumpy_reclaim_mode ? ISOLATE_BOTH : ISOLATE_INACTIVE;
- unsigned long nr_anon;
- unsigned long nr_file;
+ nr_active = clear_active_flags(isolated_list, count);
+ __count_vm_events(PGDEACTIVATE, nr_active);

- if (scanning_global_lru(sc)) {
- nr_taken = isolate_pages_global(SWAP_CLUSTER_MAX,
- &page_list, &nr_scan,
- sc->order, mode,
- zone, 0, file);
- zone->pages_scanned += nr_scan;
- if (current_is_kswapd())
- __count_zone_vm_events(PGSCAN_KSWAPD, zone,
- nr_scan);
- else
- __count_zone_vm_events(PGSCAN_DIRECT, zone,
- nr_scan);
- } else {
- nr_taken = mem_cgroup_isolate_pages(SWAP_CLUSTER_MAX,
- &page_list, &nr_scan,
- sc->order, mode,
- zone, sc->mem_cgroup,
- 0, file);
- /*
- * mem_cgroup_isolate_pages() keeps track of
- * scanned pages on its own.
- */
- }
+ __mod_zone_page_state(zone, NR_ACTIVE_FILE,
+ -count[LRU_ACTIVE_FILE]);
+ __mod_zone_page_state(zone, NR_INACTIVE_FILE,
+ -count[LRU_INACTIVE_FILE]);
+ __mod_zone_page_state(zone, NR_ACTIVE_ANON,
+ -count[LRU_ACTIVE_ANON]);
+ __mod_zone_page_state(zone, NR_INACTIVE_ANON,
+ -count[LRU_INACTIVE_ANON]);

- if (nr_taken == 0)
- goto done;
+ *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
+ *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);

- nr_active = clear_active_flags(&page_list, count);
- __count_vm_events(PGDEACTIVATE, nr_active);
+ reclaim_stat->recent_scanned[0] += *nr_anon;
+ reclaim_stat->recent_scanned[1] += *nr_file;
+}

- __mod_zone_page_state(zone, NR_ACTIVE_FILE,
- -count[LRU_ACTIVE_FILE]);
- __mod_zone_page_state(zone, NR_INACTIVE_FILE,
- -count[LRU_INACTIVE_FILE]);
- __mod_zone_page_state(zone, NR_ACTIVE_ANON,
- -count[LRU_ACTIVE_ANON]);
- __mod_zone_page_state(zone, NR_INACTIVE_ANON,
- -count[LRU_INACTIVE_ANON]);
+/*
+ * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
+ * of reclaimed pages
+ */
+static noinline_for_stack unsigned long
+shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
+ struct scan_control *sc, int priority, int file)
+{
+ LIST_HEAD(page_list);
+ unsigned long nr_scanned;
+ unsigned long nr_reclaimed = 0;
+ unsigned long nr_taken;
+ unsigned long nr_active;
+ unsigned long nr_anon;
+ unsigned long nr_file;

- nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
- nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
- __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
+ while (unlikely(too_many_isolated(zone, file, sc))) {
+ congestion_wait(BLK_RW_ASYNC, HZ/10);

- reclaim_stat->recent_scanned[0] += nr_anon;
- reclaim_stat->recent_scanned[1] += nr_file;
+ /* We are about to die and free our memory. Return now. */
+ if (fatal_signal_pending(current))
+ return SWAP_CLUSTER_MAX;
+ }

- spin_unlock_irq(&zone->lru_lock);

- nr_scanned += nr_scan;
- nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+ lru_add_drain();
+ spin_lock_irq(&zone->lru_lock);

+ if (scanning_global_lru(sc)) {
+ nr_taken = isolate_pages_global(nr_to_scan,
+ &page_list, &nr_scanned, sc->order,
+ sc->lumpy_reclaim_mode ?
+ ISOLATE_BOTH : ISOLATE_INACTIVE,
+ zone, 0, file);
+ zone->pages_scanned += nr_scanned;
+ if (current_is_kswapd())
+ __count_zone_vm_events(PGSCAN_KSWAPD, zone,
+ nr_scanned);
+ else
+ __count_zone_vm_events(PGSCAN_DIRECT, zone,
+ nr_scanned);
+ } else {
+ nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
+ &page_list, &nr_scanned, sc->order,
+ sc->lumpy_reclaim_mode ?
+ ISOLATE_BOTH : ISOLATE_INACTIVE,
+ zone, sc->mem_cgroup,
+ 0, file);
/*
- * If we are direct reclaiming for contiguous pages and we do
- * not reclaim everything in the list, try again and wait
- * for IO to complete. This will stall high-order allocations
- * but that should be acceptable to the caller
+ * mem_cgroup_isolate_pages() keeps track of
+ * scanned pages on its own.
*/
- if (nr_freed < nr_taken && !current_is_kswapd() &&
- sc->lumpy_reclaim_mode) {
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ }

- /*
- * The attempt at page out may have made some
- * of the pages active, mark them inactive again.
- */
- nr_active = clear_active_flags(&page_list, count);
- count_vm_events(PGDEACTIVATE, nr_active);
+ if (nr_taken == 0) {
+ spin_unlock_irq(&zone->lru_lock);
+ return 0;
+ }

- nr_freed += shrink_page_list(&page_list, sc,
- PAGEOUT_IO_SYNC);
- }
+ update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);

- nr_reclaimed += nr_freed;
+ spin_unlock_irq(&zone->lru_lock);

- local_irq_disable();
- if (current_is_kswapd())
- __count_vm_events(KSWAPD_STEAL, nr_freed);
- __count_zone_vm_events(PGSTEAL, zone, nr_freed);
+ nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+
+ /*
+ * If we are direct reclaiming for contiguous pages and we do
+ * not reclaim everything in the list, try again and wait
+ * for IO to complete. This will stall high-order allocations
+ * but that should be acceptable to the caller
+ */
+ if (nr_reclaimed < nr_taken && !current_is_kswapd() &&
+ sc->lumpy_reclaim_mode) {
+ congestion_wait(BLK_RW_ASYNC, HZ/10);

- spin_lock(&zone->lru_lock);
/*
- * Put back any unfreeable pages.
+ * The attempt at page out may have made some
+ * of the pages active, mark them inactive again.
*/
- while (!list_empty(&page_list)) {
- int lru;
- page = lru_to_page(&page_list);
- VM_BUG_ON(PageLRU(page));
- list_del(&page->lru);
- if (unlikely(!page_evictable(page, NULL))) {
- spin_unlock_irq(&zone->lru_lock);
- putback_lru_page(page);
- spin_lock_irq(&zone->lru_lock);
- continue;
- }
- SetPageLRU(page);
- lru = page_lru(page);
- add_page_to_lru_list(zone, page, lru);
- if (is_active_lru(lru)) {
- int file = is_file_lru(lru);
- reclaim_stat->recent_rotated[file]++;
- }
- if (!pagevec_add(&pvec, page)) {
- spin_unlock_irq(&zone->lru_lock);
- __pagevec_release(&pvec);
- spin_lock_irq(&zone->lru_lock);
- }
- }
- __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
+ nr_active = clear_active_flags(&page_list, NULL);
+ count_vm_events(PGDEACTIVATE, nr_active);

- } while (nr_scanned < max_scan);
+ nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC);
+ }

-done:
- spin_unlock_irq(&zone->lru_lock);
- pagevec_release(&pvec);
- return nr_reclaimed;
-}
+ local_irq_disable();
+ if (current_is_kswapd())
+ __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
+ __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);

-/*
- * We are about to scan this zone at a certain priority level. If that priority
- * level is smaller (ie: more urgent) than the previous priority, then note
- * that priority level within the zone. This is done so that when the next
- * process comes in to scan this zone, it will immediately start out at this
- * priority level rather than having to build up its own scanning priority.
- * Here, this priority affects only the reclaim-mapped threshold.
- */
-static inline void note_zone_scanning_priority(struct zone *zone, int priority)
-{
- if (priority < zone->prev_priority)
- zone->prev_priority = priority;
+ putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
+ return nr_reclaimed;
}

/*
@@ -1729,13 +1774,12 @@ static void shrink_zone(int priority, struct zone *zone,
static bool shrink_zones(int priority, struct zonelist *zonelist,
struct scan_control *sc)
{
- enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
struct zoneref *z;
struct zone *zone;
bool all_unreclaimable = true;

- for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
- sc->nodemask) {
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask), sc->nodemask) {
if (!populated_zone(zone))
continue;
/*
@@ -1745,17 +1789,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
if (scanning_global_lru(sc)) {
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
- note_zone_scanning_priority(zone, priority);
-
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
- } else {
- /*
- * Ignore cpuset limitation here. We just want to reduce
- * # of used pages by us regardless of memory shortage.
- */
- mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
- priority);
}

shrink_zone(priority, zone, sc);
@@ -1787,10 +1822,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
bool all_unreclaimable;
unsigned long total_scanned = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
- unsigned long lru_pages = 0;
struct zoneref *z;
struct zone *zone;
- enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
unsigned long writeback_threshold;

get_mems_allowed();
@@ -1798,18 +1831,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,

if (scanning_global_lru(sc))
count_vm_event(ALLOCSTALL);
- /*
- * mem_cgroup will not do shrink_slab.
- */
- if (scanning_global_lru(sc)) {
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
- continue;
-
- lru_pages += zone_reclaimable_pages(zone);
- }
- }

for (priority = DEF_PRIORITY; priority >= 0; priority--) {
sc->nr_scanned = 0;
@@ -1821,6 +1842,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
* over limit cgroups
*/
if (scanning_global_lru(sc)) {
+ unsigned long lru_pages = 0;
+ for_each_zone_zonelist(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask)) {
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ continue;
+
+ lru_pages += zone_reclaimable_pages(zone);
+ }
+
shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -1861,17 +1891,6 @@ out:
if (priority < 0)
priority = 0;

- if (scanning_global_lru(sc)) {
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
- continue;
-
- zone->prev_priority = priority;
- }
- } else
- mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
-
delayacct_freepages_end();
put_mems_allowed();

@@ -1888,6 +1907,7 @@ out:
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *nodemask)
{
+ unsigned long nr_reclaimed;
struct scan_control sc = {
.gfp_mask = gfp_mask,
.may_writepage = !laptop_mode,
@@ -1900,7 +1920,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.nodemask = nodemask,
};

- return do_try_to_free_pages(zonelist, &sc);
+ trace_mm_vmscan_direct_reclaim_begin(order,
+ sc.may_writepage,
+ gfp_mask);
+
+ nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
+ trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
+
+ return nr_reclaimed;
}

#ifdef CONFIG_CGROUP_MEM_RES_CTLR
@@ -2028,22 +2056,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
.order = order,
.mem_cgroup = NULL,
};
- /*
- * temp_priority is used to remember the scanning priority at which
- * this zone was successfully refilled to
- * free_pages == high_wmark_pages(zone).
- */
- int temp_priority[MAX_NR_ZONES];
-
loop_again:
total_scanned = 0;
sc.nr_reclaimed = 0;
sc.may_writepage = !laptop_mode;
count_vm_event(PAGEOUTRUN);

- for (i = 0; i < pgdat->nr_zones; i++)
- temp_priority[i] = DEF_PRIORITY;
-
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long lru_pages = 0;
@@ -2111,9 +2129,7 @@ loop_again:
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue;

- temp_priority[i] = priority;
sc.nr_scanned = 0;
- note_zone_scanning_priority(zone, priority);

nid = pgdat->node_id;
zid = zone_idx(zone);
@@ -2186,16 +2202,6 @@ loop_again:
break;
}
out:
- /*
- * Note within each zone the priority level at which this zone was
- * brought into a happy state. So that the next thread which scans this
- * zone will start out at that priority level.
- */
- for (i = 0; i < pgdat->nr_zones; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- zone->prev_priority = temp_priority[i];
- }
if (!all_zones_ok) {
cond_resched();

@@ -2299,9 +2305,10 @@ static int kswapd(void *p)
* premature sleep. If not, then go fully
* to sleep until explicitly woken up
*/
- if (!sleeping_prematurely(pgdat, order, remaining))
+ if (!sleeping_prematurely(pgdat, order, remaining)) {
+ trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
schedule();
- else {
+ } else {
if (remaining)
count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
else
@@ -2321,8 +2328,10 @@ static int kswapd(void *p)
* We can speed up thawing tasks if we don't call balance_pgdat
* after returning from the refrigerator
*/
- if (!ret)
+ if (!ret) {
+ trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
balance_pgdat(pgdat, order);
+ }
}
return 0;
}
@@ -2342,6 +2351,7 @@ void wakeup_kswapd(struct zone *zone, int order)
return;
if (pgdat->kswapd_max_order < order)
pgdat->kswapd_max_order = order;
+ trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
return;
if (!waitqueue_active(&pgdat->kswapd_wait))
@@ -2611,7 +2621,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
*/
priority = ZONE_RECLAIM_PRIORITY;
do {
- note_zone_scanning_priority(zone, priority);
shrink_zone(priority, zone, &sc);
priority--;
} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7759941..5c0b1b6 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -853,11 +853,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
}
seq_printf(m,
"\n all_unreclaimable: %u"
- "\n prev_priority: %i"
"\n start_pfn: %lu"
"\n inactive_ratio: %u",
zone->all_unreclaimable,
- zone->prev_priority,
zone->zone_start_pfn,
zone->inactive_ratio);
seq_putc(m, '\n');
--
1.7.1

2010-07-30 14:04:40

by Frederic Weisbecker

[permalink] [raw]

Subject: Re: [PATCH 1/6] vmscan: tracing: Roll up of patches currently in mmotm

On Fri, Jul 30, 2010 at 02:36:55PM +0100, Mel Gorman wrote:
> This is a roll-up of patches currently in mmotm related to stack reduction and
> tracing reclaim. It is based on 2.6.35-rc6 and included for the convenience
> of testing.
>
> No signed off required.
> ---
> .../trace/postprocess/trace-vmscan-postprocess.pl | 654 ++++++++++++++++++++

I have the feeling you've made an ad-hoc post processing script that seems
to rewrite all the format parsing, debugfs, stream handling, etc... we
have that in perf tools already.

May be you weren't aware of what we have in perf in terms of scripting support.

First, launch perf list and spot the events you're interested in, let's
say you're interested in irqs:

$ perf list
[...]
irq:irq_handler_entry [Tracepoint event]
irq:irq_handler_exit [Tracepoint event]
irq:softirq_entry [Tracepoint event]
irq:softirq_exit [Tracepoint event]
[...]

Now do a trace record:

# perf record -e irq:irq_handler_entry -e irq:irq_handler_exit -e irq:softirq_entry -e irq:softirq_exit cmd

or more simple:

# perf record -e irq:* cmd

You can use -a instead of cmd for wide tracing.

Now generate a perf parsing script on top of these traces:

# perf trace -g perl
generated Perl script: perf-trace.pl

Fill up the trace handlers inside perf-trace.pl and just run it:

# perf trace -s perf-trace.pl

Once ready, you can place your script in the script directory.

2010-07-30 14:12:35

[permalink] [raw]

Subject: Re: [PATCH 1/6] vmscan: tracing: Roll up of patches currently in mmotm

On Fri, Jul 30, 2010 at 04:04:42PM +0200, Frederic Weisbecker wrote:
> On Fri, Jul 30, 2010 at 02:36:55PM +0100, Mel Gorman wrote:
> > This is a roll-up of patches currently in mmotm related to stack reduction and
> > tracing reclaim. It is based on 2.6.35-rc6 and included for the convenience
> > of testing.
> >
> > No signed off required.
> > ---
> > .../trace/postprocess/trace-vmscan-postprocess.pl | 654 ++++++++++++++++++++
>
> I have the feeling you've made an ad-hoc post processing script that seems
> to rewrite all the format parsing, debugfs, stream handling, etc... we
> have that in perf tools already.
>

It's an hoc adaption of trace-pagealloc-postprocess.pl which was developed
before the perf scripting report. It's a bit klunky.

> May be you weren't aware of what we have in perf in terms of scripting support.
>

I'm aware, I just haven't gotten around to adapting what the script does
to the perf scripting support. The existance of the script I have means
people can reproduce my results without having to wait for me to rewrite
the post-processing scripts for perf.

> First, launch perf list and spot the events you're interested in, let's
> say you're interested in irqs:
>
> $ perf list
> [...]
> irq:irq_handler_entry [Tracepoint event]
> irq:irq_handler_exit [Tracepoint event]
> irq:softirq_entry [Tracepoint event]
> irq:softirq_exit [Tracepoint event]
> [...]
>
> Now do a trace record:
>
> # perf record -e irq:irq_handler_entry -e irq:irq_handler_exit -e irq:softirq_entry -e irq:softirq_exit cmd
>
> or more simple:
>
> # perf record -e irq:* cmd
>
> You can use -a instead of cmd for wide tracing.
>
> Now generate a perf parsing script on top of these traces:
>
> # perf trace -g perl
> generated Perl script: perf-trace.pl
>
> Fill up the trace handlers inside perf-trace.pl and just run it:
>
> # perf trace -s perf-trace.pl
>
> Once ready, you can place your script in the script directory.
>

Ultimately, the post-processing scripts should be adapted to perf but it
could be a while before I get around to it.

--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab

2010-07-30 14:15:48

by Frederic Weisbecker

[permalink] [raw]

Subject: Re: [PATCH 1/6] vmscan: tracing: Roll up of patches currently in mmotm

On Fri, Jul 30, 2010 at 03:12:18PM +0100, Mel Gorman wrote:
> On Fri, Jul 30, 2010 at 04:04:42PM +0200, Frederic Weisbecker wrote:
> > On Fri, Jul 30, 2010 at 02:36:55PM +0100, Mel Gorman wrote:
> > > This is a roll-up of patches currently in mmotm related to stack reduction and
> > > tracing reclaim. It is based on 2.6.35-rc6 and included for the convenience
> > > of testing.
> > >
> > > No signed off required.
> > > ---
> > > .../trace/postprocess/trace-vmscan-postprocess.pl | 654 ++++++++++++++++++++
> >
> > I have the feeling you've made an ad-hoc post processing script that seems
> > to rewrite all the format parsing, debugfs, stream handling, etc... we
> > have that in perf tools already.
> >
>
> It's an hoc adaption of trace-pagealloc-postprocess.pl which was developed
> before the perf scripting report. It's a bit klunky.
>
> > May be you weren't aware of what we have in perf in terms of scripting support.
> >
>
> I'm aware, I just haven't gotten around to adapting what the script does
> to the perf scripting support. The existance of the script I have means
> people can reproduce my results without having to wait for me to rewrite
> the post-processing scripts for perf.
>
> > First, launch perf list and spot the events you're interested in, let's
> > say you're interested in irqs:
> >
> > $ perf list
> > [...]
> > irq:irq_handler_entry [Tracepoint event]
> > irq:irq_handler_exit [Tracepoint event]
> > irq:softirq_entry [Tracepoint event]
> > irq:softirq_exit [Tracepoint event]
> > [...]
> >
> > Now do a trace record:
> >
> > # perf record -e irq:irq_handler_entry -e irq:irq_handler_exit -e irq:softirq_entry -e irq:softirq_exit cmd
> >
> > or more simple:
> >
> > # perf record -e irq:* cmd
> >
> > You can use -a instead of cmd for wide tracing.
> >
> > Now generate a perf parsing script on top of these traces:
> >
> > # perf trace -g perl
> > generated Perl script: perf-trace.pl
> >
> > Fill up the trace handlers inside perf-trace.pl and just run it:
> >
> > # perf trace -s perf-trace.pl
> >
> > Once ready, you can place your script in the script directory.
> >
>
> Ultimately, the post-processing scripts should be adapted to perf but it
> could be a while before I get around to it.

Ok, I thought it was a brand new thing. No problem then.

2010-07-30 22:16:21

by Andrew Morton

[permalink] [raw]

Subject: Re: [PATCH 6/6] vmscan: Kick flusher threads to clean pages when reclaim is encountering dirty pages

On Fri, 30 Jul 2010 14:37:00 +0100
Mel Gorman <[email protected]> wrote:

> There are a number of cases where pages get cleaned but two of concern
> to this patch are;
> o When dirtying pages, processes may be throttled to clean pages if
> dirty_ratio is not met.

Ambiguous. I assume you meant "if dirty_ratio is exceeded".

> o Pages belonging to inodes dirtied longer than
> dirty_writeback_centisecs get cleaned.
>
> The problem for reclaim is that dirty pages can reach the end of the LRU if
> pages are being dirtied slowly so that neither the throttling or a flusher
> thread waking periodically cleans them.
>
> Background flush is already cleaning old or expired inodes first but the
> expire time is too far in the future at the time of page reclaim. To mitigate
> future problems, this patch wakes flusher threads to clean 4M of data -
> an amount that should be manageable without causing congestion in many cases.
>
> Ideally, the background flushers would only be cleaning pages belonging
> to the zone being scanned but it's not clear if this would be of benefit
> (less IO) or not (potentially less efficient IO if an inode is scattered
> across multiple zones).
>

Sigh. We have sooo many problems with writeback and latency. Read
https://bugzilla.kernel.org/show_bug.cgi?id=12309 and weep. Everyone's
running away from the issue and here we are adding code to solve some
alleged stack-overflow problem which seems to be largely a non-problem,
by making changes which may worsen our real problems.

direct-reclaim wants to write a dirty page because that page is in the
zone which the caller wants to allcoate from! Telling the flusher
threads to perform generic writeback will sometimes cause them to just
gum the disk up with pages from different zones, making it even
harder/slower to allocate a page from the zones we're interested in,
no?

If/when that happens, the problem will be rare, subtle, will take a
long time to get reported and will take years to understand and fix and
will probably be reported in the monster bug report which everyone's
hiding from anyway.

2010-07-30 22:41:28

by Trond Myklebust

[permalink] [raw]

Subject: Re: [PATCH 6/6] vmscan: Kick flusher threads to clean pages when reclaim is encountering dirty pages

On Fri, 2010-07-30 at 15:06 -0700, Andrew Morton wrote:
> On Fri, 30 Jul 2010 14:37:00 +0100
> Mel Gorman <[email protected]> wrote:
>
> > There are a number of cases where pages get cleaned but two of concern
> > to this patch are;
> > o When dirtying pages, processes may be throttled to clean pages if
> > dirty_ratio is not met.
>
> Ambiguous. I assume you meant "if dirty_ratio is exceeded".
>
> > o Pages belonging to inodes dirtied longer than
> > dirty_writeback_centisecs get cleaned.
> >
> > The problem for reclaim is that dirty pages can reach the end of the LRU if
> > pages are being dirtied slowly so that neither the throttling or a flusher
> > thread waking periodically cleans them.
> >
> > Background flush is already cleaning old or expired inodes first but the
> > expire time is too far in the future at the time of page reclaim. To mitigate
> > future problems, this patch wakes flusher threads to clean 4M of data -
> > an amount that should be manageable without causing congestion in many cases.
> >
> > Ideally, the background flushers would only be cleaning pages belonging
> > to the zone being scanned but it's not clear if this would be of benefit
> > (less IO) or not (potentially less efficient IO if an inode is scattered
> > across multiple zones).
> >
>
> Sigh. We have sooo many problems with writeback and latency. Read
> https://bugzilla.kernel.org/show_bug.cgi?id=12309 and weep. Everyone's
> running away from the issue and here we are adding code to solve some
> alleged stack-overflow problem which seems to be largely a non-problem,
> by making changes which may worsen our real problems.
>
> direct-reclaim wants to write a dirty page because that page is in the
> zone which the caller wants to allcoate from! Telling the flusher
> threads to perform generic writeback will sometimes cause them to just
> gum the disk up with pages from different zones, making it even
> harder/slower to allocate a page from the zones we're interested in,
> no?
>
> If/when that happens, the problem will be rare, subtle, will take a
> long time to get reported and will take years to understand and fix and
> will probably be reported in the monster bug report which everyone's
> hiding from anyway.

There is that, and then there are issues with the VM simply lying to the
filesystems.

See https://bugzilla.kernel.org/show_bug.cgi?id=16056

Which basically boils down to the following: kswapd tells the filesystem
that it is quite safe to do GFP_KERNEL allocations in pageouts and as
part of try_to_release_page().

In the case of pageouts, it does set the 'WB_SYNC_NONE', 'nonblocking'
and 'for_reclaim' flags in the writeback_control struct, and so the
filesystem has at least some hint that it should do non-blocking i/o.

However if you trust the GFP_KERNEL flag in try_to_release_page() then
the kernel can and will deadlock, and so I had to add in a hack
specifically to tell the NFS client not to trust that flag if it comes
from kswapd.

Trond

2010-07-31 10:33:42

[permalink] [raw]

Subject: Re: [PATCH 6/6] vmscan: Kick flusher threads to clean pages when reclaim is encountering dirty pages

On Fri, Jul 30, 2010 at 03:06:01PM -0700, Andrew Morton wrote:
> On Fri, 30 Jul 2010 14:37:00 +0100
> Mel Gorman <[email protected]> wrote:
>
> > There are a number of cases where pages get cleaned but two of concern
> > to this patch are;
> > o When dirtying pages, processes may be throttled to clean pages if
> > dirty_ratio is not met.
>
> Ambiguous. I assume you meant "if dirty_ratio is exceeded".
>

Yes.

> > o Pages belonging to inodes dirtied longer than
> > dirty_writeback_centisecs get cleaned.
> >
> > The problem for reclaim is that dirty pages can reach the end of the LRU if
> > pages are being dirtied slowly so that neither the throttling or a flusher
> > thread waking periodically cleans them.
> >
> > Background flush is already cleaning old or expired inodes first but the
> > expire time is too far in the future at the time of page reclaim. To mitigate
> > future problems, this patch wakes flusher threads to clean 4M of data -
> > an amount that should be manageable without causing congestion in many cases.
> >
> > Ideally, the background flushers would only be cleaning pages belonging
> > to the zone being scanned but it's not clear if this would be of benefit
> > (less IO) or not (potentially less efficient IO if an inode is scattered
> > across multiple zones).
> >
>
> Sigh. We have sooo many problems with writeback and latency. Read
> https://bugzilla.kernel.org/show_bug.cgi?id=12309 and weep.

You aren't joking.

> Everyone's
> running away from the issue and here we are adding code to solve some
> alleged stack-overflow problem which seems to be largely a non-problem,
> by making changes which may worsen our real problems.
>

As it is, filesystems are beginnning to ignore writeback from direct
reclaim - such as xfs and btrfs. I'm lead to believe that ext3
effectively ignores writeback from direct reclaim although I don't have
access to code at the moment to double check (am on the road). So either
way, we are going to be facing this problem so the VM might as well be
aware of it :/

> direct-reclaim wants to write a dirty page because that page is in the
> zone which the caller wants to allcoate from! Telling the flusher
> threads to perform generic writeback will sometimes cause them to just
> gum the disk up with pages from different zones, making it even
> harder/slower to allocate a page from the zones we're interested in,
> no?
>

It's a possibility, but it can happen anyway if the filesystem is ignoring
writeback requests from direct reclaim. I considered passing in the zone to
flusher threads to clean nr_pages from a given zone but then worried about
getting caught by the "poor IO pattern" people and what happened if two
zones needed cleaning with a single inodes pages in both.

> If/when that happens, the problem will be rare, subtle, will take a
> long time to get reported and will take years to understand and fix and
> will probably be reported in the monster bug report which everyone's
> hiding from anyway.
>

With the second patch reducing the number of dirty pages encountered by page
reclaim, I'm hoping there will be some impact on latency. I'll be back online
properly Tuesday and will try reproducing some of the problems in that bug
and see can I spot an underlying cause of some sort.

Thanks

--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab

2010-08-01 08:20:08

[permalink] [raw]

Subject: Re: [PATCH 6/6] vmscan: Kick flusher threads to clean pages when reclaim is encountering dirty pages

Hi Trond,

> There is that, and then there are issues with the VM simply lying to the
> filesystems.
>
> See https://bugzilla.kernel.org/show_bug.cgi?id=16056
>
> Which basically boils down to the following: kswapd tells the filesystem
> that it is quite safe to do GFP_KERNEL allocations in pageouts and as
> part of try_to_release_page().
>
> In the case of pageouts, it does set the 'WB_SYNC_NONE', 'nonblocking'
> and 'for_reclaim' flags in the writeback_control struct, and so the
> filesystem has at least some hint that it should do non-blocking i/o.
>
> However if you trust the GFP_KERNEL flag in try_to_release_page() then
> the kernel can and will deadlock, and so I had to add in a hack
> specifically to tell the NFS client not to trust that flag if it comes
> from kswapd.

Can you please elaborate your issue more? vmscan logic is, briefly, below

if (PageDirty(page))
pageout(page)
if (page_has_private(page)) {
try_to_release_page(page, sc->gfp_mask))

So, I'm interest why nfs need to writeback at ->release_page again even
though pageout() call ->writepage and it was successfull.

In other word, an argument gfp_mask of try_to_release_page() is suspected
to pass kmalloc()/alloc_page() familiy. and page allocator have already care
PF_MEMALLOC flag.

So, My question is, What do you want additional work to VM folks?
Can you please share nfs design and what we should?

btw, Another question, Recently, Xiaotian Feng posted "swap over nfs -v21"
patch series. they have new reservation memory framework. Is this help you?

2010-08-01 11:16:17

[permalink] [raw]

Subject: Re: [PATCH 6/6] vmscan: Kick flusher threads to clean pages when reclaim is encountering dirty pages

> Sigh. We have sooo many problems with writeback and latency. Read
> https://bugzilla.kernel.org/show_bug.cgi?id=12309 and weep. Everyone's
> running away from the issue and here we are adding code to solve some
> alleged stack-overflow problem which seems to be largely a non-problem,
> by making changes which may worsen our real problems.

This looks like some vmscan/writeback interaction issue.

Firstly, the CFQ io scheduler can already prevent read IO from being
delayed by lots of ASYNC write IO. See the commits 365722bb/8e2967555
in late 2009.

Reading a big file in an idle system:
680897928 bytes (681 MB) copied, 15.8986 s, 42.8 MB/s

Reading a big file while doing sequential writes to another file:
680897928 bytes (681 MB) copied, 27.6007 s, 24.7 MB/s
680897928 bytes (681 MB) copied, 25.6592 s, 26.5 MB/s

So CFQ offers reasonable read performance under heavy writeback.

Secondly, I can only feel the responsiveness lags when there are
memory pressures _in addition to_ heavy writeback.

cp /dev/zero /tmp

No lags.

usemem 1g --sleep 1000

Still no lags.

usemem 1g --sleep 1000

Still no lags.

usemem 1g --sleep 1000

Begin to feel lags at times. My desktop has 4G memory and no swap
space. So the lags are correlated with page reclaim pressure.

The above symptoms are matched very well by the patches posted by
KOSAKI and me:

- vmscan: raise the bar to PAGEOUT_IO_SYNC stalls
- vmscan: synchronous lumpy reclaim don't call congestion_wait()

However kernels as early as 2.6.18 are reported to have the problem,
so there may be more hidden issues.

Thanks,
Fengguang

2010-08-01 11:56:58

[permalink] [raw]

Subject: Re: [PATCH 6/6] vmscan: Kick flusher threads to clean pages when reclaim is encountering dirty pages

> Sigh. We have sooo many problems with writeback and latency. Read
> https://bugzilla.kernel.org/show_bug.cgi?id=12309 and weep. Everyone's
> running away from the issue and here we are adding code to solve some
> alleged stack-overflow problem which seems to be largely a non-problem,
> by making changes which may worsen our real problems.

I'm sweeping bug 12309. Most people reports some data writes, though
relative few explicitly stated memory pressure is another necessary
condition.

One interesting report is #3. Thomas reported the same slowdown
_without_ any IO. He was able to narrow down the bug to somewhere
between 2.6.20.21 and 2.6.22.19. I searched through the git and found
a congestion_wait() in commit 232ea4d69d (throttle_vm_writeout():
don't loop on GFP_NOFS and GFP_NOIO allocations) which was later
removed by commit 369f2389e7 (writeback: remove unnecessary wait in
throttle_vm_writeout()).

How can the congestion_wait(HZ/10) be a problem? Because it
unconditionally enters wait loop. So if no IO is underway, it
virtually becomes a schedule_timeout(HZ/10) because there are
no IO completion events to wake it up.

Thanks,
Fengguang

2010-08-01 13:03:36

[permalink] [raw]

Subject: Re: [PATCH 6/6] vmscan: Kick flusher threads to clean pages when reclaim is encountering dirty pages

On Sun, Aug 01, 2010 at 07:56:40PM +0800, Wu Fengguang wrote:
> > Sigh. We have sooo many problems with writeback and latency. Read
> > https://bugzilla.kernel.org/show_bug.cgi?id=12309 and weep. Everyone's
> > running away from the issue and here we are adding code to solve some
> > alleged stack-overflow problem which seems to be largely a non-problem,
> > by making changes which may worsen our real problems.
>
> I'm sweeping bug 12309. Most people reports some data writes, though
> relative few explicitly stated memory pressure is another necessary
> condition.

#14: Per von Zweigbergk
Ubuntu 2.6.27 slowdown when copying 25MB/s USB stick to 10 MB/s SSD.

KOSAKI and my patches won't fix 2.6.27, since it only do
congestion_wait() and wait_on_page_writeback() for order>3
allocations. There may be more bugs there.

#24: Per von Zweigbergk
The encryption of the SSD very significantly increases the problem.

This is expected. Data encryption roughly doubles page consumption
speed (there may be temp buffers allocated/dropped quickly), hence
vmscan pressure.

#26: Per von Zweigbergk
Disabling swap makes the terminal launch much faster while copying;
However Firefox and vim hang much more aggressively and frequently
during copying.

It's interesting to see processes behave differently. Is this
reproducible at all?

#34: Ben Gamari
There is evidence that x86-64 is a factor here.

Because x86-64 does order-1 page allocation in fork() and consumes
more memory (larger user space code/data)?

#36: Lari Temmes
Go from usable to totally unusable when switching from
a SMP kernel to a UP kernel on a single CPU laptop

He should be testing 2.6.28. I'm not aware of known bugs there.

#47: xyke
Renicing pdflush -10 had some great improvement on basic
responsiveness.

It sure helps :)

Too much (old) messages there. I'm hoping some of the still active
bug reporters to test the following patches (they are for the -mmotm
tree, need to unindent code for Linus's tree) and see if there are
any improvements.

http://lkml.org/lkml/2010/8/1/40
http://lkml.org/lkml/2010/8/1/45

Thanks,
Fengguang

2010-08-01 16:21:52

by Trond Myklebust

[permalink] [raw]

Subject: Re: [PATCH 6/6] vmscan: Kick flusher threads to clean pages when reclaim is encountering dirty pages

On Sun, 2010-08-01 at 17:19 +0900, KOSAKI Motohiro wrote:
> Hi Trond,
>
> > There is that, and then there are issues with the VM simply lying to the
> > filesystems.
> >
> > See https://bugzilla.kernel.org/show_bug.cgi?id=16056
> >
> > Which basically boils down to the following: kswapd tells the filesystem
> > that it is quite safe to do GFP_KERNEL allocations in pageouts and as
> > part of try_to_release_page().
> >
> > In the case of pageouts, it does set the 'WB_SYNC_NONE', 'nonblocking'
> > and 'for_reclaim' flags in the writeback_control struct, and so the
> > filesystem has at least some hint that it should do non-blocking i/o.
> >
> > However if you trust the GFP_KERNEL flag in try_to_release_page() then
> > the kernel can and will deadlock, and so I had to add in a hack
> > specifically to tell the NFS client not to trust that flag if it comes
> > from kswapd.
>
> Can you please elaborate your issue more? vmscan logic is, briefly, below
>
> if (PageDirty(page))
> pageout(page)
> if (page_has_private(page)) {
> try_to_release_page(page, sc->gfp_mask))
>
> So, I'm interest why nfs need to writeback at ->release_page again even
> though pageout() call ->writepage and it was successfull.
>
> In other word, an argument gfp_mask of try_to_release_page() is suspected
> to pass kmalloc()/alloc_page() familiy. and page allocator have already care
> PF_MEMALLOC flag.
>
> So, My question is, What do you want additional work to VM folks?
> Can you please share nfs design and what we should?
>
>
> btw, Another question, Recently, Xiaotian Feng posted "swap over nfs -v21"
> patch series. they have new reservation memory framework. Is this help you?

The problem that I am seeing is that the try_to_release_page() needs to
be told to act as a non-blocking call when the process is kswapd, just
like the pageout() call.

Currently, the sc->gfp_mask is set to GFP_KERNEL, which normally means
that the call may wait on I/O to complete. However, what I'm seeing in
the bugzilla above is that if kswapd waits on an RPC call, then the
whole VM may gum up: typically, the traces show that the socket layer
cannot allocate memory to hold the RPC reply from the server, and so it
is kicking kswapd to have it reclaim some pages, however kswapd is stuck
in try_to_release_page() waiting for that same I/O to complete, hence
the deadlock...

IOW: I think kswapd at least should be calling try_to_release_page()
with a gfp-flag of '0' to avoid deadlocking on I/O.

Cheers
Trond

2010-08-02 02:30:17

[permalink] [raw]

Subject: Re: [PATCH 6/6] vmscan: Kick flusher threads to clean pages when reclaim is encountering dirty pages

Hi Sean,

On Mon, Aug 02, 2010 at 10:17:27AM +0800, Sean Jensen-Grey wrote:
> Wu,
>
> Thank you for doing this. This still bites me on a weekly basis. I don't have much time to test the patches this week, but I should get access to an identical box week after next.

That's OK.

> BTW, I experience the issues even with 8-10GB of free ram. I have 12GB currently.

Thanks for the important information. It means the patches proposed
are not likely to help your case.

In Comment #47 for bug 12309, your kernel 2.6.27 is too old though. You may
well benefit from Jens' CFQ low latency improvements if switching to a recent
kernel.

Thanks,
Fengguang

2010-08-02 07:57:25

[permalink] [raw]

Subject: Re: [PATCH 6/6] vmscan: Kick flusher threads to clean pages when reclaim is encountering dirty pages

Hi

> The problem that I am seeing is that the try_to_release_page() needs to
> be told to act as a non-blocking call when the process is kswapd, just
> like the pageout() call.
>
> Currently, the sc->gfp_mask is set to GFP_KERNEL, which normally means
> that the call may wait on I/O to complete. However, what I'm seeing in
> the bugzilla above is that if kswapd waits on an RPC call, then the
> whole VM may gum up: typically, the traces show that the socket layer
> cannot allocate memory to hold the RPC reply from the server, and so it
> is kicking kswapd to have it reclaim some pages, however kswapd is stuck
> in try_to_release_page() waiting for that same I/O to complete, hence
> the deadlock...

Ah, I see. so as far as I understand, you mean
- Socket layer use GFP_ATOMIC, then they don't call try_to_free_pages().
IOW, kswapd is only memory reclaiming thread.
- Kswapd got stuck in ->release_page().
- In usual use case, another thread call kmalloc(GFP_KERNEL) and makes
foreground reclaim, then, restore kswapd stucking. but your case
there is no such thread.

Hm, interesting.

In short term, current nfs fix (checking PF_MEMALLOC in nfs_wb_page())
seems best way. it's no side effect if my understanding is correct.

> IOW: I think kswapd at least should be calling try_to_release_page()
> with a gfp-flag of '0' to avoid deadlocking on I/O.

Hmmm.
0 seems to have very strong meanings rather than nfs required.
There is no reason to prevent grabbing mutex, calling cond_resched() etc etc...

[digging old git history]

Ho hum...

Old commit log says passing gfp-flag=0 break xfs. but current xfs doesn't
use gfp_mask argument. hm.

============================================================
commit 68678e2fc6cfdfd013a2513fe416726f3c05b28d
Author: akpm <akpm>
Date: Tue Sep 10 18:09:08 2002 +0000

[PATCH] pass the correct flags to aops->releasepage()

Restore the gfp_mask in the VM's call to a_ops->releasepage(). We can
block in there again, and XFS (at least) can use that.

BKrev: 3d7e35445skDsKDFM6rdiwTY-5elsw

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5ed1ec3..89d801e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -208,7 +208,7 @@ shrink_list(struct list_head *page_list, int nr_pages,
* Otherwise, leave the page on the LRU so it is swappable.
*/
if (PagePrivate(page)) {
- if (!try_to_release_page(page, 0))
+ if (!try_to_release_page(page, gfp_mask))
goto keep_locked;
if (!mapping && page_count(page) == 1)
goto free_it;
============================================================

Now, gfp_mask of try_to_release_page() are used in two place.

btrfs: btrfs_releasepage (check GFP_WAIT)
nfs: nfs_release_page ((gfp & GFP_KERNEL) == GFP_KERNEL)

Probably, btrfs can remove such GFP_WAIT check from try_release_extent_mapping
because it doesn't sleep. I dunno. if so, we can change it to 0 again. but
I'm not sure it has enough worth thing.

Chris, can we hear how btrfs handle gfp_mask argument of release_page()?

btw, VM fokls need more consider kswapd design. now kswapd oftern sleep.
But Trond's bug report says, waiting itself can makes deadlock potentially.
Perhaps it's merely imagine thing. but need to some consider...

2010-08-02 18:31:43

by Jan Kara

[permalink] [raw]

Subject: Re: [PATCH 6/6] vmscan: Kick flusher threads to clean pages when reclaim is encountering dirty pages

On Sat 31-07-10 11:33:22, Mel Gorman wrote:
> On Fri, Jul 30, 2010 at 03:06:01PM -0700, Andrew Morton wrote:
> > Sigh. We have sooo many problems with writeback and latency. Read
> > https://bugzilla.kernel.org/show_bug.cgi?id=12309 and weep.
>
> You aren't joking.
>
> > Everyone's
> > running away from the issue and here we are adding code to solve some
> > alleged stack-overflow problem which seems to be largely a non-problem,
> > by making changes which may worsen our real problems.
> >
>
> As it is, filesystems are beginnning to ignore writeback from direct
> reclaim - such as xfs and btrfs. I'm lead to believe that ext3
> effectively ignores writeback from direct reclaim although I don't have
> access to code at the moment to double check (am on the road). So either
> way, we are going to be facing this problem so the VM might as well be
> aware of it :/
Umm, ext3 should be handling direct reclaim just fine. ext4 does however
ignore it when a page does not have block already allocated (which is a
common case with delayed allocation).

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2010-08-05 06:45:35

[permalink] [raw]

Subject: Re: [PATCH 6/6] vmscan: Kick flusher threads to clean pages when reclaim is encountering dirty pages

sorry for the _very_ delayed review.

> There are a number of cases where pages get cleaned but two of concern
> to this patch are;
> o When dirtying pages, processes may be throttled to clean pages if
> dirty_ratio is not met.
> o Pages belonging to inodes dirtied longer than
> dirty_writeback_centisecs get cleaned.
>
> The problem for reclaim is that dirty pages can reach the end of the LRU if
> pages are being dirtied slowly so that neither the throttling or a flusher
> thread waking periodically cleans them.
>
> Background flush is already cleaning old or expired inodes first but the
> expire time is too far in the future at the time of page reclaim. To mitigate
> future problems, this patch wakes flusher threads to clean 4M of data -
> an amount that should be manageable without causing congestion in many cases.
>
> Ideally, the background flushers would only be cleaning pages belonging
> to the zone being scanned but it's not clear if this would be of benefit
> (less IO) or not (potentially less efficient IO if an inode is scattered
> across multiple zones).
>
> Signed-off-by: Mel Gorman <[email protected]>
> ---
> mm/vmscan.c | 33 +++++++++++++++++++++++++++++++--
> 1 files changed, 31 insertions(+), 2 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 2d2b588..c4c81bc 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -142,6 +142,18 @@ static DECLARE_RWSEM(shrinker_rwsem);
> /* Direct lumpy reclaim waits up to five seconds for background cleaning */
> #define MAX_SWAP_CLEAN_WAIT 50
>
> +/*
> + * When reclaim encounters dirty data, wakeup flusher threads to clean
> + * a maximum of 4M of data.
> + */
> +#define MAX_WRITEBACK (4194304UL >> PAGE_SHIFT)
> +#define WRITEBACK_FACTOR (MAX_WRITEBACK / SWAP_CLUSTER_MAX)
> +static inline long nr_writeback_pages(unsigned long nr_dirty)
> +{
> + return laptop_mode ? 0 :
> + min(MAX_WRITEBACK, (nr_dirty * WRITEBACK_FACTOR));
> +}

??

As far as I remembered, Hannes pointed out wakeup_flusher_threads(0) is
incorrect. can you fix this?

> +
> static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
> struct scan_control *sc)
> {
> @@ -649,12 +661,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
> static unsigned long shrink_page_list(struct list_head *page_list,
> struct scan_control *sc,
> enum pageout_io sync_writeback,
> + int file,
> unsigned long *nr_still_dirty)
> {
> LIST_HEAD(ret_pages);
> LIST_HEAD(free_pages);
> int pgactivate = 0;
> unsigned long nr_dirty = 0;
> + unsigned long nr_dirty_seen = 0;
> unsigned long nr_reclaimed = 0;
>
> cond_resched();
> @@ -748,6 +762,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
> }
>
> if (PageDirty(page)) {
> + nr_dirty_seen++;
> +
> /*
> * Only kswapd can writeback filesystem pages to
> * avoid risk of stack overflow
> @@ -875,6 +891,18 @@ keep:
>
> list_splice(&ret_pages, page_list);
>
> + /*
> + * If reclaim is encountering dirty pages, it may be because
> + * dirty pages are reaching the end of the LRU even though the
> + * dirty_ratio may be satisified. In this case, wake flusher
> + * threads to pro-actively clean up to a maximum of
> + * 4 * SWAP_CLUSTER_MAX amount of data (usually 1/2MB) unless
> + * !may_writepage indicates that this is a direct reclaimer in
> + * laptop mode avoiding disk spin-ups
> + */
> + if (file && nr_dirty_seen && sc->may_writepage)
> + wakeup_flusher_threads(nr_writeback_pages(nr_dirty));

Umm..
I don't think this guessing is so acculate. following is brief of
current isolate_lru_pages().

static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
struct list_head *src, struct list_head *dst,
unsigned long *scanned, int order, int mode, int file)
{
for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
__isolate_lru_page(page, mode, file))

if (!order)
continue;

/*
* Attempt to take all pages in the order aligned region
* surrounding the tag page. Only take those pages of
* the same active state as that tag page. We may safely
* round the target page pfn down to the requested order
* as the mem_map is guarenteed valid out to MAX_ORDER,
* where that page is in a different zone we will detect
* it from its zone id and abort this block scan.
*/
for (; pfn < end_pfn; pfn++) {
struct page *cursor_page;
(snip)
}

(This was unchanged since initial lumpy reclaim commit)

That said, merely order-1 isolate_lru_pages(ISOLATE_INACTIVE) makes pfn
neighbor search. then, we might found dirty pages even though the page
don't stay in end of lru.

What do you think?

> +
> *nr_still_dirty = nr_dirty;
> count_vm_events(PGACTIVATE, pgactivate);
> return nr_reclaimed;
> @@ -1315,7 +1343,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
> spin_unlock_irq(&zone->lru_lock);
>
> nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC,
> - &nr_dirty);
> + file, &nr_dirty);
>
> /*
> * If specific pages are needed such as with direct reclaiming
> @@ -1351,7 +1379,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
> count_vm_events(PGDEACTIVATE, nr_active);
>
> nr_reclaimed += shrink_page_list(&page_list, sc,
> - PAGEOUT_IO_SYNC, &nr_dirty);
> + PAGEOUT_IO_SYNC, file,
> + &nr_dirty);
> }
> }
>
> --
> 1.7.1
>

2010-08-05 06:59:46

[permalink] [raw]

Subject: Re: [PATCH 5/6] vmscan: Do not writeback filesystem pages in direct reclaim

again, very sorry for the delay.

> When memory is under enough pressure, a process may enter direct
> reclaim to free pages in the same manner kswapd does. If a dirty page is
> encountered during the scan, this page is written to backing storage using
> mapping->writepage. This can result in very deep call stacks, particularly
> if the target storage or filesystem are complex. It has already been observed
> on XFS that the stack overflows but the problem is not XFS-specific.
>
> This patch prevents direct reclaim writing back filesystem pages by checking
> if current is kswapd or the page is anonymous before writing back. If the
> dirty pages cannot be written back, they are placed back on the LRU lists
> for either background writing by the BDI threads or kswapd. If in direct
> lumpy reclaim and dirty pages are encountered, the process will stall for
> the background flusher before trying to reclaim the pages again.
>
> As the call-chain for writing anonymous pages is not expected to be deep
> and they are not cleaned by flusher threads, anonymous pages are still
> written back in direct reclaim.
>
> Signed-off-by: Mel Gorman <[email protected]>
> Acked-by: Rik van Riel <[email protected]>
> Reviewed-by: Johannes Weiner <[email protected]>
> ---
> mm/vmscan.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++------------
> 1 files changed, 54 insertions(+), 15 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index d83812a..2d2b588 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -139,6 +139,9 @@ static DECLARE_RWSEM(shrinker_rwsem);
> #define scanning_global_lru(sc) (1)
> #endif
>
> +/* Direct lumpy reclaim waits up to five seconds for background cleaning */
> +#define MAX_SWAP_CLEAN_WAIT 50
> +
> static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
> struct scan_control *sc)
> {
> @@ -645,11 +648,13 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
> */
> static unsigned long shrink_page_list(struct list_head *page_list,
> struct scan_control *sc,
> - enum pageout_io sync_writeback)
> + enum pageout_io sync_writeback,
> + unsigned long *nr_still_dirty)
> {
> LIST_HEAD(ret_pages);
> LIST_HEAD(free_pages);
> int pgactivate = 0;
> + unsigned long nr_dirty = 0;
> unsigned long nr_reclaimed = 0;
>
> cond_resched();
> @@ -743,6 +748,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
> }
>
> if (PageDirty(page)) {
> + /*
> + * Only kswapd can writeback filesystem pages to
> + * avoid risk of stack overflow
> + */
> + if (page_is_file_cache(page) && !current_is_kswapd()) {
> + nr_dirty++;
> + goto keep_locked;
> + }
> +
> if (references == PAGEREF_RECLAIM_CLEAN)
> goto keep_locked;
> if (!may_enter_fs)
> @@ -860,6 +874,8 @@ keep:
> free_page_list(&free_pages);
>
> list_splice(&ret_pages, page_list);
> +
> + *nr_still_dirty = nr_dirty;
> count_vm_events(PGACTIVATE, pgactivate);
> return nr_reclaimed;
> }
> @@ -1242,12 +1258,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
> struct scan_control *sc, int priority, int file)
> {
> LIST_HEAD(page_list);
> + LIST_HEAD(putback_list);
> unsigned long nr_scanned;
> unsigned long nr_reclaimed = 0;
> unsigned long nr_taken;
> unsigned long nr_active;
> unsigned long nr_anon;
> unsigned long nr_file;
> + unsigned long nr_dirty;
>
> while (unlikely(too_many_isolated(zone, file, sc))) {
> congestion_wait(BLK_RW_ASYNC, HZ/10);
> @@ -1296,28 +1314,49 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
>
> spin_unlock_irq(&zone->lru_lock);
>
> - nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
> + nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC,
> + &nr_dirty);
>
> /*
> - * If we are direct reclaiming for contiguous pages and we do
> + * If specific pages are needed such as with direct reclaiming
> + * for contiguous pages or for memory containers and we do
> * not reclaim everything in the list, try again and wait
> - * for IO to complete. This will stall high-order allocations
> - * but that should be acceptable to the caller
> + * for IO to complete. This will stall callers that require
> + * specific pages but it should be acceptable to the caller
> */
> - if (nr_reclaimed < nr_taken && !current_is_kswapd() &&
> - sc->lumpy_reclaim_mode) {
> - congestion_wait(BLK_RW_ASYNC, HZ/10);
> + if (sc->may_writepage && !current_is_kswapd() &&
> + (sc->lumpy_reclaim_mode || sc->mem_cgroup)) {
> + int dirty_retry = MAX_SWAP_CLEAN_WAIT;
>
> - /*
> - * The attempt at page out may have made some
> - * of the pages active, mark them inactive again.
> - */
> - nr_active = clear_active_flags(&page_list, NULL);
> - count_vm_events(PGDEACTIVATE, nr_active);
> + while (nr_reclaimed < nr_taken && nr_dirty && dirty_retry--) {
> + struct page *page, *tmp;
> +
> + /* Take off the clean pages marked for activation */
> + list_for_each_entry_safe(page, tmp, &page_list, lru) {
> + if (PageDirty(page) || PageWriteback(page))
> + continue;
> +
> + list_del(&page->lru);
> + list_add(&page->lru, &putback_list);
> + }
> +
> + wakeup_flusher_threads(laptop_mode ? 0 : nr_dirty);

ditto.
wakeup_flusher_threads(0) is not correct?

And, When flusher thread still don't start IO, this loop don't have proper
waiting. do we need wait_on_page_dirty() or something?
(similar wait_on_page_writeback)

> + congestion_wait(BLK_RW_ASYNC, HZ/10);

As we discussed, congestion_wait() don't works find if slow strage device
is connected.

>
> - nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC);
> + /*
> + * The attempt at page out may have made some
> + * of the pages active, mark them inactive again.
> + */
> + nr_active = clear_active_flags(&page_list, NULL);
> + count_vm_events(PGDEACTIVATE, nr_active);
> +
> + nr_reclaimed += shrink_page_list(&page_list, sc,
> + PAGEOUT_IO_SYNC, &nr_dirty);

After my patch, when PAGEOUT_IO_SYNC failure, retry is no good idea.
can we remove this loop?

> + }
> }
>
> + list_splice(&putback_list, &page_list);
> +
> local_irq_disable();
> if (current_is_kswapd())
> __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
> --
> 1.7.1
>

2010-08-05 14:10:06

[permalink] [raw]

Subject: Re: [PATCH 6/6] vmscan: Kick flusher threads to clean pages when reclaim is encountering dirty pages

On Thu, Aug 05, 2010 at 03:45:24PM +0900, KOSAKI Motohiro wrote:
>
> sorry for the _very_ delayed review.
>

Not to worry.

> > <SNIP>
> > +/*
> > + * When reclaim encounters dirty data, wakeup flusher threads to clean
> > + * a maximum of 4M of data.
> > + */
> > +#define MAX_WRITEBACK (4194304UL >> PAGE_SHIFT)
> > +#define WRITEBACK_FACTOR (MAX_WRITEBACK / SWAP_CLUSTER_MAX)
> > +static inline long nr_writeback_pages(unsigned long nr_dirty)
> > +{
> > + return laptop_mode ? 0 :
> > + min(MAX_WRITEBACK, (nr_dirty * WRITEBACK_FACTOR));
> > +}
>
> ??
>
> As far as I remembered, Hannes pointed out wakeup_flusher_threads(0) is
> incorrect. can you fix this?
>

It's behaving as it should, see http://lkml.org/lkml/2010/7/20/151

>
>
> > +
> > static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
> > struct scan_control *sc)
> > {
> > @@ -649,12 +661,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
> > static unsigned long shrink_page_list(struct list_head *page_list,
> > struct scan_control *sc,
> > enum pageout_io sync_writeback,
> > + int file,
> > unsigned long *nr_still_dirty)
> > {
> > LIST_HEAD(ret_pages);
> > LIST_HEAD(free_pages);
> > int pgactivate = 0;
> > unsigned long nr_dirty = 0;
> > + unsigned long nr_dirty_seen = 0;
> > unsigned long nr_reclaimed = 0;
> >
> > cond_resched();
> > @@ -748,6 +762,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
> > }
> >
> > if (PageDirty(page)) {
> > + nr_dirty_seen++;
> > +
> > /*
> > * Only kswapd can writeback filesystem pages to
> > * avoid risk of stack overflow
> > @@ -875,6 +891,18 @@ keep:
> >
> > list_splice(&ret_pages, page_list);
> >
> > + /*
> > + * If reclaim is encountering dirty pages, it may be because
> > + * dirty pages are reaching the end of the LRU even though the
> > + * dirty_ratio may be satisified. In this case, wake flusher
> > + * threads to pro-actively clean up to a maximum of
> > + * 4 * SWAP_CLUSTER_MAX amount of data (usually 1/2MB) unless
> > + * !may_writepage indicates that this is a direct reclaimer in
> > + * laptop mode avoiding disk spin-ups
> > + */
> > + if (file && nr_dirty_seen && sc->may_writepage)
> > + wakeup_flusher_threads(nr_writeback_pages(nr_dirty));
>
> Umm..
> I don't think this guessing is so acculate. following is brief of
> current isolate_lru_pages().
>
>
> static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
> struct list_head *src, struct list_head *dst,
> unsigned long *scanned, int order, int mode, int file)
> {
> for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
> __isolate_lru_page(page, mode, file))
>
> if (!order)
> continue;
>
> /*
> * Attempt to take all pages in the order aligned region
> * surrounding the tag page. Only take those pages of
> * the same active state as that tag page. We may safely
> * round the target page pfn down to the requested order
> * as the mem_map is guarenteed valid out to MAX_ORDER,
> * where that page is in a different zone we will detect
> * it from its zone id and abort this block scan.
> */
> for (; pfn < end_pfn; pfn++) {
> struct page *cursor_page;
> (snip)
> }
>
> (This was unchanged since initial lumpy reclaim commit)
>

I think what you are pointing out is that when lumpy-reclaiming from the anon
LRU, there may be file pages on the page_list being shrinked. In that case, we
might miss an opportunity to wake the flusher threads when it was appropriate.

Is that accurate or have you another concern?

> That said, merely order-1 isolate_lru_pages(ISOLATE_INACTIVE) makes pfn
> neighbor search. then, we might found dirty pages even though the page
> don't stay in end of lru.
>
> What do you think?
>

For low-order lumpy reclaim, I think it should only be necessary to wake
the flusher threads when scanning the file LRU. While there may be file
pages lumpy reclaimed while scanning the anon list, I think we would
have to show it was a common and real problem before adding the
necessary accounting and checks.

>
> > +
> > *nr_still_dirty = nr_dirty;
> > count_vm_events(PGACTIVATE, pgactivate);
> > return nr_reclaimed;
> > @@ -1315,7 +1343,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
> > spin_unlock_irq(&zone->lru_lock);
> >
> > nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC,
> > - &nr_dirty);
> > + file, &nr_dirty);
> >
> > /*
> > * If specific pages are needed such as with direct reclaiming
> > @@ -1351,7 +1379,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
> > count_vm_events(PGDEACTIVATE, nr_active);
> >
> > nr_reclaimed += shrink_page_list(&page_list, sc,
> > - PAGEOUT_IO_SYNC, &nr_dirty);
> > + PAGEOUT_IO_SYNC, file,
> > + &nr_dirty);
> > }
> > }
> >
> > --
> > 1.7.1
> >
>
>
>

--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab

2010-08-05 14:16:05