This patch depends on the "vmscan: simplify code" patch (written by Huang Shijie)
=========================================
Now, max_scan of shrink_inactive_list() is always passed a value no larger
than SWAP_CLUSTER_MAX, so we can remove the page-scanning loop inside it
(a caller-side sketch follows the detail list).
Details:
- remove the "while (nr_scanned < max_scan)" loop
- remove the nr_freed variable (use nr_reclaimed directly)
- remove the nr_scan variable (use nr_scanned directly)
- rename max_scan to nr_to_scan
- pass nr_to_scan into isolate_pages() directly instead of using
  SWAP_CLUSTER_MAX
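For context, a condensed caller-side sketch (taken from the existing
shrink_zone() loop, simplified) showing why the batch handed down never
exceeds SWAP_CLUSTER_MAX:
	/* shrink_zone(): the only place that decides the batch size */
	nr_to_scan = min_t(unsigned long, nr[l], SWAP_CLUSTER_MAX);
	nr[l] -= nr_to_scan;
	nr_reclaimed += shrink_list(l, nr_to_scan, zone, sc, priority);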
Signed-off-by: KOSAKI Motohiro <[email protected]>
---
mm/vmscan.c | 194 ++++++++++++++++++++++++++++-------------------------------
1 files changed, 91 insertions(+), 103 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7536991..a58ff15 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1085,16 +1085,22 @@ static int too_many_isolated(struct zone *zone, int file,
* shrink_inactive_list() is a helper for shrink_zone(). It returns the number
* of reclaimed pages
*/
-static unsigned long shrink_inactive_list(unsigned long max_scan,
- struct zone *zone, struct scan_control *sc,
- int priority, int file)
+static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
+ struct zone *zone, struct scan_control *sc,
+ int priority, int file)
{
LIST_HEAD(page_list);
struct pagevec pvec;
- unsigned long nr_scanned = 0;
+ unsigned long nr_scanned;
unsigned long nr_reclaimed = 0;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
int lumpy_reclaim = 0;
+ struct page *page;
+ unsigned long nr_taken;
+ unsigned long nr_active;
+ unsigned int count[NR_LRU_LISTS] = { 0, };
+ unsigned long nr_anon;
+ unsigned long nr_file;
while (unlikely(too_many_isolated(zone, file, sc))) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1120,119 +1126,101 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
- do {
- struct page *page;
- unsigned long nr_taken;
- unsigned long nr_scan;
- unsigned long nr_freed;
- unsigned long nr_active;
- unsigned int count[NR_LRU_LISTS] = { 0, };
- int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE;
- unsigned long nr_anon;
- unsigned long nr_file;
-
- nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX,
- &page_list, &nr_scan, sc->order, mode,
- zone, sc->mem_cgroup, 0, file);
+ nr_taken = sc->isolate_pages(nr_to_scan,
+ &page_list, &nr_scanned, sc->order,
+ lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE,
+ zone, sc->mem_cgroup, 0, file);
- if (scanning_global_lru(sc)) {
- zone->pages_scanned += nr_scan;
- if (current_is_kswapd())
- __count_zone_vm_events(PGSCAN_KSWAPD, zone,
- nr_scan);
- else
- __count_zone_vm_events(PGSCAN_DIRECT, zone,
- nr_scan);
- }
+ if (scanning_global_lru(sc)) {
+ zone->pages_scanned += nr_scanned;
+ if (current_is_kswapd())
+ __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
+ else
+ __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
+ }
- if (nr_taken == 0)
- goto done;
+ if (nr_taken == 0)
+ goto done;
- nr_active = clear_active_flags(&page_list, count);
- __count_vm_events(PGDEACTIVATE, nr_active);
+ nr_active = clear_active_flags(&page_list, count);
+ __count_vm_events(PGDEACTIVATE, nr_active);
- __mod_zone_page_state(zone, NR_ACTIVE_FILE,
- -count[LRU_ACTIVE_FILE]);
- __mod_zone_page_state(zone, NR_INACTIVE_FILE,
- -count[LRU_INACTIVE_FILE]);
- __mod_zone_page_state(zone, NR_ACTIVE_ANON,
- -count[LRU_ACTIVE_ANON]);
- __mod_zone_page_state(zone, NR_INACTIVE_ANON,
- -count[LRU_INACTIVE_ANON]);
+ __mod_zone_page_state(zone, NR_ACTIVE_FILE,
+ -count[LRU_ACTIVE_FILE]);
+ __mod_zone_page_state(zone, NR_INACTIVE_FILE,
+ -count[LRU_INACTIVE_FILE]);
+ __mod_zone_page_state(zone, NR_ACTIVE_ANON,
+ -count[LRU_ACTIVE_ANON]);
+ __mod_zone_page_state(zone, NR_INACTIVE_ANON,
+ -count[LRU_INACTIVE_ANON]);
- nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
- nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
- __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
+ nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
+ nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
- reclaim_stat->recent_scanned[0] += nr_anon;
- reclaim_stat->recent_scanned[1] += nr_file;
+ reclaim_stat->recent_scanned[0] += nr_anon;
+ reclaim_stat->recent_scanned[1] += nr_file;
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(&zone->lru_lock);
- nr_scanned += nr_scan;
- nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+ nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+
+ /*
+ * If we are direct reclaiming for contiguous pages and we do
+ * not reclaim everything in the list, try again and wait
+ * for IO to complete. This will stall high-order allocations
+ * but that should be acceptable to the caller
+ */
+ if (nr_reclaimed < nr_taken && !current_is_kswapd() && lumpy_reclaim) {
+ congestion_wait(BLK_RW_ASYNC, HZ/10);
/*
- * If we are direct reclaiming for contiguous pages and we do
- * not reclaim everything in the list, try again and wait
- * for IO to complete. This will stall high-order allocations
- * but that should be acceptable to the caller
+ * The attempt at page out may have made some
+ * of the pages active, mark them inactive again.
*/
- if (nr_freed < nr_taken && !current_is_kswapd() &&
- lumpy_reclaim) {
- congestion_wait(BLK_RW_ASYNC, HZ/10);
-
- /*
- * The attempt at page out may have made some
- * of the pages active, mark them inactive again.
- */
- nr_active = clear_active_flags(&page_list, count);
- count_vm_events(PGDEACTIVATE, nr_active);
-
- nr_freed += shrink_page_list(&page_list, sc,
- PAGEOUT_IO_SYNC);
- }
+ nr_active = clear_active_flags(&page_list, count);
+ count_vm_events(PGDEACTIVATE, nr_active);
- nr_reclaimed += nr_freed;
+ nr_reclaimed += shrink_page_list(&page_list, sc,
+ PAGEOUT_IO_SYNC);
+ }
- local_irq_disable();
- if (current_is_kswapd())
- __count_vm_events(KSWAPD_STEAL, nr_freed);
- __count_zone_vm_events(PGSTEAL, zone, nr_freed);
+ local_irq_disable();
+ if (current_is_kswapd())
+ __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
+ __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
- spin_lock(&zone->lru_lock);
- /*
- * Put back any unfreeable pages.
- */
- while (!list_empty(&page_list)) {
- int lru;
- page = lru_to_page(&page_list);
- VM_BUG_ON(PageLRU(page));
- list_del(&page->lru);
- if (unlikely(!page_evictable(page, NULL))) {
- spin_unlock_irq(&zone->lru_lock);
- putback_lru_page(page);
- spin_lock_irq(&zone->lru_lock);
- continue;
- }
- SetPageLRU(page);
- lru = page_lru(page);
- add_page_to_lru_list(zone, page, lru);
- if (is_active_lru(lru)) {
- int file = is_file_lru(lru);
- reclaim_stat->recent_rotated[file]++;
- }
- if (!pagevec_add(&pvec, page)) {
- spin_unlock_irq(&zone->lru_lock);
- __pagevec_release(&pvec);
- spin_lock_irq(&zone->lru_lock);
- }
+ spin_lock(&zone->lru_lock);
+ /*
+ * Put back any unfreeable pages.
+ */
+ while (!list_empty(&page_list)) {
+ int lru;
+ page = lru_to_page(&page_list);
+ VM_BUG_ON(PageLRU(page));
+ list_del(&page->lru);
+ if (unlikely(!page_evictable(page, NULL))) {
+ spin_unlock_irq(&zone->lru_lock);
+ putback_lru_page(page);
+ spin_lock_irq(&zone->lru_lock);
+ continue;
}
- __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
-
- } while (nr_scanned < max_scan);
+ SetPageLRU(page);
+ lru = page_lru(page);
+ add_page_to_lru_list(zone, page, lru);
+ if (is_active_lru(lru)) {
+ int file = is_file_lru(lru);
+ reclaim_stat->recent_rotated[file]++;
+ }
+ if (!pagevec_add(&pvec, page)) {
+ spin_unlock_irq(&zone->lru_lock);
+ __pagevec_release(&pvec);
+ spin_lock_irq(&zone->lru_lock);
+ }
+ }
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
done:
spin_unlock_irq(&zone->lru_lock);
--
1.6.5.2
The current lru calculation (e.g. LRU_ACTIVE + file * LRU_FILE) is a bit
ugly.
Add a small helper function to improve code readability a bit.
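For reference, the mapping the helper encodes (these equalities follow from
enum lru_list and are shown purely as illustration):
	/*
	 *   lru_index(0, 0) == LRU_INACTIVE_ANON
	 *   lru_index(1, 0) == LRU_ACTIVE_ANON
	 *   lru_index(0, 1) == LRU_INACTIVE_FILE
	 *   lru_index(1, 1) == LRU_ACTIVE_FILE
	 */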
Signed-off-by: KOSAKI Motohiro <[email protected]>
---
mm/vmscan.c | 25 ++++++++++++++-----------
1 files changed, 14 insertions(+), 11 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a58ff15..7e0245d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -156,6 +156,16 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
return zone_page_state(zone, NR_LRU_BASE + lru);
}
+static inline enum lru_list lru_index(int active, int file)
+{
+ int lru = LRU_BASE;
+ if (active)
+ lru += LRU_ACTIVE;
+ if (file)
+ lru += LRU_FILE;
+
+ return lru;
+}
/*
* Add a shrinker callback to be called from the vm
@@ -978,13 +988,8 @@ static unsigned long isolate_pages_global(unsigned long nr,
struct mem_cgroup *mem_cont,
int active, int file)
{
- int lru = LRU_BASE;
- if (active)
- lru += LRU_ACTIVE;
- if (file)
- lru += LRU_FILE;
- return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
- mode, file);
+ return isolate_lru_pages(nr, &z->lru[lru_index(active, file)].list,
+ dst, scanned, order, mode, file);
}
/*
@@ -1373,10 +1378,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
*/
reclaim_stat->recent_rotated[file] += nr_rotated;
- move_active_pages_to_lru(zone, &l_active,
- LRU_ACTIVE + file * LRU_FILE);
- move_active_pages_to_lru(zone, &l_inactive,
- LRU_BASE + file * LRU_FILE);
+ move_active_pages_to_lru(zone, &l_active, lru_index(1, file));
+ move_active_pages_to_lru(zone, &l_inactive, lru_index(0, file));
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
}
--
1.6.5.2
The pgmoved accounting in move_active_pages_to_lru() doesn't need to be done
there: it can be calculated in an irq-enabled context instead.
This patch moves the number-of-deactivated-pages calculation up into
shrink_active_list().
Fortunately, it also kills one branch.
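Condensed from the hunks below, a sketch of where the accounting ends up:
the counters are bumped while sorting pages with irqs enabled, and the
statistics are applied once afterwards under zone->lru_lock ("keep_active"
stands in for the VM_EXEC/page_is_file_cache test and is not a real
variable):
	/* while sorting isolated pages in shrink_active_list() (irqs enabled) */
	if (keep_active)
		nr_reactivated++;
	else
		nr_deactivated++;
	/* later, once, under zone->lru_lock */
	__count_vm_events(PGDEACTIVATE, nr_deactivated);
	__mod_zone_page_state(zone, lru_stat_index(1, file), nr_reactivated);
	__mod_zone_page_state(zone, lru_stat_index(0, file), nr_deactivated);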
Signed-off-by: KOSAKI Motohiro <[email protected]>
---
mm/vmscan.c | 18 ++++++++++++------
1 files changed, 12 insertions(+), 6 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7e0245d..56faefb 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -167,6 +167,11 @@ static inline enum lru_list lru_index(int active, int file)
return lru;
}
+static inline int lru_stat_index(int active, int file)
+{
+ return lru_index(active, file) + NR_LRU_BASE;
+}
+
/*
* Add a shrinker callback to be called from the vm
*/
@@ -1269,7 +1274,6 @@ static void move_active_pages_to_lru(struct zone *zone,
struct list_head *list,
enum lru_list lru)
{
- unsigned long pgmoved = 0;
struct pagevec pvec;
struct page *page;
@@ -1283,7 +1287,6 @@ static void move_active_pages_to_lru(struct zone *zone,
list_move(&page->lru, &zone->lru[lru].list);
mem_cgroup_add_lru_list(page, lru);
- pgmoved++;
if (!pagevec_add(&pvec, page) || list_empty(list)) {
spin_unlock_irq(&zone->lru_lock);
@@ -1293,9 +1296,6 @@ static void move_active_pages_to_lru(struct zone *zone,
spin_lock_irq(&zone->lru_lock);
}
}
- __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
- if (!is_active_lru(lru))
- __count_vm_events(PGDEACTIVATE, pgmoved);
}
static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
@@ -1310,6 +1310,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
struct page *page;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
unsigned long nr_rotated = 0;
+ unsigned long nr_deactivated = 0;
+ unsigned long nr_reactivated = 0;
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
@@ -1358,12 +1360,14 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
*/
if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
list_add(&page->lru, &l_active);
+ nr_reactivated++;
continue;
}
}
ClearPageActive(page); /* we are de-activating */
list_add(&page->lru, &l_inactive);
+ nr_deactivated++;
}
/*
@@ -1377,9 +1381,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
* get_scan_ratio.
*/
reclaim_stat->recent_rotated[file] += nr_rotated;
-
move_active_pages_to_lru(zone, &l_active, lru_index(1, file));
move_active_pages_to_lru(zone, &l_inactive, lru_index(0, file));
+ __count_vm_events(PGDEACTIVATE, nr_deactivated);
+ __mod_zone_page_state(zone, lru_stat_index(1, file), nr_reactivated);
+ __mod_zone_page_state(zone, lru_stat_index(0, file), nr_deactivated);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
}
--
1.6.5.2
note: Last year, Andy Whitcroft reported that the pcp lists prevent building
contiguous high-order pages while lumpy reclaim is running.
He posted the "capture pages freed during direct reclaim for allocation by the reclaimer"
patch series, but Christoph suggested simply bypassing the pcp instead.
I implemented that here. I'd like to hear Christoph's and Mel's opinions.
==========================
Currently vmscan frees unused pages via __pagevec_free(), which frees pages
one by one through the pcp. That has two suboptimal results:
- Another task can easily steal the freed pages from the pcp, which reduces
  the benefit of lumpy reclaim.
- By polluting the pcp cache, the pages freed by vmscan may kick cache-hot
  pages out of the pcp.
This patch adds a new free_pages_bulk() function and makes vmscan use it;
the intended call pattern is sketched below.
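Condensed from the vmscan hunks below, the intended call pattern
(illustrative only):
	LIST_HEAD(freed_pages_list);
	/* shrink_page_list() collects the freed pages on the list ... */
	nr_reclaimed = shrink_page_list(&page_list, &freed_pages_list, sc,
					PAGEOUT_IO_ASYNC);
	/*
	 * ... and the whole batch goes straight to the buddy allocator,
	 * bypassing the per-cpu pages.
	 */
	free_pages_bulk(zone, nr_reclaimed, &freed_pages_list);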
Signed-off-by: KOSAKI Motohiro <[email protected]>
---
include/linux/gfp.h | 2 +
mm/page_alloc.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++
mm/vmscan.c | 23 +++++++++++----------
3 files changed, 70 insertions(+), 11 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f53e9b8..403584d 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -330,6 +330,8 @@ extern void free_hot_page(struct page *page);
#define __free_page(page) __free_pages((page), 0)
#define free_page(addr) free_pages((addr),0)
+void free_pages_bulk(struct zone *zone, int count, struct list_head *list);
+
void page_alloc_init(void);
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
void drain_all_pages(void);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 11ae66e..f77f8a8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2037,6 +2037,62 @@ void free_pages(unsigned long addr, unsigned int order)
EXPORT_SYMBOL(free_pages);
+/*
+ * Frees a number of pages from the list
+ * Assumes all pages on list are in same zone and order==0.
+ * count is the number of pages to free.
+ *
+ * This is similar to __pagevec_free(), but receive list instead pagevec.
+ * and this don't use pcp cache. it is good characteristics for vmscan.
+ */
+void free_pages_bulk(struct zone *zone, int count, struct list_head *list)
+{
+ unsigned long flags;
+ struct page *page;
+ struct page *page2;
+
+ list_for_each_entry_safe(page, page2, list, lru) {
+ int wasMlocked = __TestClearPageMlocked(page);
+
+ kmemcheck_free_shadow(page, 0);
+
+ if (PageAnon(page))
+ page->mapping = NULL;
+ if (free_pages_check(page)) {
+ /* orphan this page. */
+ list_del(&page->lru);
+ continue;
+ }
+ if (!PageHighMem(page)) {
+ debug_check_no_locks_freed(page_address(page),
+ PAGE_SIZE);
+ debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
+ }
+ arch_free_page(page, 0);
+ kernel_map_pages(page, 1, 0);
+
+ local_irq_save(flags);
+ if (unlikely(wasMlocked))
+ free_page_mlock(page);
+ local_irq_restore(flags);
+ }
+
+ spin_lock_irqsave(&zone->lock, flags);
+ __count_vm_events(PGFREE, count);
+ zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
+ zone->pages_scanned = 0;
+
+ __mod_zone_page_state(zone, NR_FREE_PAGES, count);
+
+ list_for_each_entry_safe(page, page2, list, lru) {
+ /* have to delete it as __free_one_page list manipulates */
+ list_del(&page->lru);
+ trace_mm_page_free_direct(page, 0);
+ __free_one_page(page, zone, 0, page_private(page));
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+
/**
* alloc_pages_exact - allocate an exact number physically-contiguous pages.
* @size: the number of bytes to allocate
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 56faefb..00156f2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -598,18 +598,17 @@ redo:
* shrink_page_list() returns the number of reclaimed pages
*/
static unsigned long shrink_page_list(struct list_head *page_list,
+ struct list_head *freed_pages_list,
struct scan_control *sc,
enum pageout_io sync_writeback)
{
LIST_HEAD(ret_pages);
- struct pagevec freed_pvec;
int pgactivate = 0;
unsigned long nr_reclaimed = 0;
unsigned long vm_flags;
cond_resched();
- pagevec_init(&freed_pvec, 1);
while (!list_empty(page_list)) {
struct address_space *mapping;
struct page *page;
@@ -785,10 +784,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
__clear_page_locked(page);
free_it:
nr_reclaimed++;
- if (!pagevec_add(&freed_pvec, page)) {
- __pagevec_free(&freed_pvec);
- pagevec_reinit(&freed_pvec);
- }
+ list_add(&page->lru, freed_pages_list);
continue;
cull_mlocked:
@@ -812,8 +808,6 @@ keep:
VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
}
list_splice(&ret_pages, page_list);
- if (pagevec_count(&freed_pvec))
- __pagevec_free(&freed_pvec);
count_vm_events(PGACTIVATE, pgactivate);
return nr_reclaimed;
}
@@ -1100,6 +1094,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
int priority, int file)
{
LIST_HEAD(page_list);
+ LIST_HEAD(freed_pages_list);
struct pagevec pvec;
unsigned long nr_scanned;
unsigned long nr_reclaimed = 0;
@@ -1174,7 +1169,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
spin_unlock_irq(&zone->lru_lock);
- nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+ nr_reclaimed = shrink_page_list(&page_list, &freed_pages_list, sc,
+ PAGEOUT_IO_ASYNC);
/*
* If we are direct reclaiming for contiguous pages and we do
@@ -1192,10 +1188,15 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
nr_active = clear_active_flags(&page_list, count);
count_vm_events(PGDEACTIVATE, nr_active);
- nr_reclaimed += shrink_page_list(&page_list, sc,
- PAGEOUT_IO_SYNC);
+ nr_reclaimed += shrink_page_list(&page_list, &freed_pages_list,
+ sc, PAGEOUT_IO_SYNC);
}
+ /*
+ * Free unused pages.
+ */
+ free_pages_bulk(zone, nr_reclaimed, &freed_pages_list);
+
local_irq_disable();
if (current_is_kswapd())
__count_vm_events(KSWAPD_STEAL, nr_reclaimed);
--
1.6.5.2
Hi KOSAKI,
On Fri, 27 Nov 2009, KOSAKI Motohiro wrote:
> Current lru calculation (e.g. LRU_ACTIVE + file * LRU_FILE) is a bit
> ugly.
> To make helper function improve code readability a bit.
>
> Signed-off-by: KOSAKI Motohiro <[email protected]>
> ---
> mm/vmscan.c | 25 ++++++++++++++-----------
> 1 files changed, 14 insertions(+), 11 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index a58ff15..7e0245d 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -156,6 +156,16 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
> return zone_page_state(zone, NR_LRU_BASE + lru);
> }
>
> +static inline enum lru_list lru_index(int active, int file)
> +{
> + int lru = LRU_BASE;
> + if (active)
> + lru += LRU_ACTIVE;
> + if (file)
> + lru += LRU_FILE;
> +
> + return lru;
> +}
>
> @@ -1373,10 +1378,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
> */
> reclaim_stat->recent_rotated[file] += nr_rotated;
>
> - move_active_pages_to_lru(zone, &l_active,
> - LRU_ACTIVE + file * LRU_FILE);
> - move_active_pages_to_lru(zone, &l_inactive,
> - LRU_BASE + file * LRU_FILE);
> + move_active_pages_to_lru(zone, &l_active, lru_index(1, file));
> + move_active_pages_to_lru(zone, &l_inactive, lru_index(0, file));
How about:
move_active_pages_to_lru(zone, &l_active, lru_index(LRU_ACTIVE, file));
move_active_pages_to_lru(zone, &l_inactive, lru_index(LRU_BASE, file));
?
> > @@ -1373,10 +1378,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
> > */
> > reclaim_stat->recent_rotated[file] += nr_rotated;
> >
> > - move_active_pages_to_lru(zone, &l_active,
> > - LRU_ACTIVE + file * LRU_FILE);
> > - move_active_pages_to_lru(zone, &l_inactive,
> > - LRU_BASE + file * LRU_FILE);
> > + move_active_pages_to_lru(zone, &l_active, lru_index(1, file));
> > + move_active_pages_to_lru(zone, &l_inactive, lru_index(0, file));
>
> How about:
> move_active_pages_to_lru(zone, &l_active, lru_index(LRU_ACTIVE, file));
> move_active_pages_to_lru(zone, &l_inactive, lru_index(LRU_BASE, file));
> ?
No. lru_index() converts two booleans into one index; it shouldn't be passed
an lru index itself.
On Fri, 27 Nov 2009, KOSAKI Motohiro wrote:
> > > @@ -1373,10 +1378,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
> > > */
> > > reclaim_stat->recent_rotated[file] += nr_rotated;
> > >
> > > - move_active_pages_to_lru(zone, &l_active,
> > > - LRU_ACTIVE + file * LRU_FILE);
> > > - move_active_pages_to_lru(zone, &l_inactive,
> > > - LRU_BASE + file * LRU_FILE);
> > > + move_active_pages_to_lru(zone, &l_active, lru_index(1, file));
> > > + move_active_pages_to_lru(zone, &l_inactive, lru_index(0, file));
> >
> > How about:
> > move_active_pages_to_lru(zone, &l_active, lru_index(LRU_ACTIVE, file));
> > move_active_pages_to_lru(zone, &l_inactive, lru_index(LRU_BASE, file));
> > ?
>
> No. lru_index mean convert two boolean to one index. it shoudn't be passed
> lru index itself.
Yeah, you are right; I realized I was wrong immediately after replying,
sorry.
Vincent
On Fri, Nov 27, 2009 at 09:23:57AM +0900, KOSAKI Motohiro wrote:
>
> note: Last year, Andy Whitcroft reported pcp prevent to make contenious
> high order page when lumpy reclaim is running.
I don't remember the specifics of the discussion but I know that when
that patch series was being prototyped, it was because order-0
allocations were racing with lumpy reclaimers. A lumpy reclaim might
free up an order-9 page say but while it was freeing, an order-0 page
would be allocated from the middle. It wasn't the PCP lists as such that
were a problem once they were getting drained as part of a high-order
allocation attempt. It would be just as bad if the order-0 page was
taken from the buddy lists.
> He posted "capture pages freed during direct reclaim for allocation by the reclaimer"
> patch series, but Christoph mentioned simple bypass pcp instead.
> I made it. I'd hear Christoph and Mel's mention.
>
> ==========================
> Currently vmscan free unused pages by __pagevec_free(). It mean free pages one by one
> and use pcp. it makes two suboptimal result.
>
> - The another task can steal the freed page in pcp easily. it decrease
> lumpy reclaim worth.
> - To pollute pcp cache, vmscan freed pages might kick out cache hot
> pages from pcp.
>
The latter point is interesting.
> This patch make new free_pages_bulk() function and vmscan use it.
>
> Signed-off-by: KOSAKI Motohiro <[email protected]>
> ---
> include/linux/gfp.h | 2 +
> mm/page_alloc.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++
> mm/vmscan.c | 23 +++++++++++----------
> 3 files changed, 70 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index f53e9b8..403584d 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -330,6 +330,8 @@ extern void free_hot_page(struct page *page);
> #define __free_page(page) __free_pages((page), 0)
> #define free_page(addr) free_pages((addr),0)
>
> +void free_pages_bulk(struct zone *zone, int count, struct list_head *list);
> +
> void page_alloc_init(void);
> void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
> void drain_all_pages(void);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 11ae66e..f77f8a8 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2037,6 +2037,62 @@ void free_pages(unsigned long addr, unsigned int order)
>
> EXPORT_SYMBOL(free_pages);
>
> +/*
> + * Frees a number of pages from the list
> + * Assumes all pages on list are in same zone and order==0.
> + * count is the number of pages to free.
> + *
> + * This is similar to __pagevec_free(), but receive list instead pagevec.
> + * and this don't use pcp cache. it is good characteristics for vmscan.
> + */
> +void free_pages_bulk(struct zone *zone, int count, struct list_head *list)
> +{
> + unsigned long flags;
> + struct page *page;
> + struct page *page2;
> +
> + list_for_each_entry_safe(page, page2, list, lru) {
> + int wasMlocked = __TestClearPageMlocked(page);
> +
> + kmemcheck_free_shadow(page, 0);
> +
> + if (PageAnon(page))
> + page->mapping = NULL;
> + if (free_pages_check(page)) {
> + /* orphan this page. */
> + list_del(&page->lru);
> + continue;
> + }
> + if (!PageHighMem(page)) {
> + debug_check_no_locks_freed(page_address(page),
> + PAGE_SIZE);
> + debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
> + }
> + arch_free_page(page, 0);
> + kernel_map_pages(page, 1, 0);
> +
> + local_irq_save(flags);
> + if (unlikely(wasMlocked))
> + free_page_mlock(page);
> + local_irq_restore(flags);
> + }
> +
> + spin_lock_irqsave(&zone->lock, flags);
> + __count_vm_events(PGFREE, count);
> + zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
> + zone->pages_scanned = 0;
> +
> + __mod_zone_page_state(zone, NR_FREE_PAGES, count);
> +
> + list_for_each_entry_safe(page, page2, list, lru) {
> + /* have to delete it as __free_one_page list manipulates */
> + list_del(&page->lru);
> + trace_mm_page_free_direct(page, 0);
> + __free_one_page(page, zone, 0, page_private(page));
> + }
> + spin_unlock_irqrestore(&zone->lock, flags);
> +}
It would be preferable for the bulk free code to use as much of the existing
free logic in the page allocator as possible. This is making a lot of checks
that are already done elsewhere. As this is an RFC, it's not critical, but
worth bearing in mind.
> +
> /**
> * alloc_pages_exact - allocate an exact number physically-contiguous pages.
> * @size: the number of bytes to allocate
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 56faefb..00156f2 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -598,18 +598,17 @@ redo:
> * shrink_page_list() returns the number of reclaimed pages
> */
> static unsigned long shrink_page_list(struct list_head *page_list,
> + struct list_head *freed_pages_list,
> struct scan_control *sc,
Should the freed_pages_list be part of scan_control?
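A minimal sketch of how that could look (purely illustrative; it makes the
freed_pages_list a scan_control member instead of an extra parameter, and
the exact placement is hypothetical):
	struct scan_control {
		/* ... existing members ... */
		/* pages freed by shrink_page_list(), fed to free_pages_bulk() */
		struct list_head *freed_pages_list;
	};

	/* then, on the free_it: path in shrink_page_list() */
	list_add(&page->lru, sc->freed_pages_list);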
> enum pageout_io sync_writeback)
> {
> LIST_HEAD(ret_pages);
> - struct pagevec freed_pvec;
> int pgactivate = 0;
> unsigned long nr_reclaimed = 0;
> unsigned long vm_flags;
>
> cond_resched();
>
> - pagevec_init(&freed_pvec, 1);
> while (!list_empty(page_list)) {
> struct address_space *mapping;
> struct page *page;
> @@ -785,10 +784,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
> __clear_page_locked(page);
> free_it:
> nr_reclaimed++;
> - if (!pagevec_add(&freed_pvec, page)) {
> - __pagevec_free(&freed_pvec);
> - pagevec_reinit(&freed_pvec);
> - }
> + list_add(&page->lru, freed_pages_list);
> continue;
>
> cull_mlocked:
> @@ -812,8 +808,6 @@ keep:
> VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
> }
> list_splice(&ret_pages, page_list);
> - if (pagevec_count(&freed_pvec))
> - __pagevec_free(&freed_pvec);
> count_vm_events(PGACTIVATE, pgactivate);
> return nr_reclaimed;
> }
> @@ -1100,6 +1094,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
> int priority, int file)
> {
> LIST_HEAD(page_list);
> + LIST_HEAD(freed_pages_list);
> struct pagevec pvec;
> unsigned long nr_scanned;
> unsigned long nr_reclaimed = 0;
> @@ -1174,7 +1169,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
>
> spin_unlock_irq(&zone->lru_lock);
>
> - nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
> + nr_reclaimed = shrink_page_list(&page_list, &freed_pages_list, sc,
> + PAGEOUT_IO_ASYNC);
>
> /*
> * If we are direct reclaiming for contiguous pages and we do
> @@ -1192,10 +1188,15 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
> nr_active = clear_active_flags(&page_list, count);
> count_vm_events(PGDEACTIVATE, nr_active);
>
> - nr_reclaimed += shrink_page_list(&page_list, sc,
> - PAGEOUT_IO_SYNC);
> + nr_reclaimed += shrink_page_list(&page_list, &freed_pages_list,
> + sc, PAGEOUT_IO_SYNC);
> }
>
> + /*
> + * Free unused pages.
> + */
> + free_pages_bulk(zone, nr_reclaimed, &freed_pages_list);
> +
> local_irq_disable();
> if (current_is_kswapd())
> __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
This patch does not stand-alone so it's not easy to test. I'll think about
the idea more although I do see how it might help slightly in the same way
capture-reclaim did by closing the race window with other allocators.
I'm curious, how did you evaluate this and what problem did you
encounter that this might help?
Thanks
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
On Fri, 27 Nov 2009, KOSAKI Motohiro wrote:
> patch series, but Christoph mentioned simple bypass pcp instead.
> I made it. I'd hear Christoph and Mel's mention.
Ah. good.
> + kmemcheck_free_shadow(page, 0);
> +
> + if (PageAnon(page))
> + page->mapping = NULL;
> + if (free_pages_check(page)) {
> + /* orphan this page. */
> + list_del(&page->lru);
> + continue;
> + }
> + if (!PageHighMem(page)) {
> + debug_check_no_locks_freed(page_address(page),
> + PAGE_SIZE);
> + debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
> + }
> + arch_free_page(page, 0);
> + kernel_map_pages(page, 1, 0);
> +
> + local_irq_save(flags);
> + if (unlikely(wasMlocked))
> + free_page_mlock(page);
> + local_irq_restore(flags);
The above looks like it should be generic logic that is used elsewhere?
Create a common function?
Rest looks good to me...
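For illustration, a rough sketch of the kind of common helper being
suggested (name and exact factoring are hypothetical); it gathers the
per-page preparation steps that free_pages_bulk() currently duplicates,
leaving the irq-sensitive free_page_mlock() handling to the callers:
	static bool free_page_prepare(struct page *page, int *wasMlocked)
	{
		*wasMlocked = __TestClearPageMlocked(page);
		kmemcheck_free_shadow(page, 0);
		if (PageAnon(page))
			page->mapping = NULL;
		if (free_pages_check(page))
			return false;	/* bad page: caller orphans it */
		if (!PageHighMem(page)) {
			debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
			debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
		}
		arch_free_page(page, 0);
		kernel_map_pages(page, 1, 0);
		return true;
	}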
Hi
Sorry for the delayed reply. I've been stuck on Larry's serious bug report for a while.
> On Fri, Nov 27, 2009 at 09:23:57AM +0900, KOSAKI Motohiro wrote:
> >
> > note: Last year, Andy Whitcroft reported pcp prevent to make contenious
> > high order page when lumpy reclaim is running.
>
> I don't remember the specifics of the discussion but I know that when
> that patch series was being prototyped, it was because order-0
> allocations were racing with lumpy reclaimers. A lumpy reclaim might
> free up an order-9 page say but while it was freeing, an order-0 page
> would be allocated from the middle. It wasn't the PCP lists as such that
> were a problem once they were getting drained as part of a high-order
> allocation attempt. It would be just as bad if the order-0 page was
> taken from the buddy lists.
Hm, probably I have to update my patch description.
If we use pagevec_free(), the batch size is PAGEVEC_SIZE (= 14), so an
order-9 lumpy reclaim ends up doing 37 pagevec_free() calls. That leaves
lots of temporarily non-contiguous memory blocks around and gives order-0
allocator tasks a chance to steal from them.
This patch frees all the reclaimed pages to the buddy allocator at once.
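For reference, the arithmetic behind the 37 figure (it assumes
PAGEVEC_SIZE == 14, as noted above):
	/*
	 * order-9 block          : 1 << 9                = 512 pages
	 * pagevec flushes needed : DIV_ROUND_UP(512, 14) = 37
	 *
	 * i.e. the buddy allocator gets the block back in 37 small
	 * installments, each of which can race with order-0 allocations.
	 */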
> > He posted "capture pages freed during direct reclaim for allocation by the reclaimer"
> > patch series, but Christoph mentioned simple bypass pcp instead.
> > I made it. I'd hear Christoph and Mel's mention.
> >
> > ==========================
> > Currently vmscan free unused pages by __pagevec_free(). It mean free pages one by one
> > and use pcp. it makes two suboptimal result.
> >
> > - The another task can steal the freed page in pcp easily. it decrease
> > lumpy reclaim worth.
> > - To pollute pcp cache, vmscan freed pages might kick out cache hot
> > pages from pcp.
> >
>
> The latter point is interesting.
Thank you.
> > This patch make new free_pages_bulk() function and vmscan use it.
> >
> > Signed-off-by: KOSAKI Motohiro <[email protected]>
> > ---
> > include/linux/gfp.h | 2 +
> > mm/page_alloc.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++
> > mm/vmscan.c | 23 +++++++++++----------
> > 3 files changed, 70 insertions(+), 11 deletions(-)
> >
> > diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> > index f53e9b8..403584d 100644
> > --- a/include/linux/gfp.h
> > +++ b/include/linux/gfp.h
> > @@ -330,6 +330,8 @@ extern void free_hot_page(struct page *page);
> > #define __free_page(page) __free_pages((page), 0)
> > #define free_page(addr) free_pages((addr),0)
> >
> > +void free_pages_bulk(struct zone *zone, int count, struct list_head *list);
> > +
> > void page_alloc_init(void);
> > void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
> > void drain_all_pages(void);
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 11ae66e..f77f8a8 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -2037,6 +2037,62 @@ void free_pages(unsigned long addr, unsigned int order)
> >
> > EXPORT_SYMBOL(free_pages);
> >
> > +/*
> > + * Frees a number of pages from the list
> > + * Assumes all pages on list are in same zone and order==0.
> > + * count is the number of pages to free.
> > + *
> > + * This is similar to __pagevec_free(), but receive list instead pagevec.
> > + * and this don't use pcp cache. it is good characteristics for vmscan.
> > + */
> > +void free_pages_bulk(struct zone *zone, int count, struct list_head *list)
> > +{
> > + unsigned long flags;
> > + struct page *page;
> > + struct page *page2;
> > +
> > + list_for_each_entry_safe(page, page2, list, lru) {
> > + int wasMlocked = __TestClearPageMlocked(page);
> > +
> > + kmemcheck_free_shadow(page, 0);
> > +
> > + if (PageAnon(page))
> > + page->mapping = NULL;
> > + if (free_pages_check(page)) {
> > + /* orphan this page. */
> > + list_del(&page->lru);
> > + continue;
> > + }
> > + if (!PageHighMem(page)) {
> > + debug_check_no_locks_freed(page_address(page),
> > + PAGE_SIZE);
> > + debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
> > + }
> > + arch_free_page(page, 0);
> > + kernel_map_pages(page, 1, 0);
> > +
> > + local_irq_save(flags);
> > + if (unlikely(wasMlocked))
> > + free_page_mlock(page);
> > + local_irq_restore(flags);
> > + }
> > +
> > + spin_lock_irqsave(&zone->lock, flags);
> > + __count_vm_events(PGFREE, count);
> > + zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
> > + zone->pages_scanned = 0;
> > +
> > + __mod_zone_page_state(zone, NR_FREE_PAGES, count);
> > +
> > + list_for_each_entry_safe(page, page2, list, lru) {
> > + /* have to delete it as __free_one_page list manipulates */
> > + list_del(&page->lru);
> > + trace_mm_page_free_direct(page, 0);
> > + __free_one_page(page, zone, 0, page_private(page));
> > + }
> > + spin_unlock_irqrestore(&zone->lock, flags);
> > +}
>
> It would be preferable that the bulk free code would use as much of the
> existing free logic in the page allocator as possible. This is making a
> lot of checks that are done elsewhere. As this is an RFC, it's not
> critical but worth bearing in mind.
Sure, I need to consolidate the common code. Thanks.
> > +
> > /**
> > * alloc_pages_exact - allocate an exact number physically-contiguous pages.
> > * @size: the number of bytes to allocate
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 56faefb..00156f2 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -598,18 +598,17 @@ redo:
> > * shrink_page_list() returns the number of reclaimed pages
> > */
> > static unsigned long shrink_page_list(struct list_head *page_list,
> > + struct list_head *freed_pages_list,
> > struct scan_control *sc,
>
> Should the freed_pages_list be part of scan_control?
OK.
>
> > enum pageout_io sync_writeback)
> > {
> > LIST_HEAD(ret_pages);
> > - struct pagevec freed_pvec;
> > int pgactivate = 0;
> > unsigned long nr_reclaimed = 0;
> > unsigned long vm_flags;
> >
> > cond_resched();
> >
> > - pagevec_init(&freed_pvec, 1);
> > while (!list_empty(page_list)) {
> > struct address_space *mapping;
> > struct page *page;
> > @@ -785,10 +784,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
> > __clear_page_locked(page);
> > free_it:
> > nr_reclaimed++;
> > - if (!pagevec_add(&freed_pvec, page)) {
> > - __pagevec_free(&freed_pvec);
> > - pagevec_reinit(&freed_pvec);
> > - }
> > + list_add(&page->lru, freed_pages_list);
> > continue;
> >
> > cull_mlocked:
> > @@ -812,8 +808,6 @@ keep:
> > VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
> > }
> > list_splice(&ret_pages, page_list);
> > - if (pagevec_count(&freed_pvec))
> > - __pagevec_free(&freed_pvec);
> > count_vm_events(PGACTIVATE, pgactivate);
> > return nr_reclaimed;
> > }
> > @@ -1100,6 +1094,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
> > int priority, int file)
> > {
> > LIST_HEAD(page_list);
> > + LIST_HEAD(freed_pages_list);
> > struct pagevec pvec;
> > unsigned long nr_scanned;
> > unsigned long nr_reclaimed = 0;
> > @@ -1174,7 +1169,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
> >
> > spin_unlock_irq(&zone->lru_lock);
> >
> > - nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
> > + nr_reclaimed = shrink_page_list(&page_list, &freed_pages_list, sc,
> > + PAGEOUT_IO_ASYNC);
> >
> > /*
> > * If we are direct reclaiming for contiguous pages and we do
> > @@ -1192,10 +1188,15 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
> > nr_active = clear_active_flags(&page_list, count);
> > count_vm_events(PGDEACTIVATE, nr_active);
> >
> > - nr_reclaimed += shrink_page_list(&page_list, sc,
> > - PAGEOUT_IO_SYNC);
> > + nr_reclaimed += shrink_page_list(&page_list, &freed_pages_list,
> > + sc, PAGEOUT_IO_SYNC);
> > }
> >
> > + /*
> > + * Free unused pages.
> > + */
> > + free_pages_bulk(zone, nr_reclaimed, &freed_pages_list);
> > +
> > local_irq_disable();
> > if (current_is_kswapd())
> > __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
>
> This patch does not stand-alone so it's not easy to test. I'll think about
> the idea more although I do see how it might help slightly in the same way
> capture-reclaim did by closing the race window with other allocators.
>
> I'm curious, how did you evaluate this and what problem did you
> encounter that this might help?
Honestly, I haven't evaluated it yet. I only checked that changing the
locking scheme didn't hurt reclaim throughput under light VM pressure. I
probably need to contact Andy and test his original problem workload.
By the way, if you have a good high-order allocation workload, could you
please tell me about it?
On Wed, Dec 02, 2009 at 04:15:37PM +0900, KOSAKI Motohiro wrote:
> Hi
>
> sorry for the delayed reply. I've got stucked in Larry's serious bug report awhile.
>
No worries. I am slow to respond at the best of times.
> > On Fri, Nov 27, 2009 at 09:23:57AM +0900, KOSAKI Motohiro wrote:
> > >
> > > note: Last year, Andy Whitcroft reported pcp prevent to make contenious
> > > high order page when lumpy reclaim is running.
> >
> > I don't remember the specifics of the discussion but I know that when
> > that patch series was being prototyped, it was because order-0
> > allocations were racing with lumpy reclaimers. A lumpy reclaim might
> > free up an order-9 page say but while it was freeing, an order-0 page
> > would be allocated from the middle. It wasn't the PCP lists as such that
> > were a problem once they were getting drained as part of a high-order
> > allocation attempt. It would be just as bad if the order-0 page was
> > taken from the buddy lists.
>
> Hm, probably I have to update my patch description.
> if we use pavevec_free(), batch size is PAGEVEC_SIZE(=14).
> then, order-9 lumpy reclaim makes 37 times pagevec_free(). it makes lots
> temporary uncontenious memory block and the chance of stealing it from
> order-0 allocator task.
>
Very true. It opens a wide window during which other allocation requests
can race with the lumpy reclaimer and undo its work.
> This patch free all reclaimed pages at once to buddy.
>
Which is good. It reduces the window during which trouble can happen
considerably.
> > > He posted "capture pages freed during direct reclaim for allocation by the reclaimer"
> > > patch series, but Christoph mentioned simple bypass pcp instead.
> > > I made it. I'd hear Christoph and Mel's mention.
> > >
> > > ==========================
> > > Currently vmscan free unused pages by __pagevec_free(). It mean free pages one by one
> > > and use pcp. it makes two suboptimal result.
> > >
> > > - The another task can steal the freed page in pcp easily. it decrease
> > > lumpy reclaim worth.
> > > - To pollute pcp cache, vmscan freed pages might kick out cache hot
> > > pages from pcp.
> > >
> >
> > The latter point is interesting.
>
> Thank you.
>
Another point is that lumpy reclaim releasing pages via the PCP means that
a part of the contiguous page is "stuck" in the PCP lists. This is currently
worked around by doing a drain_all_pages() for high-order allocation
requests that are failing. I suspect that your patch will reduce the number
of times the PCP lists are drained.
> > > This patch make new free_pages_bulk() function and vmscan use it.
> > >
> > > Signed-off-by: KOSAKI Motohiro <[email protected]>
> > > ---
> > > include/linux/gfp.h | 2 +
> > > mm/page_alloc.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++
> > > mm/vmscan.c | 23 +++++++++++----------
> > > 3 files changed, 70 insertions(+), 11 deletions(-)
> > >
> > > diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> > > index f53e9b8..403584d 100644
> > > --- a/include/linux/gfp.h
> > > +++ b/include/linux/gfp.h
> > > @@ -330,6 +330,8 @@ extern void free_hot_page(struct page *page);
> > > #define __free_page(page) __free_pages((page), 0)
> > > #define free_page(addr) free_pages((addr),0)
> > >
> > > +void free_pages_bulk(struct zone *zone, int count, struct list_head *list);
> > > +
> > > void page_alloc_init(void);
> > > void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
> > > void drain_all_pages(void);
> > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > index 11ae66e..f77f8a8 100644
> > > --- a/mm/page_alloc.c
> > > +++ b/mm/page_alloc.c
> > > @@ -2037,6 +2037,62 @@ void free_pages(unsigned long addr, unsigned int order)
> > >
> > > EXPORT_SYMBOL(free_pages);
> > >
> > > +/*
> > > + * Frees a number of pages from the list
> > > + * Assumes all pages on list are in same zone and order==0.
> > > + * count is the number of pages to free.
> > > + *
> > > + * This is similar to __pagevec_free(), but receive list instead pagevec.
> > > + * and this don't use pcp cache. it is good characteristics for vmscan.
> > > + */
> > > +void free_pages_bulk(struct zone *zone, int count, struct list_head *list)
> > > +{
> > > + unsigned long flags;
> > > + struct page *page;
> > > + struct page *page2;
> > > +
> > > + list_for_each_entry_safe(page, page2, list, lru) {
> > > + int wasMlocked = __TestClearPageMlocked(page);
> > > +
> > > + kmemcheck_free_shadow(page, 0);
> > > +
> > > + if (PageAnon(page))
> > > + page->mapping = NULL;
> > > + if (free_pages_check(page)) {
> > > + /* orphan this page. */
> > > + list_del(&page->lru);
> > > + continue;
> > > + }
> > > + if (!PageHighMem(page)) {
> > > + debug_check_no_locks_freed(page_address(page),
> > > + PAGE_SIZE);
> > > + debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
> > > + }
> > > + arch_free_page(page, 0);
> > > + kernel_map_pages(page, 1, 0);
> > > +
> > > + local_irq_save(flags);
> > > + if (unlikely(wasMlocked))
> > > + free_page_mlock(page);
> > > + local_irq_restore(flags);
> > > + }
> > > +
> > > + spin_lock_irqsave(&zone->lock, flags);
> > > + __count_vm_events(PGFREE, count);
> > > + zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
> > > + zone->pages_scanned = 0;
> > > +
> > > + __mod_zone_page_state(zone, NR_FREE_PAGES, count);
> > > +
> > > + list_for_each_entry_safe(page, page2, list, lru) {
> > > + /* have to delete it as __free_one_page list manipulates */
> > > + list_del(&page->lru);
> > > + trace_mm_page_free_direct(page, 0);
> > > + __free_one_page(page, zone, 0, page_private(page));
> > > + }
> > > + spin_unlock_irqrestore(&zone->lock, flags);
> > > +}
> >
> > It would be preferable that the bulk free code would use as much of the
> > existing free logic in the page allocator as possible. This is making a
> > lot of checks that are done elsewhere. As this is an RFC, it's not
> > critical but worth bearing in mind.
>
> Sure. I have to merge common block. thanks.
>
>
> > > +
> > > /**
> > > * alloc_pages_exact - allocate an exact number physically-contiguous pages.
> > > * @size: the number of bytes to allocate
> > > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > > index 56faefb..00156f2 100644
> > > --- a/mm/vmscan.c
> > > +++ b/mm/vmscan.c
> > > @@ -598,18 +598,17 @@ redo:
> > > * shrink_page_list() returns the number of reclaimed pages
> > > */
> > > static unsigned long shrink_page_list(struct list_head *page_list,
> > > + struct list_head *freed_pages_list,
> > > struct scan_control *sc,
> >
> > Should the freed_pages_list be part of scan_control?
>
> OK.
>
> >
> > > enum pageout_io sync_writeback)
> > > {
> > > LIST_HEAD(ret_pages);
> > > - struct pagevec freed_pvec;
> > > int pgactivate = 0;
> > > unsigned long nr_reclaimed = 0;
> > > unsigned long vm_flags;
> > >
> > > cond_resched();
> > >
> > > - pagevec_init(&freed_pvec, 1);
> > > while (!list_empty(page_list)) {
> > > struct address_space *mapping;
> > > struct page *page;
> > > @@ -785,10 +784,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
> > > __clear_page_locked(page);
> > > free_it:
> > > nr_reclaimed++;
> > > - if (!pagevec_add(&freed_pvec, page)) {
> > > - __pagevec_free(&freed_pvec);
> > > - pagevec_reinit(&freed_pvec);
> > > - }
> > > + list_add(&page->lru, freed_pages_list);
> > > continue;
> > >
> > > cull_mlocked:
> > > @@ -812,8 +808,6 @@ keep:
> > > VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
> > > }
> > > list_splice(&ret_pages, page_list);
> > > - if (pagevec_count(&freed_pvec))
> > > - __pagevec_free(&freed_pvec);
> > > count_vm_events(PGACTIVATE, pgactivate);
> > > return nr_reclaimed;
> > > }
> > > @@ -1100,6 +1094,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
> > > int priority, int file)
> > > {
> > > LIST_HEAD(page_list);
> > > + LIST_HEAD(freed_pages_list);
> > > struct pagevec pvec;
> > > unsigned long nr_scanned;
> > > unsigned long nr_reclaimed = 0;
> > > @@ -1174,7 +1169,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
> > >
> > > spin_unlock_irq(&zone->lru_lock);
> > >
> > > - nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
> > > + nr_reclaimed = shrink_page_list(&page_list, &freed_pages_list, sc,
> > > + PAGEOUT_IO_ASYNC);
> > >
> > > /*
> > > * If we are direct reclaiming for contiguous pages and we do
> > > @@ -1192,10 +1188,15 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
> > > nr_active = clear_active_flags(&page_list, count);
> > > count_vm_events(PGDEACTIVATE, nr_active);
> > >
> > > - nr_reclaimed += shrink_page_list(&page_list, sc,
> > > - PAGEOUT_IO_SYNC);
> > > + nr_reclaimed += shrink_page_list(&page_list, &freed_pages_list,
> > > + sc, PAGEOUT_IO_SYNC);
> > > }
> > >
> > > + /*
> > > + * Free unused pages.
> > > + */
> > > + free_pages_bulk(zone, nr_reclaimed, &freed_pages_list);
> > > +
> > > local_irq_disable();
> > > if (current_is_kswapd())
> > > __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
> >
> > This patch does not stand-alone so it's not easy to test. I'll think about
> > the idea more although I do see how it might help slightly in the same way
> > capture-reclaim did by closing the race window with other allocators.
> >
> > I'm curious, how did you evaluate this and what problem did you
> > encounter that this might help?
>
> Honestly I didn't it yet. I only tested changing locking scheme didn't cause
> reclaim throughput under light VM pressure. Probably I have to contact
> Andy and test his original problem workload.
>
> btw, if you have good high order allocation workload, can you please tell me it?
>
For the most part, he was using the same tests as I was using for the
anti-fragmentation patches - high-order allocation requests under a heavy
compile-load. The expectation was that capture-based reclaim would increase
success rates and reduce latencies.
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
> This patch depend on "vmscan : simplify code" patch (written by Huang Shijie)
>
> =========================================
> Now, max_scan of shrink_inactive_list() is always passed less than
> SWAP_CLUSTER_MAX. then, we can remove scanning pages loop in it.
>
> detail
> - remove "while (nr_scanned < max_scan)" loop
> - remove nr_freed variable (now, we use nr_reclaimed directly)
> - remove nr_scan variable (now, we use nr_scanned directly)
> - rename max_scan to nr_to_scan
> - pass nr_to_scan into isolate_pages() directly instead
> using SWAP_CLUSTER_MAX
Andrew, please don't pick up this patch series for a while. Some vmscan
folks and I are working on fixing Larry's AIM7 problem; it is more important
than this and might conflict with it.
I'll rebase this patch series later.
Hi KOSAKI,
It would be easier to review if you could split the real changes from the
big "unshift" :)
Otherwise it looks good to me. (actually I've advocated it before ;)
Thanks,
Fengguang
On Fri, Nov 27, 2009 at 09:17:48AM +0900, KOSAKI Motohiro wrote:
> This patch depend on "vmscan : simplify code" patch (written by Huang Shijie)
>
> =========================================
> Now, max_scan of shrink_inactive_list() is always passed less than
> SWAP_CLUSTER_MAX. then, we can remove scanning pages loop in it.
>
> detail
> - remove "while (nr_scanned < max_scan)" loop
> - remove nr_freed variable (now, we use nr_reclaimed directly)
> - remove nr_scan variable (now, we use nr_scanned directly)
> - rename max_scan to nr_to_scan
> - pass nr_to_scan into isolate_pages() directly instead
> using SWAP_CLUSTER_MAX
>
> Signed-off-by: KOSAKI Motohiro <[email protected]>
> ---
> mm/vmscan.c | 194 ++++++++++++++++++++++++++++-------------------------------
> 1 files changed, 91 insertions(+), 103 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 7536991..a58ff15 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1085,16 +1085,22 @@ static int too_many_isolated(struct zone *zone, int file,
> * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
> * of reclaimed pages
> */
> -static unsigned long shrink_inactive_list(unsigned long max_scan,
> - struct zone *zone, struct scan_control *sc,
> - int priority, int file)
> +static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
> + struct zone *zone, struct scan_control *sc,
> + int priority, int file)
> {
> LIST_HEAD(page_list);
> struct pagevec pvec;
> - unsigned long nr_scanned = 0;
> + unsigned long nr_scanned;
> unsigned long nr_reclaimed = 0;
> struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
> int lumpy_reclaim = 0;
> + struct page *page;
> + unsigned long nr_taken;
> + unsigned long nr_active;
> + unsigned int count[NR_LRU_LISTS] = { 0, };
> + unsigned long nr_anon;
> + unsigned long nr_file;
>
> while (unlikely(too_many_isolated(zone, file, sc))) {
> congestion_wait(BLK_RW_ASYNC, HZ/10);
> @@ -1120,119 +1126,101 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
>
> lru_add_drain();
> spin_lock_irq(&zone->lru_lock);
> - do {
> - struct page *page;
> - unsigned long nr_taken;
> - unsigned long nr_scan;
> - unsigned long nr_freed;
> - unsigned long nr_active;
> - unsigned int count[NR_LRU_LISTS] = { 0, };
> - int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE;
> - unsigned long nr_anon;
> - unsigned long nr_file;
> -
> - nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX,
> - &page_list, &nr_scan, sc->order, mode,
> - zone, sc->mem_cgroup, 0, file);
> + nr_taken = sc->isolate_pages(nr_to_scan,
> + &page_list, &nr_scanned, sc->order,
> + lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE,
> + zone, sc->mem_cgroup, 0, file);
>
> - if (scanning_global_lru(sc)) {
> - zone->pages_scanned += nr_scan;
> - if (current_is_kswapd())
> - __count_zone_vm_events(PGSCAN_KSWAPD, zone,
> - nr_scan);
> - else
> - __count_zone_vm_events(PGSCAN_DIRECT, zone,
> - nr_scan);
> - }
> + if (scanning_global_lru(sc)) {
> + zone->pages_scanned += nr_scanned;
> + if (current_is_kswapd())
> + __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
> + else
> + __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
> + }
>
> - if (nr_taken == 0)
> - goto done;
> + if (nr_taken == 0)
> + goto done;
>
> - nr_active = clear_active_flags(&page_list, count);
> - __count_vm_events(PGDEACTIVATE, nr_active);
> + nr_active = clear_active_flags(&page_list, count);
> + __count_vm_events(PGDEACTIVATE, nr_active);
>
> - __mod_zone_page_state(zone, NR_ACTIVE_FILE,
> - -count[LRU_ACTIVE_FILE]);
> - __mod_zone_page_state(zone, NR_INACTIVE_FILE,
> - -count[LRU_INACTIVE_FILE]);
> - __mod_zone_page_state(zone, NR_ACTIVE_ANON,
> - -count[LRU_ACTIVE_ANON]);
> - __mod_zone_page_state(zone, NR_INACTIVE_ANON,
> - -count[LRU_INACTIVE_ANON]);
> + __mod_zone_page_state(zone, NR_ACTIVE_FILE,
> + -count[LRU_ACTIVE_FILE]);
> + __mod_zone_page_state(zone, NR_INACTIVE_FILE,
> + -count[LRU_INACTIVE_FILE]);
> + __mod_zone_page_state(zone, NR_ACTIVE_ANON,
> + -count[LRU_ACTIVE_ANON]);
> + __mod_zone_page_state(zone, NR_INACTIVE_ANON,
> + -count[LRU_INACTIVE_ANON]);
>
> - nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
> - nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
> - __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
> - __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
> + nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
> + nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
> + __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
> + __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
>
> - reclaim_stat->recent_scanned[0] += nr_anon;
> - reclaim_stat->recent_scanned[1] += nr_file;
> + reclaim_stat->recent_scanned[0] += nr_anon;
> + reclaim_stat->recent_scanned[1] += nr_file;
>
> - spin_unlock_irq(&zone->lru_lock);
> + spin_unlock_irq(&zone->lru_lock);
>
> - nr_scanned += nr_scan;
> - nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
> + nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
> +
> + /*
> + * If we are direct reclaiming for contiguous pages and we do
> + * not reclaim everything in the list, try again and wait
> + * for IO to complete. This will stall high-order allocations
> + * but that should be acceptable to the caller
> + */
> + if (nr_reclaimed < nr_taken && !current_is_kswapd() && lumpy_reclaim) {
> + congestion_wait(BLK_RW_ASYNC, HZ/10);
>
> /*
> - * If we are direct reclaiming for contiguous pages and we do
> - * not reclaim everything in the list, try again and wait
> - * for IO to complete. This will stall high-order allocations
> - * but that should be acceptable to the caller
> + * The attempt at page out may have made some
> + * of the pages active, mark them inactive again.
> */
> - if (nr_freed < nr_taken && !current_is_kswapd() &&
> - lumpy_reclaim) {
> - congestion_wait(BLK_RW_ASYNC, HZ/10);
> -
> - /*
> - * The attempt at page out may have made some
> - * of the pages active, mark them inactive again.
> - */
> - nr_active = clear_active_flags(&page_list, count);
> - count_vm_events(PGDEACTIVATE, nr_active);
> -
> - nr_freed += shrink_page_list(&page_list, sc,
> - PAGEOUT_IO_SYNC);
> - }
> + nr_active = clear_active_flags(&page_list, count);
> + count_vm_events(PGDEACTIVATE, nr_active);
>
> - nr_reclaimed += nr_freed;
> + nr_reclaimed += shrink_page_list(&page_list, sc,
> + PAGEOUT_IO_SYNC);
> + }
>
> - local_irq_disable();
> - if (current_is_kswapd())
> - __count_vm_events(KSWAPD_STEAL, nr_freed);
> - __count_zone_vm_events(PGSTEAL, zone, nr_freed);
> + local_irq_disable();
> + if (current_is_kswapd())
> + __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
> + __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
>
> - spin_lock(&zone->lru_lock);
> - /*
> - * Put back any unfreeable pages.
> - */
> - while (!list_empty(&page_list)) {
> - int lru;
> - page = lru_to_page(&page_list);
> - VM_BUG_ON(PageLRU(page));
> - list_del(&page->lru);
> - if (unlikely(!page_evictable(page, NULL))) {
> - spin_unlock_irq(&zone->lru_lock);
> - putback_lru_page(page);
> - spin_lock_irq(&zone->lru_lock);
> - continue;
> - }
> - SetPageLRU(page);
> - lru = page_lru(page);
> - add_page_to_lru_list(zone, page, lru);
> - if (is_active_lru(lru)) {
> - int file = is_file_lru(lru);
> - reclaim_stat->recent_rotated[file]++;
> - }
> - if (!pagevec_add(&pvec, page)) {
> - spin_unlock_irq(&zone->lru_lock);
> - __pagevec_release(&pvec);
> - spin_lock_irq(&zone->lru_lock);
> - }
> + spin_lock(&zone->lru_lock);
> + /*
> + * Put back any unfreeable pages.
> + */
> + while (!list_empty(&page_list)) {
> + int lru;
> + page = lru_to_page(&page_list);
> + VM_BUG_ON(PageLRU(page));
> + list_del(&page->lru);
> + if (unlikely(!page_evictable(page, NULL))) {
> + spin_unlock_irq(&zone->lru_lock);
> + putback_lru_page(page);
> + spin_lock_irq(&zone->lru_lock);
> + continue;
> }
> - __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
> - __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
> -
> - } while (nr_scanned < max_scan);
> + SetPageLRU(page);
> + lru = page_lru(page);
> + add_page_to_lru_list(zone, page, lru);
> + if (is_active_lru(lru)) {
> + int file = is_file_lru(lru);
> + reclaim_stat->recent_rotated[file]++;
> + }
> + if (!pagevec_add(&pvec, page)) {
> + spin_unlock_irq(&zone->lru_lock);
> + __pagevec_release(&pvec);
> + spin_lock_irq(&zone->lru_lock);
> + }
> + }
> + __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
> + __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
>
> done:
> spin_unlock_irq(&zone->lru_lock);
> --
> 1.6.5.2
>
>
>
> - pass nr_to_scan into isolate_pages() directly instead
> using SWAP_CLUSTER_MAX
This patch will make sure nr_to_scan==SWAP_CLUSTER_MAX :)
Thanks,
Fengguang
---
vmscan: aligned scan batching
Make sure ->isolate_pages() always scans in units of SWAP_CLUSTER_MAX.
CC: Rik van Riel <[email protected]>
CC: KOSAKI Motohiro <[email protected]>
Signed-off-by: Wu Fengguang <[email protected]>
---
mm/vmscan.c | 20 ++++++--------------
1 file changed, 6 insertions(+), 14 deletions(-)
--- linux-mm.orig/mm/vmscan.c 2009-12-06 13:13:28.000000000 +0800
+++ linux-mm/mm/vmscan.c 2009-12-06 13:31:21.000000000 +0800
@@ -1572,15 +1572,11 @@ static void get_scan_ratio(struct zone *
static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
unsigned long *nr_saved_scan)
{
- unsigned long nr;
+ unsigned long nr = *nr_saved_scan + nr_to_scan;
+ unsigned long rem = nr & (SWAP_CLUSTER_MAX - 1);

- *nr_saved_scan += nr_to_scan;
- nr = *nr_saved_scan;
-
- if (nr >= SWAP_CLUSTER_MAX)
- *nr_saved_scan = 0;
- else
- nr = 0;
+ *nr_saved_scan = rem;
+ nr -= rem;

return nr;
}
@@ -1592,7 +1588,6 @@ static void shrink_zone(int priority, st
struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
- unsigned long nr_to_scan;
unsigned long percent[2]; /* anon @ 0; file @ 1 */
enum lru_list l;
unsigned long nr_reclaimed = sc->nr_reclaimed;
@@ -1625,11 +1620,8 @@ static void shrink_zone(int priority, st
nr[LRU_INACTIVE_FILE]) {
for_each_evictable_lru(l) {
if (nr[l]) {
- nr_to_scan = min_t(unsigned long,
- nr[l], SWAP_CLUSTER_MAX);
- nr[l] -= nr_to_scan;
-
- nr_reclaimed += shrink_list(l, nr_to_scan,
+ nr[l] -= SWAP_CLUSTER_MAX;
+ nr_reclaimed += shrink_list(l, SWAP_CLUSTER_MAX,
zone, sc, priority);
}
}
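To make the new rounding concrete, here is the function above dropped into a
small userspace harness (not kernel code; it assumes SWAP_CLUSTER_MAX has its
usual power-of-two value of 32, which the bitmask relies on). Every non-zero
return is a whole multiple of SWAP_CLUSTER_MAX, and anything smaller is
carried over in *nr_saved_scan for the next call:

	/* userspace sketch of the aligned batching above */
	#include <stdio.h>

	#define SWAP_CLUSTER_MAX 32UL

	static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
					       unsigned long *nr_saved_scan)
	{
		unsigned long nr = *nr_saved_scan + nr_to_scan;
		unsigned long rem = nr & (SWAP_CLUSTER_MAX - 1);

		*nr_saved_scan = rem;	/* carry the sub-batch remainder */
		nr -= rem;		/* hand out whole batches only */
		return nr;
	}

	int main(void)
	{
		unsigned long saved = 0;
		unsigned long reqs[] = { 10, 10, 10, 50, 3 };
		unsigned int i;

		for (i = 0; i < sizeof(reqs) / sizeof(reqs[0]); i++) {
			unsigned long batch = nr_scan_try_batch(reqs[i], &saved);

			printf("request %2lu -> scan %2lu (saved %2lu)\n",
			       reqs[i], batch, saved);
		}
		return 0;
	}

With those requests the first three calls return 0 while *nr_saved_scan
accumulates to 30; the fourth returns 64 (two full batches) and carries 16
over; the last returns 0 again with 19 saved.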
> Hi KOSAKI,
>
> It would be easier to review if you can split the changes and the big
> "unshift" :)
Umm. I don't like breaking the checkpatch.pl rules ;)
If you have enough time, can you please use diff -b yourself?
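(For reference, one way to do that: regenerate the diff with whitespace
changes ignored, which shrinks the non-indentation part considerably. For
example, assuming the patch is applied as the top commit of a local branch:

	git diff -b HEAD~1 -- mm/vmscan.c

Plain GNU diff's -b option does the same for two on-disk copies of the file.)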
> Otherwise it looks good to me. (actually I've advocated it before ;)
>
> Thanks,
> Fengguang
>
> On Fri, Nov 27, 2009 at 09:17:48AM +0900, KOSAKI Motohiro wrote:
> > - pass nr_to_scan into isolate_pages() directly instead
> > using SWAP_CLUSTER_MAX
>
> This patch will make sure nr_to_scan==SWAP_CLUSTER_MAX :)
>
> Thanks,
> Fengguang
> ---
> vmscan: aligned scan batching
>
> Make sure ->isolate_pages() always scans in units of SWAP_CLUSTER_MAX.
>
> CC: Rik van Riel <[email protected]>
> CC: KOSAKI Motohiro <[email protected]>
> Signed-off-by: Wu Fengguang <[email protected]>
> ---
> mm/vmscan.c | 20 ++++++--------------
> 1 file changed, 6 insertions(+), 14 deletions(-)
Hm, your patch always passes SWAP_CLUSTER_MAX to shrink_list().
Can we remove the nr_to_scan argument of shrink_list() completely?
Anyway, the diffstat itself explains this patch's worth. I'll queue
this patch into my vmscan cleanup branch.
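Purely to illustrate that question, a rough sketch, not a patch (the
signature below is hypothetical, not the current mm/vmscan.c one), of what
the shrink_zone() call site could look like once the argument is gone:

	for_each_evictable_lru(l) {
		if (nr[l]) {
			nr[l] -= SWAP_CLUSTER_MAX;
			/* hypothetical shrink_list() with no nr_to_scan;
			 * the SWAP_CLUSTER_MAX batch size becomes implicit */
			nr_reclaimed += shrink_list(l, zone, sc, priority);
		}
	}

shrink_list() would then pass SWAP_CLUSTER_MAX down to
shrink_inactive_list()/shrink_active_list() itself.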