2006-09-08 12:27:51

by Andy Whitcroft

Subject: [PATCH 5/5] linear reclaim core

linear reclaim core

When we are out of memory of a suitable size we enter reclaim.
The current reclaim algorithm targets pages in LRU order, which
is great for fairness but highly unsuitable if you desire pages at
higher orders. To get pages of higher order we must shoot down a
very high proportion of memory; >95% in a lot of cases.

This patch introduces an alternative algorithm used when requesting
higher order allocations. Here we look at memory in ranges at the
order requested. We make a quick pass to see if all pages in that
area are likely to be reclaimable; only then do we apply reclaim to
the pages in the area.
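
In outline, that path does something like the sketch below (illustrative
only; block_looks_reclaimable() and reclaim_block() are placeholder names
standing in for the quick pass and the isolate/shrink pass in the patch):

	/* walk the zone in order-sized, order-aligned blocks */
	for (pfn = zone->zone_start_pfn;
	     pfn < zone->zone_start_pfn + zone->spanned_pages;
	     pfn += 1UL << order) {
		/* quick pass: is every page in this block likely freeable? */
		if (!block_looks_reclaimable(zone, pfn, order))
			continue;
		/* only then pay for isolating and shrinking its pages */
		nr_reclaimed += reclaim_block(zone, pfn, order, sc);
	}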

Testing in combination with fragmentation avoidance shows
significantly improved chances of a successful allocation at
higher order.

Added in: V1

Issues:
o no longer need to be MAX_ORDER aligned with new iterators

Signed-off-by: Andy Whitcroft <[email protected]>
---
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8c09638..83cafb9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -199,6 +199,7 @@ #endif
unsigned long nr_inactive;
unsigned long pages_scanned; /* since last reclaim */
int all_unreclaimable; /* All pages pinned */
+ unsigned long linear_scan_hand;

/* The accumulated number of activities that may cause page aging,
* that is, make some pages closer to the tail of inactive_list.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 825a433..9c71a2d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2049,6 +2049,7 @@ __meminit int init_currently_empty_zone(
pgdat->nr_zones = zone_idx(zone) + 1;

zone->zone_start_pfn = zone_start_pfn;
+ zone->linear_scan_hand = zone_start_pfn;

memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4a72976..a88061d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -974,6 +974,9 @@ static unsigned long shrink_zones(int pr
return nr_reclaimed;
}

+static unsigned long pressure_zones(int, struct zone **, int,
+ struct scan_control *);
+
/*
* This is the main entry point to direct page reclaim.
*
@@ -1021,7 +1024,11 @@ unsigned long try_to_free_pages(struct z
sc.nr_scanned = 0;
if (!priority)
disable_swap_token();
- nr_reclaimed += shrink_zones(priority, zones, &sc);
+ if (order <= 3)
+ nr_reclaimed += shrink_zones(priority, zones, &sc);
+ else
+ nr_reclaimed += pressure_zones(priority, zones,
+ order, &sc);
shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
if (reclaim_state) {
nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -1685,3 +1692,294 @@ int zone_reclaim(struct zone *zone, gfp_
return __zone_reclaim(zone, gfp_mask, order);
}
#endif
+
+static int page_likely_reclaimable(struct page *page)
+{
+ if (PageLocked(page))
+ return 0;
+ if (PageReserved(page))
+ return 0;
+ if (PageSlab(page))
+ return 0;
+ if (PageCompound(page))
+ return 0;
+ return 1;
+}
+
+static unsigned long shrink_linear_pages(struct list_head *inactive,
+ struct list_head *active, struct scan_control *sc)
+{
+ unsigned long nr_free;
+
+ /*
+ * If the inactive pages free OK then it's worth
+ * attempting to free the active ones in this chunk.
+ */
+ nr_free = shrink_page_list(inactive, sc);
+ if (list_empty(inactive)) {
+ struct list_head *entry;
+ struct page *page;
+
+ list_for_each(entry, active) {
+ page = list_entry(entry, struct page, lru);
+ ClearPageActive(page);
+ }
+ nr_free += shrink_page_list(active, sc);
+ }
+
+ return nr_free;
+}
+
+/*
+ * Find the real first page in an area. This is either the real first
+ * page at pfn, or the free page which covers this area.
+ */
+static struct page *page_find_area_start(unsigned long pfn)
+{
+ int order;
+ int mask = 1;
+ struct page *orig_page = pfn_to_page(pfn);
+ struct page *page = orig_page;
+
+ for (order = 0; order < MAX_ORDER; order++, mask <<= 1) {
+ if (pfn_valid_within(pfn) && PageBuddy(page) &&
+ page_order(page) >= order)
+ return page;
+ if (pfn & mask) {
+ pfn &= ~mask;
+ page -= mask;
+ }
+ }
+
+ return orig_page;
+}
+
+/*
+ * Scan the pages indicated, pulling out any pages on the LRU
+ * onto a list of active and a list of inactive pages.
+ *
+ * zone->lru_lock must be held, with interrupts disabled.
+ */
+static unsigned long isolate_linear_pages(struct zone *zone,
+ unsigned long size, unsigned long pfn,
+ struct list_head *page_list, struct list_head *page_list_a)
+{
+ int nr_active = 0;
+ int nr_inactive = 0;
+ struct page *page;
+ struct page *start;
+ struct page *end;
+
+ start = page_find_area_start(pfn);
+ end = pfn_to_page(pfn) + size;
+ for (page = start; page < end; page++) {
+ if (!pfn_valid_within(pfn + (page - start)))
+ continue;
+ if (PageBuddy(page)) {
+ page += (1 << page_order(page)) - 1;
+ continue;
+ }
+ if (unlikely(PageCompound(page)))
+ continue;
+
+ if (PageLRU(page)) {
+ list_del(&page->lru);
+ if (!get_page_unless_zero(page)) {
+ /*
+ * It is being freed elsewhere, this
+ * is also good.
+ */
+ if (PageActive(page))
+ list_add(&page->lru,
+ &zone->active_list);
+ else
+ list_add(&page->lru,
+ &zone->inactive_list);
+ continue;
+ }
+ ClearPageLRU(page);
+ if (PageActive(page)) {
+ nr_active++;
+ list_add(&page->lru, page_list_a);
+ } else {
+ nr_inactive++;
+ list_add(&page->lru, page_list);
+ }
+ }
+ }
+ zone->nr_active -= nr_active;
+ zone->nr_inactive -= nr_inactive;
+ zone->pages_scanned += nr_inactive + nr_active;
+
+ return nr_inactive + nr_active;
+}
+
+/*
+ * Apply pressure to the zone to try and free pages at the specified order.
+ */
+static unsigned long pressure_zone(int priority, struct zone *zone, int order,
+ struct scan_control *sc)
+{
+ LIST_HEAD(page_list);
+ LIST_HEAD(page_list_a);
+ struct pagevec pvec;
+
+ unsigned long pfn;
+ unsigned long pfn_zone_end;
+ unsigned long pfn_scan_end;
+ unsigned long size = (1 << order);
+ unsigned long mask = (~0 << order);
+ unsigned long start;
+
+ unsigned long nr_to_scan = (zone->spanned_pages >> priority) + size;
+ unsigned long nr_reclaimed = 0;
+ long nr_seen;
+ long nr_free;
+ long nr_likely;
+ long nr_scan;
+
+ long blocks_scanned = 0;
+
+ pagevec_init(&pvec, 1);
+
+ lru_add_drain();
+ spin_lock_irq(&zone->lru_lock);
+
+ pfn_zone_end = zone->zone_start_pfn + zone->spanned_pages;
+ pfn_scan_end = pfn_zone_end;
+
+ /*
+ * Calculate our current start point. Note the buddy allocator
+ * merge algorithm means that a free block is marked at its low
+ * edge. This means we _must_ scan from low to high and start
+ * either at a MAX_ORDER boundary or the low end of the zone, otherwise
+ * we cannot determine if _this_ page is free. Also we want to
+ * start aligned with the requested block size.
+ */
+ start = zone->linear_scan_hand & mask;
+scan_wrap:
+ if (start < zone->zone_start_pfn)
+ start += size;
+
+ for (pfn = start; pfn < pfn_scan_end; pfn += size) {
+ struct page *page;
+ struct page *chunk_start;
+ struct page *chunk_end;
+
+ /*
+ * Handle memory models where we can have invalid areas
+ * within the zone. Note we are making use of the
+ * assumption that mem_map is contiguous out to
+ * MAX_ORDER to allow us to check just the start of the
+ * block.
+ */
+ if (!pfn_valid(pfn))
+ continue;
+
+ /*
+ * If we have scanned a reasonable number of blocks,
+ * and are on a MAX_ORDER boundary, we are in a reasonable
+ * position to stop.
+ */
+ if (blocks_scanned >= (nr_to_scan / size) &&
+ (pfn & ((1UL << (MAX_ORDER - 1)) - 1)) == 0)
+ break;
+
+ /* Do a quick evaluation pass over the area. */
+ nr_seen = nr_likely = 0;
+
+ chunk_start = page_find_area_start(pfn);
+ chunk_end = pfn_to_page(pfn) + size;
+ for (page = chunk_start; page < chunk_end; page++) {
+ if (!pfn_valid_within(pfn + (page - chunk_start))) {
+ nr_seen++;
+ continue;
+ }
+
+ if (PageBuddy(page)) {
+ page += (1 << page_order(page)) - 1;
+ continue;
+ }
+
+ if (page_likely_reclaimable(page))
+ nr_likely++;
+ nr_seen++;
+ }
+ if (nr_likely < nr_seen || nr_seen == 0)
+ continue;
+
+ /* Pull out any in use pages ready for freeing. */
+ nr_scan = isolate_linear_pages(zone, size, pfn, &page_list,
+ &page_list_a);
+ spin_unlock_irq(&zone->lru_lock);
+
+ if (nr_scan > 0) {
+ nr_free = shrink_linear_pages(&page_list,
+ &page_list_a, sc);
+ blocks_scanned++;
+ } else
+ nr_free = 0;
+
+ nr_reclaimed += nr_free;
+
+ /*
+ * shrink_linear_pages may have converted any of our pages
+ * to active. We therefore need to handle both lists the
+ * same. Merge them.
+ */
+ list_splice_init(&page_list_a, &page_list);
+
+ /* Update the accounting information. */
+ local_irq_disable();
+ if (current_is_kswapd()) {
+ __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
+ __count_vm_events(KSWAPD_STEAL, nr_free);
+ } else
+ __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
+ __count_vm_events(PGACTIVATE, nr_free);
+
+ spin_lock(&zone->lru_lock);
+
+ return_unfreeable_pages(&page_list, zone, &pvec);
+ }
+ if (pfn >= pfn_zone_end) {
+ pfn_scan_end = start;
+ start = zone->zone_start_pfn & mask;
+ goto scan_wrap;
+ }
+ zone->linear_scan_hand = pfn;
+
+ spin_unlock_irq(&zone->lru_lock);
+ pagevec_release(&pvec);
+
+ sc->nr_scanned += blocks_scanned * size;
+
+ return nr_reclaimed;
+}
+
+static unsigned long pressure_zones(int priority, struct zone **zones,
+ int order, struct scan_control *sc)
+{
+ unsigned long nr_reclaimed = 0;
+ int i;
+
+ for (i = 0; zones[i] != NULL; i++) {
+ struct zone *zone = zones[i];
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+ continue;
+
+ zone->temp_priority = priority;
+ if (zone->prev_priority > priority)
+ zone->prev_priority = priority;
+
+ if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ continue; /* Let kswapd poll it */
+
+ nr_reclaimed += pressure_zone(priority, zone, order, sc);
+ }
+ return nr_reclaimed;
+}


2006-09-08 18:41:35

by Andrew Morton

Subject: Re: [PATCH 5/5] linear reclaim core

On Fri, 8 Sep 2006 13:27:18 +0100
Andy Whitcroft <[email protected]> wrote:

> When we are out of memory of a suitable size we enter reclaim.
> The current reclaim algorithm targets pages in LRU order, which
> is great for fairness but highly unsuitable if you desire pages at
> higher orders. To get pages of higher order we must shoot down a
> very high proportion of memory; >95% in a lot of cases.
>
> This patch introduces an alternative algorithm used when requesting
> higher order allocations. Here we look at memory in ranges at the
> order requested. We make a quick pass to see if all pages in that
> area are likely to be reclaimed, only then do we apply reclaim to
> the pages in the area.
>
> Testing in combination with fragmentation avoidance shows
> significantly improved chances of a successful allocation at
> higher order.

I bet it does.

I'm somewhat surprised at the implementation. Would it not be sufficient
to do this within shrink_inactive_list()? Something along the lines of:

- Pick tail page off LRU.

- For all "neighbour" pages (alignment == 1<<order, count == 1<<order)

- If they're all PageLRU and !PageActive, add them all to page_list for
possible reclaim

And, in shrink_active_list:

- Pick tail page off LRU

- For all "neighbour" pages (alignment == 1<<order, count == 1<<order)

If they're all PageLRU, put all the active pages in this block onto
l_hold for possible deactivation.


Maybe all that can be done in isolate_lru_pages().
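
Roughly, inside isolate_lru_pages() that would look something like the
sketch below (illustrative only; the refcount handling the existing code
does with get_page_unless_zero()/ClearPageLRU() is elided):

	/* after taking "page" from the tail of the LRU */
	base_pfn = page_to_pfn(page) & ~((1UL << order) - 1);
	for (pfn = base_pfn; pfn < base_pfn + (1UL << order); pfn++) {
		struct page *cursor;

		if (!pfn_valid(pfn))
			break;
		cursor = pfn_to_page(pfn);
		if (cursor == page)
			continue;
		/* the inactive pass would also require !PageActive(cursor) */
		if (!PageLRU(cursor))
			break;		/* block is not wholly isolatable */
		list_move(&cursor->lru, dst);
		nr_taken++;
	}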


2006-09-10 02:24:20

by Andy Whitcroft

Subject: Re: [PATCH 5/5] linear reclaim core

Andrew Morton wrote:
> On Fri, 8 Sep 2006 13:27:18 +0100
> Andy Whitcroft <[email protected]> wrote:
>
>> When we are out of memory of a suitable size we enter reclaim.
>> The current reclaim algorithm targets pages in LRU order, which
>> is great for fairness but highly unsuitable if you desire pages at
>> higher orders. To get pages of higher order we must shoot down a
>> very high proportion of memory; >95% in a lot of cases.
>>
>> This patch introduces an alternative algorithm used when requesting
>> higher order allocations. Here we look at memory in ranges at the
>> order requested. We make a quick pass to see if all pages in that
>> area are likely to be reclaimed, only then do we apply reclaim to
>> the pages in the area.
>>
>> Testing in combination with fragmentation avoidance shows
>> significantly improved chances of a successful allocation at
>> higher order.
>
> I bet it does.
>
> I'm somewhat surprised at the implementation. Would it not be sufficient
> to do this within shrink_inactive_list()? Something along the lines of:
>
> - Pick tail page off LRU.
>
> - For all "neighbour" pages (alignment == 1<<order, count == 1<<order)
>
> - If they're all PageLRU and !PageActive, add them all to page_list for
> possible reclaim
>
> And, in shrink_active_list:
>
> - Pick tail page off LRU
>
> - For all "neighbour" pages (alignment == 1<<order, count == 1<<order)
>
> If they're all PageLRU, put all the active pages in this block onto
> l_hold for possible deactivation.
>
>
> Maybe all that can be done in isolate_lru_pages().

When we started out down this road we thought we needed to scan linearly
too, due to the buddy marking scheme. Now that we're at this end of the
road we know that's pretty easy to fix; we're already considering merging
the two reclaims anyhow.

Probably this would do something pretty similar. At a quick glance it
feels as if it's going to have slightly different semantics, but that
may be better or worse.

I'll go try it out and see how it looks.

Thanks for reading.

-apw

2006-09-10 09:52:22

by Peter Zijlstra

Subject: Re: [PATCH 5/5] linear reclaim core

On Fri, 2006-09-08 at 11:41 -0700, Andrew Morton wrote:

> - Pick tail page off LRU.
>
> - For all "neighbour" pages (alignment == 1<<order, count == 1<<order)
>
> - If they're all PageLRU and !PageActive, add them all to page_list for
> possible reclaim
>
> And, in shrink_active_list:
>
> - Pick tail page off LRU
>
> - For all "neighbour" pages (alignment == 1<<order, count == 1<<order)
>
> If they're all PageLRU, put all the active pages in this block onto
> l_hold for possible deactivation.
>
>
> Maybe all that can be done in isolate_lru_pages().

Signed-off-by: Peter Zijlstra <[email protected]>
---
fs/buffer.c | 2 -
include/linux/swap.h | 2 -
mm/page_alloc.c | 2 -
mm/vmscan.c | 70 ++++++++++++++++++++++++++++++++++++---------------
4 files changed, 53 insertions(+), 23 deletions(-)

Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c 2006-07-22 00:11:00.000000000 +0200
+++ linux-2.6/mm/vmscan.c 2006-09-10 11:47:05.000000000 +0200
@@ -62,6 +62,8 @@ struct scan_control {
int swap_cluster_max;

int swappiness;
+
+ int order;
};

/*
@@ -590,35 +592,62 @@ keep:
*
* returns how many pages were moved onto *@dst.
*/
+int __isolate_lru_page(struct page *page, int active)
+{
+ int ret = -EINVAL;
+
+ if (PageLRU(page) && (PageActive(page) == active)) {
+ ret = -EBUSY;
+ if (likely(get_page_unless_zero(page))) {
+ /*
+ * Be careful not to clear PageLRU until after we're
+ * sure the page is not being freed elsewhere -- the
+ * page release code relies on it.
+ */
+ ClearPageLRU(page);
+ ret = 0;
+ }
+ }
+
+ return ret;
+}
+
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
struct list_head *src, struct list_head *dst,
- unsigned long *scanned)
+ unsigned long *scanned, int order)
{
unsigned long nr_taken = 0;
struct page *page;
- unsigned long scan;
+ unsigned long scan, pfn, base_pfn;
+ int active;

- for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
- struct list_head *target;
+ for (scan = 0; scan < nr_to_scan && !list_empty(src);) {
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);

BUG_ON(!PageLRU(page));

- list_del(&page->lru);
- target = src;
- if (likely(get_page_unless_zero(page))) {
- /*
- * Be careful not to clear PageLRU until after we're
- * sure the page is not being freed elsewhere -- the
- * page release code relies on it.
- */
- ClearPageLRU(page);
- target = dst;
+ active = PageActive(page);
+ pfn = page_to_pfn(page);
+ base_pfn = pfn &= ~((1 << order) - 1);
+ for (; pfn < (base_pfn + (1 << order)) && pfn_valid(pfn); pfn++) {
+ struct page *tmp = pfn_to_page(pfn);
+ int ret;
+
+ BUG_ON(!tmp);
+
+ ret = __isolate_lru_page(tmp, active);
+ scan++;
+ if (ret) {
+ if (ret == -EBUSY) {
+ /* else it is being freed elsewhere */
+ list_move(&tmp->lru, src);
+ }
+ break;
+ } else
+ list_move(&tmp->lru, dst);
nr_taken++;
- } /* else it is being freed elsewhere */
-
- list_add(&page->lru, target);
+ }
}

*scanned = scan;
@@ -649,7 +678,7 @@ static unsigned long shrink_inactive_lis

nr_taken = isolate_lru_pages(sc->swap_cluster_max,
&zone->inactive_list,
- &page_list, &nr_scan);
+ &page_list, &nr_scan, sc->order);
zone->nr_inactive -= nr_taken;
zone->pages_scanned += nr_scan;
spin_unlock_irq(&zone->lru_lock);
@@ -771,7 +800,7 @@ static void shrink_active_list(unsigned
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
- &l_hold, &pgscanned);
+ &l_hold, &pgscanned, sc->order);
zone->pages_scanned += pgscanned;
zone->nr_active -= pgmoved;
spin_unlock_irq(&zone->lru_lock);
@@ -959,7 +988,7 @@ static unsigned long shrink_zones(int pr
* holds filesystem locks which prevent writeout this might not work, and the
* allocation attempt will fail.
*/
-unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
+unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
{
int priority;
int ret = 0;
@@ -974,6 +1003,7 @@ unsigned long try_to_free_pages(struct z
.swap_cluster_max = SWAP_CLUSTER_MAX,
.may_swap = 1,
.swappiness = vm_swappiness,
+ .order = order,
};

count_vm_event(ALLOCSTALL);
Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c 2006-09-08 18:13:56.000000000 +0200
+++ linux-2.6/fs/buffer.c 2006-09-09 23:55:21.000000000 +0200
@@ -498,7 +498,7 @@ static void free_more_memory(void)
for_each_online_pgdat(pgdat) {
zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
if (*zones)
- try_to_free_pages(zones, GFP_NOFS);
+ try_to_free_pages(zones, 0, GFP_NOFS);
}
}

Index: linux-2.6/include/linux/swap.h
===================================================================
--- linux-2.6.orig/include/linux/swap.h 2006-09-08 18:13:56.000000000 +0200
+++ linux-2.6/include/linux/swap.h 2006-09-09 23:53:56.000000000 +0200
@@ -181,7 +181,7 @@ extern int rotate_reclaimable_page(struc
extern void swap_setup(void);

/* linux/mm/vmscan.c */
-extern unsigned long try_to_free_pages(struct zone **, gfp_t);
+extern unsigned long try_to_free_pages(struct zone **, int, gfp_t);
extern unsigned long shrink_all_memory(unsigned long nr_pages);
extern int vm_swappiness;
extern int remove_mapping(struct address_space *mapping, struct page *page);
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c 2006-09-08 18:13:57.000000000 +0200
+++ linux-2.6/mm/page_alloc.c 2006-09-09 23:55:04.000000000 +0200
@@ -1000,7 +1000,7 @@ rebalance:
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;

- did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
+ did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask);

p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;


2006-09-10 17:10:51

by Peter Zijlstra

Subject: [PATCH] lumpy reclaim -v2

Hi,

I found a live-lock in my previous version. Unfortunately this one is a
bit longer.

The live-lock would occur if we broke out of the order loop before
moving the tail page ahead; this would lead to retrying the same order
block again and again.

This version first handles the tail page, and after that tries to
complete the block.
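
Schematically, each iteration now does (a sketch only; details as in the
patch below):

	page = lru_to_page(src);
	/* 1: isolate the tail page itself first, so we always make progress */
	if (__isolate_lru_page(page, active) == 0)
		list_move(&page->lru, dst);
	else
		list_move(&page->lru, src);	/* being freed elsewhere */

	/* 2: only then try to take the rest of the order-aligned block,
	 *    stopping at the first page that cannot be isolated */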

---

When trying to reclaim pages for a higher order allocation, make reclaim
try to move lumps of pages (fitting the requested order) about, instead
of single pages. This should significantly reduce the number of
reclaimed pages for higher order allocations.

Signed-off-by: Peter Zijlstra <[email protected]>
---
fs/buffer.c | 2 -
include/linux/swap.h | 2 -
mm/page_alloc.c | 2 -
mm/vmscan.c | 94 ++++++++++++++++++++++++++++++++++++++++-----------
4 files changed, 77 insertions(+), 23 deletions(-)

Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c 2006-09-10 11:49:01.000000000 +0200
+++ linux-2.6/mm/vmscan.c 2006-09-10 18:47:19.000000000 +0200
@@ -62,6 +62,8 @@ struct scan_control {
int swap_cluster_max;

int swappiness;
+
+ int order;
};

/*
@@ -590,35 +592,86 @@ keep:
*
* returns how many pages were moved onto *@dst.
*/
+int __isolate_lru_page(struct page *page, int active)
+{
+ int ret = -EINVAL;
+
+ if (PageLRU(page) && (PageActive(page) == active)) {
+ ret = -EBUSY;
+ if (likely(get_page_unless_zero(page))) {
+ /*
+ * Be careful not to clear PageLRU until after we're
+ * sure the page is not being freed elsewhere -- the
+ * page release code relies on it.
+ */
+ ClearPageLRU(page);
+ ret = 0;
+ }
+ }
+
+ return ret;
+}
+
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
struct list_head *src, struct list_head *dst,
- unsigned long *scanned)
+ unsigned long *scanned, int order)
{
unsigned long nr_taken = 0;
- struct page *page;
- unsigned long scan;
+ struct page *page, *tmp;
+ unsigned long scan, pfn, end_pfn, page_pfn;
+ int active;

for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
- struct list_head *target;
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);

BUG_ON(!PageLRU(page));

- list_del(&page->lru);
- target = src;
- if (likely(get_page_unless_zero(page))) {
- /*
- * Be careful not to clear PageLRU until after we're
- * sure the page is not being freed elsewhere -- the
- * page release code relies on it.
- */
- ClearPageLRU(page);
- target = dst;
- nr_taken++;
- } /* else it is being freed elsewhere */
+ active = PageActive(page);
+ switch (__isolate_lru_page(page, active)) {
+ case 0:
+ list_move(&page->lru, dst);
+ nr_taken++;
+ break;
+
+ case -EBUSY:
+ /* else it is being freed elsewhere */
+ list_move(&page->lru, src);
+ continue;
+
+ default:
+ BUG();
+ }

- list_add(&page->lru, target);
+ if (!order)
+ continue;
+
+ page_pfn = pfn = __page_to_pfn(page);
+ end_pfn = pfn &= ~((1 << order) - 1);
+ end_pfn += 1 << order;
+ for (; pfn < end_pfn; pfn++) {
+ if (unlikely(pfn == page_pfn))
+ continue;
+ if (unlikely(!pfn_valid(pfn)))
+ break;
+
+ scan++;
+ tmp = __pfn_to_page(pfn);
+ switch (__isolate_lru_page(tmp, active)) {
+ case 0:
+ list_move(&tmp->lru, dst);
+ nr_taken++;
+ continue;
+
+ case -EBUSY:
+ /* else it is being freed elsewhere */
+ list_move(&tmp->lru, src);
+ default:
+ break;
+
+ }
+ break;
+ }
}

*scanned = scan;
@@ -649,7 +702,7 @@ static unsigned long shrink_inactive_lis

nr_taken = isolate_lru_pages(sc->swap_cluster_max,
&zone->inactive_list,
- &page_list, &nr_scan);
+ &page_list, &nr_scan, sc->order);
zone->nr_inactive -= nr_taken;
zone->pages_scanned += nr_scan;
spin_unlock_irq(&zone->lru_lock);
@@ -771,7 +824,7 @@ static void shrink_active_list(unsigned
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
- &l_hold, &pgscanned);
+ &l_hold, &pgscanned, sc->order);
zone->pages_scanned += pgscanned;
zone->nr_active -= pgmoved;
spin_unlock_irq(&zone->lru_lock);
@@ -959,7 +1012,7 @@ static unsigned long shrink_zones(int pr
* holds filesystem locks which prevent writeout this might not work, and the
* allocation attempt will fail.
*/
-unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
+unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
{
int priority;
int ret = 0;
@@ -974,6 +1027,7 @@ unsigned long try_to_free_pages(struct z
.swap_cluster_max = SWAP_CLUSTER_MAX,
.may_swap = 1,
.swappiness = vm_swappiness,
+ .order = order,
};

count_vm_event(ALLOCSTALL);
Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c 2006-09-10 11:49:01.000000000 +0200
+++ linux-2.6/fs/buffer.c 2006-09-10 18:11:17.000000000 +0200
@@ -498,7 +498,7 @@ static void free_more_memory(void)
for_each_online_pgdat(pgdat) {
zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
if (*zones)
- try_to_free_pages(zones, GFP_NOFS);
+ try_to_free_pages(zones, 0, GFP_NOFS);
}
}

Index: linux-2.6/include/linux/swap.h
===================================================================
--- linux-2.6.orig/include/linux/swap.h 2006-09-10 11:49:01.000000000 +0200
+++ linux-2.6/include/linux/swap.h 2006-09-10 18:11:17.000000000 +0200
@@ -181,7 +181,7 @@ extern int rotate_reclaimable_page(struc
extern void swap_setup(void);

/* linux/mm/vmscan.c */
-extern unsigned long try_to_free_pages(struct zone **, gfp_t);
+extern unsigned long try_to_free_pages(struct zone **, int, gfp_t);
extern unsigned long shrink_all_memory(unsigned long nr_pages);
extern int vm_swappiness;
extern int remove_mapping(struct address_space *mapping, struct page *page);
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c 2006-09-10 11:49:01.000000000 +0200
+++ linux-2.6/mm/page_alloc.c 2006-09-10 18:11:18.000000000 +0200
@@ -1000,7 +1000,7 @@ rebalance:
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;

- did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
+ did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask);

p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;


2006-09-10 23:45:51

by Jörn Engel

Subject: Re: [PATCH 5/5] linear reclaim core

On Fri, 8 September 2006 11:41:14 -0700, Andrew Morton wrote:
>
> I'm somewhat surprised at the implementation. Would it not be sufficient
> to do this within shrink_inactive_list()? Something along the lines of:
>
> - Pick tail page off LRU.
>
> - For all "neighbour" pages (alignment == 1<<order, count == 1<<order)
>
> - If they're all PageLRU and !PageActive, add them all to page_list for
> possible reclaim
>
> And, in shrink_active_list:
>
> - Pick tail page off LRU
>
> - For all "neighbour" pages (alignment == 1<<order, count == 1<<order)
>
> If they're all PageLRU, put all the active pages in this block onto
> l_hold for possible deactivation.

Hmm. Trying to shoot holes into your approach, I find two potential
problems:
A) With sufficient fragmentation, all inactive pages have one active
neighbour, so shrink_inactive_list() will never find a cluster of the
required order.
B) With some likelihood, shrink_active_list() will pick neighbours
which happen to be rather hot pages. They get freed, only to get
paged in again within little more than rotational latency.

How about something like:
1. Free 1<<order pages from the inactive list.
2. Pick a page cluster of requested order.
3. Move all pages from the cluster to the just freed pages.

[ Disclaimer: I just started dabbling in mm after Andi Kleen's
presentation on Linux Kongress on friday. My tiny gem of knowledge,
if present at all, might be well hidden in the ignorance of an
mm-newbie. ]

Jörn

--
It's not whether you win or lose, it's how you place the blame.
-- unknown

2006-09-11 00:41:00

by Andrew Morton

Subject: Re: [PATCH 5/5] linear reclaim core

On Mon, 11 Sep 2006 01:45:09 +0200
Jörn Engel <[email protected]> wrote:

> On Fri, 8 September 2006 11:41:14 -0700, Andrew Morton wrote:
> >
> > I'm somewhat surprised at the implementation. Would it not be sufficient
> > to do this within shrink_inactive_list()? Something along the lines of:
> >
> > - Pick tail page off LRU.
> >
> > - For all "neighbour" pages (alignment == 1<<order, count == 1<<order)
> >
> > - If they're all PageLRU and !PageActive, add them all to page_list for
> > possible reclaim
> >
> > And, in shrink_active_list:
> >
> > - Pick tail page off LRU
> >
> > - For all "neighbour" pages (alignment == 1<<order, count == 1<<order)
> >
> > If they're all PageLRU, put all the active pages in this block onto
> > l_hold for possible deactivation.
>
> Hmm. Trying to shoot holes into your approach, I find two potential
> problems:
> A) With sufficient fragmentation, all inactive pages have one active
> neighbour, so shrink_inactive_list() will never find a cluster of the
> required order.

Nope. If the clump of pages has a mix of active and inactive, the above
design would cause the active ones to be deactivated, so now the entire
clump is eligible for treatment by shrink_inactive_list().

> B) With some likelihood, shrink_active_list() will pick neighbours
> which happen to be rather hot pages. They get freed, only to get
> paged in again within little more than rotational latency.

Maybe. Careful benchmarking and carefully-designed microbenchmarks are, as
always, needed.

Bear in mind that simply moving the pages to the inactive list isn't enough
to get them reclaimed: we still do various forms of page aging and the
pages can still be preserved due to that. IOW, we have several different
forms of page aging, one of which is LRU-ordering. The above design
compromises just one of those aging steps.

I'd be more concerned about higher-order atomic allocations. If this thing
is to work I suspect we'll need per-zone, per-order watermarks and kswapd
will need to maintain those.

> How about something like:
> 1. Free 1<<order pages from the inactive list.
> 2. Pick a page cluster of requested order.
> 3. Move all pages from the cluster to the just freed pages.

Don't think in terms of "freeing". Think in terms of "scanning". A lot of
page reclaim's balancing tricks are cast in terms of pages-scanned,
slabs-scanned, etc.

2006-09-11 07:34:04

by Jörn Engel

Subject: Re: [PATCH 5/5] linear reclaim core

On Sun, 10 September 2006 17:40:51 -0700, Andrew Morton wrote:
>
> > A) With sufficient fragmentation, all inactive pages have one active
> > neighbour, so shrink_inactive_list() will never find a cluster of the
> > required order.
>
> Nope. If the clump of pages has a mix of active and inactive, the above
> design would cause the active ones to be deactivated, so now the entire
> clump is eligible for treatment by shrink_inactive_list().

Ok? More reading seems necessary before I can follow you here...

> Bear in mind that simply moving the pages to the inactive list isn't enough
> to get them reclaimed: we still do various forms of page aging and the
> pages can still be preserved due to that. IOW, we have several different
> forms of page aging, one of which is LRU-ordering. The above design
> compromises just one of those aging steps.

Are these different forms of page aging described in written form
somewhere?

> I'd be more concerned about higher-order atomic allocations. If this thing
> is to work I suspect we'll need per-zone, per-order watermarks and kswapd
> will need to maintain those.

Or simply declare higher-order atomic allocations to be undesired?
Not sure how many of those we have that make sense.

> Don't think in terms of "freeing". Think in terms of "scanning". A lot of
> page reclaim's balancing tricks are cast in terms of pages-scanned,
> slabs-scanned, etc.

There is a related problem I'm sure you are aware of. Trying to
shrink the dentry_cache or the various foofs_inode_caches, we remove
tons of objects before a full slab (in most cases a page) is free and
can be returned. ext3_inode_cache has 8 objects per slab,
dentry_cache has 29. That's the equivalent of an order-3 or order-5
page allocation in terms of inefficiency.
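
(In rough numbers: freeing one ext3_inode_cache slab can mean evicting up
to 8 objects for a single page, about the same over-reclaim factor as an
order-3 allocation, 2^3 = 8; with 29 dentries per slab the factor is
nearer 2^5 = 32, hence order-5.)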

And having just started thinking about the problem, the solution I
envision looks fairly similar to Andy's work for high-order
allocations here. Except that I cannot think in terms of "scanning",
afaict.

Jörn

--
Anything that can go wrong, will.
-- Finagle's Law