I have had this patchset around for a long time; it improves zone and
watermark balancing in various places by making the calculations more
logical.
When reading 128GB through the pagecache in 4 concurrent
streams, the final page residency and total reclaim ratios
look like this (no highmem, ~900MB RAM):
2.6.14-git3
  DMA: pages=  2214 ( 1.01%)   scan= 124146 ( 3.01%)
  NRM: pages=215966 (98.99%)   scan=3990129 (96.99%)

2.6.14-git3-vm
  DMA: pages=  2220 ( 1.01%)   scan=  99264 ( 2.41%)
  NRM: pages=216373 (98.99%)   scan=4011975 (97.59%)
So in this case DMA is still getting a beating, but things have
improved nicely. Here are the results with highmem and ~4GB RAM:
2.6.14-git3
  DMA: pages=     0 ( 0.00%)   scan=      0 ( 0.00%)
  NRM: pages=177241 (17.83%)   scan=1607991 (50.01%)
  HIG: pages=817122 (82.17%)   scan=1607166 (49.99%)

2.6.14-git3-vm
  DMA: pages=     0 ( 0.00%)   scan=      0 ( 0.00%)
  NRM: pages=178215 (17.92%)   scan= 553311 (16.71%)
  HIG: pages=815771 (82.07%)   scan=2757744 (83.28%)
Current kernels are abysmal here, while the patches bring the scanning
ratios almost perfectly into line with the residency ratios.
--
SUSE Labs, Novell Inc.
Explicitly teach kswapd about the incremental min logic instead of just scanning
all zones under the first low zone. This should keep more even pressure applied
on the zones.
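For anyone not familiar with the incremental min logic: roughly, when a zone is
judged on behalf of a higher "classzone", it must hold back an extra reserve so
that allocations which could have been satisfied from the higher zones do not
drain it. A standalone toy sketch of the idea (made-up numbers and struct, not
the kernel's zone_watermark_ok()):

#include <stdio.h>

/* toy zone indices: DMA=0, NORMAL=1, HIGHMEM=2 */
struct toy_zone {
        long free_pages;
        long pages_high;
        long incremental_reserve[3];    /* extra pages held back per classzone */
};

static int toy_watermark_ok(const struct toy_zone *z, int classzone_idx)
{
        return z->free_pages > z->pages_high + z->incremental_reserve[classzone_idx];
}

int main(void)
{
        struct toy_zone normal = {
                .free_pages = 1200,
                .pages_high = 1000,
                .incremental_reserve = { 0, 0, 512 },
        };

        /* fine for NORMAL allocations, but too low when judged for HIGHMEM */
        printf("ok as NORMAL classzone:  %d\n", toy_watermark_ok(&normal, 1));
        printf("ok as HIGHMEM classzone: %d\n", toy_watermark_ok(&normal, 2));
        return 0;
}

This is roughly what passing first_low_zone as the classzone argument in the
patch below does for kswapd, instead of always checking against classzone 0.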
Signed-off-by: Nick Piggin <[email protected]>
Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c 2005-11-01 13:42:33.000000000 +1100
+++ linux-2.6/mm/vmscan.c 2005-11-01 14:27:16.000000000 +1100
@@ -1051,97 +1051,63 @@ loop_again:
}
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
- int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long lru_pages = 0;
+ int first_low_zone = 0;
all_zones_ok = 1;
+ sc.nr_scanned = 0;
+ sc.nr_reclaimed = 0;
+ sc.priority = priority;
+ sc.swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX;
- if (nr_pages == 0) {
- /*
- * Scan in the highmem->dma direction for the highest
- * zone which needs scanning
- */
- for (i = pgdat->nr_zones - 1; i >= 0; i--) {
- struct zone *zone = pgdat->node_zones + i;
+ /* Scan in the highmem->dma direction */
+ for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+ struct zone *zone = pgdat->node_zones + i;
- if (zone->present_pages == 0)
- continue;
+ if (zone->present_pages == 0)
+ continue;
- if (zone->all_unreclaimable &&
- priority != DEF_PRIORITY)
+ if (nr_pages == 0) { /* Not software suspend */
+ if (zone_watermark_ok(zone, order,
+ zone->pages_high, first_low_zone, 0, 0))
continue;
- if (!zone_watermark_ok(zone, order,
- zone->pages_high, 0, 0, 0)) {
- end_zone = i;
- goto scan;
- }
+ all_zones_ok = 0;
+ if (first_low_zone < i)
+ first_low_zone = i;
}
- goto out;
- } else {
- end_zone = pgdat->nr_zones - 1;
- }
-scan:
- for (i = 0; i <= end_zone; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- lru_pages += zone->nr_active + zone->nr_inactive;
- }
-
- /*
- * Now scan the zone in the dma->highmem direction, stopping
- * at the last zone which needs scanning.
- *
- * We do this because the page allocator works in the opposite
- * direction. This prevents the page allocator from allocating
- * pages behind kswapd's direction of progress, which would
- * cause too much scanning of the lower zones.
- */
- for (i = 0; i <= end_zone; i++) {
- struct zone *zone = pgdat->node_zones + i;
- int nr_slab;
-
- if (zone->present_pages == 0)
- continue;
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue;
- if (nr_pages == 0) { /* Not software suspend */
- if (!zone_watermark_ok(zone, order,
- zone->pages_high, end_zone, 0, 0))
- all_zones_ok = 0;
- }
zone->temp_priority = priority;
if (zone->prev_priority > priority)
zone->prev_priority = priority;
- sc.nr_scanned = 0;
- sc.nr_reclaimed = 0;
- sc.priority = priority;
- sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
+ lru_pages += zone->nr_active + zone->nr_inactive;
+
atomic_inc(&zone->reclaim_in_progress);
shrink_zone(zone, &sc);
atomic_dec(&zone->reclaim_in_progress);
- reclaim_state->reclaimed_slab = 0;
- nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
- lru_pages);
- sc.nr_reclaimed += reclaim_state->reclaimed_slab;
- total_reclaimed += sc.nr_reclaimed;
- total_scanned += sc.nr_scanned;
- if (zone->all_unreclaimable)
- continue;
- if (nr_slab == 0 && zone->pages_scanned >=
+
+ if (zone->pages_scanned >=
(zone->nr_active + zone->nr_inactive) * 4)
zone->all_unreclaimable = 1;
- /*
- * If we've done a decent amount of scanning and
- * the reclaim ratio is low, start doing writepage
- * even in laptop mode
- */
- if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
- total_scanned > total_reclaimed+total_reclaimed/2)
- sc.may_writepage = 1;
}
+ reclaim_state->reclaimed_slab = 0;
+ shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages);
+ sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+ total_reclaimed += sc.nr_reclaimed;
+ total_scanned += sc.nr_scanned;
+
+ /*
+ * If we've done a decent amount of scanning and
+ * the reclaim ratio is low, start doing writepage
+ * even in laptop mode
+ */
+ if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
+ total_scanned > total_reclaimed+total_reclaimed/2)
+ sc.may_writepage = 1;
+
if (nr_pages && to_free > total_reclaimed)
continue; /* swsusp: need to do more work */
if (all_zones_ok)
@@ -1162,7 +1128,6 @@ scan:
if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages))
break;
}
-out:
for (i = 0; i < pgdat->nr_zones; i++) {
struct zone *zone = pgdat->node_zones + i;
The pages_high - pages_low and pages_low - pages_min deltas are the asynch
reclaim watermarks. As such, for highmem zones they should be in the same
ratios as for any other zone. It is the pages_min - 0 delta which is the
PF_MEMALLOC reserve, and that is the region which isn't very useful for highmem.
This patch ensures highmem systems have similar characteristics to non-highmem
ones with the same amount of memory, and also that highmem zones get similar
reclaim pressure to other zones.
Signed-off-by: Nick Piggin <[email protected]>
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c 2005-11-01 13:42:35.000000000 +1100
+++ linux-2.6/mm/page_alloc.c 2005-11-01 14:29:07.000000000 +1100
@@ -2374,13 +2374,18 @@ static void setup_per_zone_pages_min(voi
}
for_each_zone(zone) {
+ unsigned long tmp;
spin_lock_irqsave(&zone->lru_lock, flags);
+ tmp = (pages_min * zone->present_pages) / lowmem_pages;
if (is_highmem(zone)) {
/*
- * Often, highmem doesn't need to reserve any pages.
- * But the pages_min/low/high values are also used for
- * batching up page reclaim activity so we need a
- * decent value here.
+ * __GFP_HIGH and PF_MEMALLOC allocations usually don't
+ * need highmem pages, so cap pages_min to a small
+ * value here.
+ *
+ * The (pages_high-pages_low) and (pages_low-pages_min)
+ * deltas control asynch page reclaim, and so should
+ * not be capped for highmem.
*/
int min_pages;
@@ -2391,19 +2396,15 @@ static void setup_per_zone_pages_min(voi
min_pages = 128;
zone->pages_min = min_pages;
} else {
- /* if it's a lowmem zone, reserve a number of pages
+ /*
+ * If it's a lowmem zone, reserve a number of pages
* proportionate to the zone's size.
*/
- zone->pages_min = (pages_min * zone->present_pages) /
- lowmem_pages;
+ zone->pages_min = tmp;
}
- /*
- * When interpreting these watermarks, just keep in mind that:
- * zone->pages_min == (zone->pages_min * 4) / 4;
- */
- zone->pages_low = (zone->pages_min * 5) / 4;
- zone->pages_high = (zone->pages_min * 6) / 4;
+ zone->pages_low = zone->pages_min + tmp / 4;
+ zone->pages_high = zone->pages_min + tmp / 2;
spin_unlock_irqrestore(&zone->lru_lock, flags);
}
}
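To make the effect concrete, here is a small userspace toy comparing the old
and new formulas for a highmem zone (the numbers are invented; "tmp" stands for
the size-proportional share computed above):

#include <stdio.h>

int main(void)
{
        long min = 128;         /* capped highmem pages_min */
        long tmp = 4096;        /* hypothetical size-proportional share */

        /* old: low/high were derived from the capped min, so tiny deltas */
        long old_low  = (min * 5) / 4;
        long old_high = (min * 6) / 4;

        /* new: the async reclaim deltas scale with zone size */
        long new_low  = min + tmp / 4;
        long new_high = min + tmp / 2;

        printf("old: min=%ld low=%ld high=%ld\n", min, old_low, old_high);
        printf("new: min=%ld low=%ld high=%ld\n", min, new_low, new_high);
        return 0;
}

With these numbers the old scheme gives low=160, high=192, while the new one
gives low=1152, high=2176: the highmem zone keeps its small PF_MEMALLOC reserve
but gets size-relative low/high deltas like any other zone.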
Slightly change the writeout watermark calculations so we keep background
and synchronous writeout watermarks in the same ratios after adjusting them.
This ensures we should always attempt to start background writeout before
synchronous writeout.
Signed-off-by: Nick Piggin <[email protected]>
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c 2005-11-01 13:41:39.000000000 +1100
+++ linux-2.6/mm/page-writeback.c 2005-11-01 14:29:27.000000000 +1100
@@ -165,9 +165,11 @@ get_dirty_limits(struct writeback_state
if (dirty_ratio < 5)
dirty_ratio = 5;
- background_ratio = dirty_background_ratio;
- if (background_ratio >= dirty_ratio)
- background_ratio = dirty_ratio / 2;
+ /*
+ * Keep the ratio between dirty_ratio and background_ratio roughly
+ * what the sysctls are after dirty_ratio has been scaled (above).
+ */
+ background_ratio = dirty_background_ratio * dirty_ratio/vm_dirty_ratio;
background = (background_ratio * available_memory) / 100;
dirty = (dirty_ratio * available_memory) / 100;
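A hypothetical example of the difference (made-up sysctl values; assume the
scaling above has already reduced dirty_ratio from vm_dirty_ratio=40 to 20):

#include <stdio.h>

int main(void)
{
        int vm_dirty_ratio = 40, dirty_background_ratio = 30;
        int dirty_ratio = 20;   /* already scaled down from vm_dirty_ratio */

        /* old behaviour: collapse background to half of dirty */
        int old_bg = dirty_background_ratio;
        if (old_bg >= dirty_ratio)
                old_bg = dirty_ratio / 2;

        /* new behaviour: keep the sysctl 30:40 proportion */
        int new_bg = dirty_background_ratio * dirty_ratio / vm_dirty_ratio;

        printf("dirty=%d%%  old background=%d%%  new background=%d%%\n",
               dirty_ratio, old_bg, new_bg);
        return 0;
}

Old gives background=10%, new gives background=15%, so background writeout
still kicks in before synchronous writeout, but at the ratio the sysctls ask for.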
Hi Nick,
Looks nice, much easier to read than before.
One comment: you change the pagecache/slab scanning ratio by moving
shrink_slab() outside of the zone loop.
This means that each kswapd iteration will scan "lru_pages"
SLAB entries, instead of "lru_pages*NR_ZONES" entries.
Can you comment on that?
On Tue, Nov 01, 2005 at 04:19:49PM +1100, Nick Piggin wrote:
> 1/3
>
> --
> SUSE Labs, Novell Inc.
>
> Explicitly teach kswapd about the incremental min logic instead of just scanning
> all zones under the first low zone. This should keep more even pressure applied
> on the zones.
>
> Signed-off-by: Nick Piggin <[email protected]>
Nikita has a customer using a large percentage of RAM for
a kernel module, which results in get_dirty_limits() misbehaviour
since

	unsigned long available_memory = total_pages;

It should work on the amount of cacheable pages instead.
He's got a patch but I don't remember the URL. Nikita?
On Tue, Nov 01, 2005 at 04:21:15PM +1100, Nick Piggin wrote:
> 3/3
>
> --
> SUSE Labs, Novell Inc.
>
> Slightly change the writeout watermark calculations so we keep background
> and synchronous writeout watermarks in the same ratios after adjusting them.
> This ensures we should always attempt to start background writeout before
> synchronous writeout.
>
> Signed-off-by: Nick Piggin <[email protected]>
Marcelo Tosatti writes:
>
> Nikita has a customer using a large percentage of RAM for
> a kernel module, which results in get_dirty_limits() misbehaviour
> since
>
> unsigned long available_memory = total_pages;
>
> It should work on the amount of cacheable pages instead.
>
> He's got a patch but I don't remember the URL. Nikita?
http://linuxhacker.ru/~nikita/patches/2.6.14-rc5/09-throttle-against-free-memory.patch
It changes balance_dirty_pages() to calculate the threshold not from the
total amount of physical pages, but from the maximal amount of pages that
can be consumed by the file system cache. This amount is approximated by
the total size of the LRU lists plus free memory (across all zones).
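A minimal sketch of that approximation (invented struct and numbers, not the
actual patch):

#include <stdio.h>

struct zone_stat {
        unsigned long nr_active, nr_inactive, free_pages;
};

/* cacheable memory ~= LRU pages + free pages, summed over all zones */
static unsigned long cacheable_pages(const struct zone_stat *z, int nr)
{
        unsigned long total = 0;
        int i;

        for (i = 0; i < nr; i++)
                total += z[i].nr_active + z[i].nr_inactive + z[i].free_pages;
        return total;
}

int main(void)
{
        /* e.g. a box where a kernel module has pinned most of the RAM */
        struct zone_stat zones[2] = {
                { 30000, 50000, 5000 },
                { 10000, 15000, 2000 },
        };
        unsigned long total_pages = 1048576;    /* 4GB of 4k pages */

        printf("total_pages=%lu, cacheable approx=%lu\n",
               total_pages, cacheable_pages(zones, 2));
        return 0;
}

The dirty thresholds are then taken as percentages of the smaller, cacheable
figure rather than of total_pages.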
This has the downside of starting write-out earlier, so the patch should
probably be accompanied by some tuning of the default thresholds.
Nikita.
Marcelo Tosatti wrote:
> Hi Nick,
>
> Looks nice, much easier to read than before.
>
Hi Marcelo,
Thanks! That was one of the main aims.
> One comment: you change the pagecache/slab scanning ratio by moving
> shrink_slab() outside of the zone loop.
>
> This means that each kswapd iteration will scan "lru_pages"
> SLAB entries, instead of "lru_pages*NR_ZONES" entries.
>
> Can you comment on that?
>
I believe I have tried to get it right; let me explain. lru_pages
is just used as the divisor for the ratio between lru scanning
and slab scanning. So long as it is kept constant across calls to
shrink_slab, there should be no change in behaviour.
The nr_scanned variable is the other half of the equation that
controls slab shrinking. I have changed from:
lru_pages = total_node_lru_pages;
for each zone in node {
shrink_zone();
shrink_slab(zone_scanned, lru_pages);
}
To:
lru_pages = 0;
for each zone in node {
shrink_zone();
lru_pages += zone_lru_pages;
}
shrink_slab(total_zone_scanned, lru_pages);
So the ratio remains basically the same
[eg. 10/100 + 20/100 + 30/100 = (10+20+30)/100]
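A toy check of that identity with the same made-up numbers:

#include <stdio.h>

int main(void)
{
        unsigned long scanned[3] = { 10, 20, 30 };      /* per-zone sc.nr_scanned */
        unsigned long lru_pages = 100;                  /* node-wide LRU size */
        double per_zone = 0.0;
        int i;

        for (i = 0; i < 3; i++)
                per_zone += (double)scanned[i] / lru_pages;

        printf("per-zone sum = %.2f, aggregated = %.2f\n",
               per_zone, (double)(10 + 20 + 30) / lru_pages);
        return 0;
}

Both come out to 0.60, which is why the slab pressure stays the same as long
as lru_pages is the full node-wide figure in both schemes.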
There are two reasons for doing this. The first is just efficiency and
better rounding of the divisions.
The second is that within the for_each_zone loop, we are able to
set all_unreclaimable without worrying about slab, because the
final shrink_slab at the end will clear all_unreclaimable if any
zones have had slab pages freed up.
I believe it should generally result in more consistent reclaim
across zones, and also match direct reclaim better.
Hope this made sense,
Nick
--
SUSE Labs, Novell Inc.
Marcelo Tosatti wrote:
> Nikita has a customer using a large percentage of RAM for
> a kernel module, which results in get_dirty_limits() misbehaviour
> since
>
> unsigned long available_memory = total_pages;
>
> It should work on the amount of cacheable pages instead.
>
> He's got a patch but I don't remember the URL. Nikita?
>
Indeed. This patch has a couple of little problems anyway, and
probably does not logically belong as part of this series.
I'll work on the two previous, more important patches first.
My patch should probably go on top of more fundamental work
like Nikita's patch.
Thanks,
Nick
--
SUSE Labs, Novell Inc.
On Tue, Nov 08, 2005 at 10:08:53AM +1100, Nick Piggin wrote:
> Hope this made sense,
Yes, makes sense. My reading was not correct.
Sounds great.