2001-07-18 02:02:19

by Marcelo Tosatti

Subject: Inclusion of zoned inactive/free shortage patch


Hi Linus,

The following patch (against 2.4.6-ac2, already merged in 2.4.6-ac3) adds
specific perzone inactive/free shortage handling code.

This keeps us from freeing/deactivating pages in all zones when there
are only zone-specific free or inactive shortages.

It's not _strict_ perzone handling: we still have the global free/inactive
shortage handling.

This fixes most of the highmem problems (I'm not able to deadlock a 4GB
machine running memory-intensive programs with the patch anymore. I've
also received one success report from Dirk Wetter running two 2GB
simulations on a 4GB machine).

If you don't have any objections to the patch, I'll generate it against
the latest 2.4.7-pre for inclusion.

Please read it,


diff --exclude-from=/home/marcelo/exclude -Nur linux.orig/include/linux/swap.h linux/include/linux/swap.h
--- linux.orig/include/linux/swap.h Sat Jul 14 02:47:14 2001
+++ linux/include/linux/swap.h Sat Jul 14 03:27:13 2001
@@ -123,9 +123,14 @@
extern wait_queue_head_t kreclaimd_wait;
extern int page_launder(int, int);
extern int free_shortage(void);
+extern int total_free_shortage(void);
extern int inactive_shortage(void);
+extern int total_inactive_shortage(void);
extern void wakeup_kswapd(void);
extern int try_to_free_pages(unsigned int gfp_mask);
+
+extern unsigned int zone_free_shortage(zone_t *zone);
+extern unsigned int zone_inactive_shortage(zone_t *zone);

/* linux/mm/page_io.c */
extern void rw_swap_page(int, struct page *);
diff --exclude-from=/home/marcelo/exclude -Nur linux.orig/mm/page_alloc.c linux/mm/page_alloc.c
--- linux.orig/mm/page_alloc.c Sat Jul 14 02:47:14 2001
+++ linux/mm/page_alloc.c Sat Jul 14 02:50:50 2001
@@ -451,7 +451,7 @@
* to give up than to deadlock the kernel looping here.
*/
if (gfp_mask & __GFP_WAIT) {
- if (!order || free_shortage()) {
+ if (!order || total_free_shortage()) {
int progress = try_to_free_pages(gfp_mask);
if (progress || (gfp_mask & __GFP_FS))
goto try_again;
@@ -689,6 +689,39 @@
return pages;
}
#endif
+
+unsigned int zone_free_shortage(zone_t *zone)
+{
+ int sum = 0;
+
+ if (!zone->size)
+ goto ret;
+
+ if (zone->inactive_clean_pages + zone->free_pages
+ < zone->pages_min) {
+ sum += zone->pages_min;
+ sum -= zone->free_pages;
+ sum -= zone->inactive_clean_pages;
+ }
+ret:
+ return sum;
+}
+
+unsigned int zone_inactive_shortage(zone_t *zone)
+{
+ int sum = 0;
+
+ if (!zone->size)
+ goto ret;
+
+ sum = zone->pages_high;
+ sum -= zone->inactive_dirty_pages;
+ sum -= zone->inactive_clean_pages;
+ sum -= zone->free_pages;
+
+ret:
+ return (sum > 0 ? sum : 0);
+}

/*
* Show free area list (used inside shift_scroll-lock stuff)
diff --exclude-from=/home/marcelo/exclude -Nur linux.orig/mm/vmscan.c linux/mm/vmscan.c
--- linux.orig/mm/vmscan.c Sat Jul 14 02:47:14 2001
+++ linux/mm/vmscan.c Sat Jul 14 03:22:19 2001
@@ -36,11 +36,19 @@
*/

/* mm->page_table_lock is held. mmap_sem is not held */
-static void try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page)
+static void try_to_swap_out(zone_t *zone, struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page)
{
pte_t pte;
swp_entry_t entry;

+ /*
+ * If we are doing a zone-specific scan, do not
+ * touch pages from zones which don't have a
+ * shortage.
+ */
+ if (zone && !zone_inactive_shortage(page->zone))
+ return;
+
/* Don't look at this pte if it's been accessed recently. */
if (ptep_test_and_clear_young(page_table)) {
page->age += PAGE_AGE_ADV;
@@ -131,7 +139,7 @@
}

/* mm->page_table_lock is held. mmap_sem is not held */
-static int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count)
+static int swap_out_pmd(zone_t *zone, struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count)
{
pte_t * pte;
unsigned long pmd_end;
@@ -155,7 +163,7 @@
struct page *page = pte_page(*pte);

if (VALID_PAGE(page) && !PageReserved(page)) {
- try_to_swap_out(mm, vma, address, pte, page);
+ try_to_swap_out(zone, mm, vma, address, pte, page);
if (!--count)
break;
}
@@ -168,7 +176,7 @@
}

/* mm->page_table_lock is held. mmap_sem is not held */
-static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count)
+static inline int swap_out_pgd(zone_t *zone, struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count)
{
pmd_t * pmd;
unsigned long pgd_end;
@@ -188,7 +196,7 @@
end = pgd_end;

do {
- count = swap_out_pmd(mm, vma, pmd, address, end, count);
+ count = swap_out_pmd(zone, mm, vma, pmd, address, end, count);
if (!count)
break;
address = (address + PMD_SIZE) & PMD_MASK;
@@ -198,7 +206,7 @@
}

/* mm->page_table_lock is held. mmap_sem is not held */
-static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count)
+static int swap_out_vma(zone_t *zone, struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count)
{
pgd_t *pgdir;
unsigned long end;
@@ -213,7 +221,7 @@
if (address >= end)
BUG();
do {
- count = swap_out_pgd(mm, vma, pgdir, address, end, count);
+ count = swap_out_pgd(zone, mm, vma, pgdir, address, end, count);
if (!count)
break;
address = (address + PGDIR_SIZE) & PGDIR_MASK;
@@ -225,7 +233,7 @@
/*
* Returns non-zero if we scanned all `count' pages
*/
-static int swap_out_mm(struct mm_struct * mm, int count)
+static int swap_out_mm(zone_t *zone, struct mm_struct * mm, int count)
{
unsigned long address;
struct vm_area_struct* vma;
@@ -248,7 +256,7 @@
address = vma->vm_start;

for (;;) {
- count = swap_out_vma(mm, vma, address, count);
+ count = swap_out_vma(zone, mm, vma, address, count);
if (!count)
goto out_unlock;
vma = vma->vm_next;
@@ -280,7 +288,7 @@
return nr;
}

-static void swap_out(unsigned int priority, int gfp_mask)
+static void swap_out(zone_t *zone, unsigned int priority, int gfp_mask)
{
int counter;
int retval = 0;
@@ -288,7 +296,7 @@

/* Always start by trying to penalize the process that is allocating memory */
if (mm)
- retval = swap_out_mm(mm, swap_amount(mm));
+ retval = swap_out_mm(zone, mm, swap_amount(mm));

/* Then, look at the other mm's */
counter = (mmlist_nr << SWAP_MM_SHIFT) >> priority;
@@ -310,7 +318,7 @@
spin_unlock(&mmlist_lock);

/* Walk about 6% of the address space each time */
- retval |= swap_out_mm(mm, swap_amount(mm));
+ retval |= swap_out_mm(zone, mm, swap_amount(mm));
mmput(mm);
} while (--counter >= 0);
return;
@@ -426,7 +434,7 @@
#define MAX_LAUNDER (4 * (1 << page_cluster))
#define CAN_DO_FS (gfp_mask & __GFP_FS)
#define CAN_DO_IO (gfp_mask & __GFP_IO)
-int page_launder(int gfp_mask, int sync)
+int do_page_launder(zone_t *zone, int gfp_mask, int sync)
{
int launder_loop, maxscan, cleaned_pages, maxlaunder;
struct list_head * page_lru;
@@ -461,6 +469,17 @@
continue;
}

+ /*
+ * If we are doing zone-specific laundering,
+ * avoid touching pages from zones which do
+ * not have a free shortage.
+ */
+ if (zone && !zone_free_shortage(page->zone)) {
+ list_del(page_lru);
+ list_add(page_lru, &inactive_dirty_list);
+ continue;
+ }
+
/*
* The page is locked. IO in progress?
* Move it to the back of the list.
@@ -574,8 +593,13 @@
* If we're freeing buffer cache pages, stop when
* we've got enough free memory.
*/
- if (freed_page && !free_shortage())
- break;
+ if (freed_page) {
+ if (zone) {
+ if (!zone_free_shortage(zone))
+ break;
+ } else if (!free_shortage())
+ break;
+ }
continue;
} else if (page->mapping && !PageDirty(page)) {
/*
@@ -613,7 +637,8 @@
* loads, flush out the dirty pages before we have to wait on
* IO.
*/
- if (CAN_DO_IO && !launder_loop && free_shortage()) {
+ if (CAN_DO_IO && !launder_loop && (free_shortage()
+ || (zone && zone_free_shortage(zone)))) {
launder_loop = 1;
/* If we cleaned pages, never do synchronous IO. */
if (cleaned_pages)
@@ -629,6 +654,34 @@
return cleaned_pages;
}

+int page_launder(int gfp_mask, int sync)
+{
+ int type = 0;
+ int ret = 0;
+ pg_data_t *pgdat = pgdat_list;
+ /*
+ * First do a global scan if there is a
+ * global shortage.
+ */
+ if (free_shortage())
+ ret += do_page_launder(NULL, gfp_mask, sync);
+
+ /*
+ * Then check if there is any specific zone
+ * which needs laundering.
+ */
+ for (type = 0; type < MAX_NR_ZONES; type++) {
+ zone_t *zone = pgdat->node_zones + type;
+
+ if (zone_free_shortage(zone))
+ ret += do_page_launder(zone, gfp_mask, sync);
+ }
+
+ return ret;
+}
+
+
+
/**
* refill_inactive_scan - scan the active list and find pages to deactivate
* @priority: the priority at which to scan
@@ -637,7 +690,7 @@
* This function will scan a portion of the active list to find
* unused pages, those pages will then be moved to the inactive list.
*/
-int refill_inactive_scan(unsigned int priority, int target)
+int refill_inactive_scan(zone_t *zone, unsigned int priority, int target)
{
struct list_head * page_lru;
struct page * page;
@@ -665,6 +718,16 @@
continue;
}

+ /*
+ * If we are doing zone-specific scanning, ignore
+ * pages from zones without shortage.
+ */
+
+ if (zone && !zone_inactive_shortage(page->zone)) {
+ page_active = 1;
+ goto skip_page;
+ }
+
/* Do aging on the pages. */
if (PageTestandClearReferenced(page)) {
age_page_up_nolock(page);
@@ -694,6 +757,7 @@
* to the other end of the list. Otherwise we exit if
* we have done enough work.
*/
+skip_page:
if (page_active || PageActive(page)) {
list_del(page_lru);
list_add(page_lru, &active_list);
@@ -709,12 +773,10 @@
}

/*
- * Check if there are zones with a severe shortage of free pages,
- * or if all zones have a minor shortage.
+ * Check if we are low on free pages globally.
*/
int free_shortage(void)
{
- pg_data_t *pgdat = pgdat_list;
int sum = 0;
int freeable = nr_free_pages() + nr_inactive_clean_pages();
int freetarget = freepages.high;
@@ -722,6 +784,22 @@
/* Are we low on free pages globally? */
if (freeable < freetarget)
return freetarget - freeable;
+ return 0;
+}
+
+/*
+ *
+ * Check if there are zones with a severe shortage of free pages,
+ * or if all zones have a minor shortage.
+ */
+int total_free_shortage(void)
+{
+ int sum = 0;
+ pg_data_t *pgdat = pgdat_list;
+
+ /* Do we have a global free shortage? */
+ if((sum = free_shortage()))
+ return sum;

/* If not, are we very low on any particular zone? */
do {
@@ -739,15 +817,15 @@
} while (pgdat);

return sum;
+
}

/*
- * How many inactive pages are we short?
+ * How many inactive pages are we short globally?
*/
int inactive_shortage(void)
{
int shortage = 0;
- pg_data_t *pgdat = pgdat_list;

/* Is the inactive dirty list too small? */

@@ -759,10 +837,20 @@

if (shortage > 0)
return shortage;
+ return 0;
+}
+/*
+ * Are we low on inactive pages globally or in any zone?
+ */
+int total_inactive_shortage(void)
+{
+ int shortage = 0;
+ pg_data_t *pgdat = pgdat_list;

- /* If not, do we have enough per-zone pages on the inactive list? */
+ if((shortage = inactive_shortage()))
+ return shortage;

- shortage = 0;
+ shortage = 0;

do {
int i;
@@ -802,7 +890,7 @@
* when called from a user process.
*/
#define DEF_PRIORITY (6)
-static int refill_inactive(unsigned int gfp_mask, int user)
+static int refill_inactive_global(unsigned int gfp_mask, int user)
{
int count, start_count, maxtry;

@@ -824,9 +912,9 @@
}

/* Walk the VM space for a bit.. */
- swap_out(DEF_PRIORITY, gfp_mask);
+ swap_out(NULL, DEF_PRIORITY, gfp_mask);

- count -= refill_inactive_scan(DEF_PRIORITY, count);
+ count -= refill_inactive_scan(NULL, DEF_PRIORITY, count);
if (count <= 0)
goto done;

@@ -839,6 +927,60 @@
return (count < start_count);
}

+static int refill_inactive_zone(zone_t *zone, unsigned int gfp_mask, int user)
+{
+ int count, start_count, maxtry;
+
+ count = start_count = zone_inactive_shortage(zone);
+
+ maxtry = (1 << DEF_PRIORITY);
+
+ do {
+ swap_out(zone, DEF_PRIORITY, gfp_mask);
+
+ count -= refill_inactive_scan(zone, DEF_PRIORITY, count);
+
+ if (count <= 0)
+ goto done;
+
+ if (--maxtry <= 0)
+ return 0;
+
+ } while(zone_inactive_shortage(zone));
+done:
+ return (count < start_count);
+}
+
+
+static int refill_inactive(unsigned int gfp_mask, int user)
+{
+ int type = 0;
+ int ret = 0;
+ pg_data_t *pgdat = pgdat_list;
+ /*
+ * First do a global scan if there is a
+ * global shortage.
+ */
+ if (inactive_shortage())
+ ret += refill_inactive_global(gfp_mask, user);
+
+ /*
+ * Then check if there is any specific zone
+ * with a shortage and try to refill it if
+ * so.
+ */
+ for (type = 0; type < MAX_NR_ZONES; type++) {
+ zone_t *zone = pgdat->node_zones + type;
+
+ if (zone_inactive_shortage(zone))
+ ret += refill_inactive_zone(zone, gfp_mask, user);
+ }
+
+ return ret;
+}
+
+#define DEF_PRIORITY (6)
+
static int do_try_to_free_pages(unsigned int gfp_mask, int user)
{
int ret = 0;
@@ -851,8 +993,10 @@
* before we get around to moving them to the other
* list, so this is a relatively cheap operation.
*/
- if (free_shortage()) {
- ret += page_launder(gfp_mask, user);
+
+ ret += page_launder(gfp_mask, user);
+
+ if (total_free_shortage()) {
shrink_dcache_memory(DEF_PRIORITY, gfp_mask);
shrink_icache_memory(DEF_PRIORITY, gfp_mask);
}
@@ -861,8 +1005,7 @@
* If needed, we move pages from the active list
* to the inactive list.
*/
- if (inactive_shortage())
- ret += refill_inactive(gfp_mask, user);
+ ret += refill_inactive(gfp_mask, user);

/*
* Reclaim unused slab cache if memory is low.
@@ -917,7 +1060,7 @@
static long recalc = 0;

/* If needed, try to free some memory. */
- if (inactive_shortage() || free_shortage())
+ if (total_inactive_shortage() || total_free_shortage())
do_try_to_free_pages(GFP_KSWAPD, 0);

/* Once a second ... */
@@ -928,7 +1071,7 @@
recalculate_vm_stats();

/* Do background page aging. */
- refill_inactive_scan(DEF_PRIORITY, 0);
+ refill_inactive_scan(NULL, DEF_PRIORITY, 0);
}

run_task_queue(&tq_disk);
@@ -944,7 +1087,7 @@
* We go to sleep for one second, but if it's needed
* we'll be woken up earlier...
*/
- if (!free_shortage() || !inactive_shortage()) {
+ if (!total_free_shortage() || !total_inactive_shortage()) {
interruptible_sleep_on_timeout(&kswapd_wait, HZ);
/*
* If we couldn't free enough memory, we see if it was



2001-07-18 02:06:59

by Linus Torvalds

Subject: Re: Inclusion of zoned inactive/free shortage patch


On Tue, 17 Jul 2001, Marcelo Tosatti wrote:
>
> This fixes most of the highmem problems (I'm not able to deadlock a 4GB
> machine running memory-intensive programs with the patch anymore. I've
> also received one success report from Dirk Wetter running two 2GB
> simulations on a 4GB machine).

Do you have any really compelling reasons for adding the zone parameter to
swap-out?

At worst, we get a few more page-faults (not IO). At best, NOT doing this
should generate a more complete picture of the VM state. I'd really prefer
the VM scanning to not be zone-aware..

Linus

2001-07-18 02:51:12

by Marcelo Tosatti

Subject: Re: Inclusion of zoned inactive/free shortage patch



On Tue, 17 Jul 2001, Linus Torvalds wrote:

>
> On Tue, 17 Jul 2001, Marcelo Tosatti wrote:
> >
> > This fixes most of the highmem problems (I'm not able to deadlock a 4GB
> > machine running memory-intensive programs with the patch anymore. I've
> > also received one success report from Dirk Wetter running two 2GB
> > simulations on a 4GB machine).
>
> Do you have any really compelling reasons for adding the zone parameter to
> swap-out?

Avoid the page-faults and unnecessary swap space allocation.

> At worst, we get a few more page-faults (not IO).

I don't think it's just a "few more", depending on the setup... I've seen
"__get_swap_page()" using 99% of the system's CPU time due to a DMA-specific
inactive shortage while the kernel was aging/unmapping pte's pointing to
normal/highmem pages for quite some time. As soon as the DMA inactive
shortage is gone, the problem goes away.

That is the main reason why I did zone specific pte scanning.
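
The expensive part is that every anonymous pte we unmap needs a swap
entry allocated for it, i.e. roughly (a sketch of the relevant bit of
try_to_swap_out(), not the exact code):

        swp_entry_t entry;

        /* Roughly: a dirty anonymous page needs swap space allocated
         * before its pte can be unmapped. */
        entry = get_swap_page();
        if (!entry.val)
                return;         /* no swap space left */

So unmapping pte's in zones which have no shortage just burns swap
entries and CPU for nothing.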

> At best, NOT doing this should generate a more complete picture of the
> VM state.

Indeed. That's the price we have to pay...

> I'd really prefer the VM scanning to not be zone-aware..

Right, but think about small/big zones on the same machine.

If we have a _specific_ inactive shortage on the DMA zone on a highmem
machine with shitloads of memory, it's not worth potentially unmapping
all pte's pointing to all high/normal memory.

Practical example: 4GB machine, running two "fillmem" (2GB each).

The following stats are for DMA specific "swap_out()" calls.

vm_pteskipzone 2534665 <-- Number of pte's skipped because they pointed
                           to non-DMA zones.
vm_ptescan       13984 <-- Number of pte's pointing to DMA pages scanned.
vm_pteunmap       6320 <-- From "vm_ptescan", how many pte's have been
                           successfully unmapped.

Now imagine that on a 16GB machine. It's a big storm of unnecessary
soft faults/swap space allocation.

It's a tradeoff: I think the unnecessary pte unmaps are a bigger problem
than the "not complete picture" of the VM state.



2001-07-18 03:57:43

by Linus Torvalds

Subject: Re: Inclusion of zoned inactive/free shortage patch


On Tue, 17 Jul 2001, Marcelo Tosatti wrote:
> >
> > Do you have any really compelling reasons for adding the zone parameter to
> > swap-out?
>
> Avoid the page-faults and unecessary swap space allocation.

In that case, what's the argument for not just replacing the zone
parameter with

/* If we have enough free pages in this zone, don't bother */
if (page->zone->nrpages > page->zone->high)
return;

which works without having a silly single-zone special case (think
multiple small zones, all under pressure, and one large zone that hasn't
seen pressure in ages).

A single-zone parameter just looks fundamentally broken. How do you
determine "which zone"? All allocations are really about zone _lists_, not
single zones.

This same test (maybe nicely abstracted with something like a
"page_zone_pressure(page)" inline function) makes sense in pretty much all
the scanning functions. We want to _age_ the pages in such zones, but we
may not actually want to do anything further.
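
Something like this, purely as a sketch (keeping the illustrative
"nrpages"/"high" names from the test above rather than the exact zone_t
field names):

static inline int page_zone_pressure(struct page *page)
{
        zone_t *zone = page->zone;

        /* "Plenty" of pages left in this zone -> no pressure. */
        return !(zone->nrpages > zone->high);
}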

Comments?

Linus

2001-07-18 04:27:37

by Marcelo Tosatti

Subject: Re: Inclusion of zoned inactive/free shortage patch



On Tue, 17 Jul 2001, Linus Torvalds wrote:

>
> On Tue, 17 Jul 2001, Marcelo Tosatti wrote:
> > >
> > > Do you have any really compelling reasons for adding the zone parameter to
> > > swap-out?
> >
> > Avoid the page-faults and unecessary swap space allocation.
>
> In that case, what's the argument for not just replacing the zone
> parameter with
>
> /* If we have enough free pages in this zone, don't bother */
> if (page->zone->nrpages > page->zone->high)
> return;

Inactive shortage and free shortage are different things.

Think about the case where you have high inactive shortage and no free
shortage at all.

It is a valid and real case.

> which works without having a silly single-zone special case (think
> multiple small zones, all under pressure, and one large zone that hasn't
> seen pressure in ages).

There is no problem with such a case. (if that is what you mean)

        /*
         * If we are doing a zone-specific scan, do not
         * touch pages from zones which don't have a
         * shortage.
         */
        if (zone && !zone_inactive_shortage(page->zone))
                return;

We will always do pte aging for zones which have an inactive shortage, if
doing zone specific scanning or not.

> A single-zone parameter just looks fundamentally broken.

The "zone" parameter passed to swap_out() means "don't unmap pte's mapping
to pages belonging to not-under-shortage zones". It can (and it should) be
replaced by a "zone_specific" parameter.
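
I.e. something like this (sketch only; the helper name is made up):

/*
 * Sketch: the boolean flag that would replace the zone_t * argument
 * in swap_out()/try_to_swap_out().
 */
static inline int skip_for_zone_scan(int zone_specific, struct page *page)
{
        return zone_specific && !zone_inactive_shortage(page->zone);
}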

> How do you determine "which zone"? All allocations are really about
> zone _lists_, not single zones.
>
> This same test (maybe nicely abstraced with something like a
> "page_zone_pressure(page)" inline function) makes sense in pretty much all
> the scanning functions. We want to _age_ the pages in such zones, but we
> may not actually want to do anything further.
>
> Comments?

I haven't exactly understood what you mean here, but I suppose changing
the "zone_t *zone" parameter to "int zone_specific" answers your question,
right?

Or am I missing something?

2001-07-18 04:56:31

by Linus Torvalds

Subject: Re: Inclusion of zoned inactive/free shortage patch


On Tue, 17 Jul 2001, Marcelo Tosatti wrote:
>
> > A single-zone parameter just looks fundamentally broken.
>
> The "zone" parameter passed to swap_out() means "don't unmap pte's mapping
> to pages belonging to not-under-shortage zones". It can (and it should) be
> replaced by a "zone_specific" parameter.

Ahh.

In fact, it should be replaced by a single bit.

Passing in a "zone *" and then using it purely as a boolean makes no
sense.

But that still makes me ask: why do you have that (misnamed, and
mis-typed) boolean there in the first place? Why not just unconditionally
have the "zone_shortage(page->zone)"?

Linus

2001-07-18 05:18:16

by Marcelo Tosatti

Subject: Re: Inclusion of zoned inactive/free shortage patch



On Tue, 17 Jul 2001, Linus Torvalds wrote:

>
> On Tue, 17 Jul 2001, Marcelo Tosatti wrote:
> >
> > > A single-zone parameter just looks fundamentally broken.
> >
> > The "zone" parameter passed to swap_out() means "don't unmap pte's mapping
> > to pages belonging to not-under-shortage zones". It can (and it should) be
> > replaced by a "zone_specific" parameter.
>
> Ahh.
>
> In fact, it should be replaced by a single bit.
>
> Passing in a "zone *" and then using it purely as a boolean makes no
> sense.

Right.

> But that still makes me ask: why do you have that (misnamed, and
> mis-typed) boolean there in the first place?

Because I thought about doing something like:

        /*
         * Avoid touching pages which are not
         * from the zone being scanned.
         */
        if (page->zone != zone)
                return;

But then I figured out that it was stupid.

I ended up using the "zone_t *zone" as a boolean and forgot to change it
before sending the patch.

> Why not just unconditionally have the "zone_shortage(page->zone)"?

Because I tried to avoid strict perzone shortage handling, keeping the
global scanning to have _some_ "fair" aging between the zones.

The active/inactive dirty lists are shared by all zones, and a page's
position there is an indication of its age.

So in most cases we are "fair" wrt list position and do global scanning.
Now if there is a real need, we do perzone scanning.
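
In code terms, the dispatch in the patch boils down to this (condensed
from the refill_inactive() in the diff above):

static int refill_inactive(unsigned int gfp_mask, int user)
{
        int type, ret = 0;
        pg_data_t *pgdat = pgdat_list;

        /* Global scan first, if there is a global shortage... */
        if (inactive_shortage())
                ret += refill_inactive_global(gfp_mask, user);

        /* ...then only the zones which still show a shortage. */
        for (type = 0; type < MAX_NR_ZONES; type++) {
                zone_t *zone = pgdat->node_zones + type;

                if (zone_inactive_shortage(zone))
                        ret += refill_inactive_zone(zone, gfp_mask, user);
        }
        return ret;
}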

Comments?

2001-07-18 13:23:33

by Rik van Riel

Subject: Re: Inclusion of zoned inactive/free shortage patch

On Tue, 17 Jul 2001, Marcelo Tosatti wrote:

> The following patch (against 2.4.6-ac2, already merged in 2.4.6-ac3) adds
> specific perzone inactive/free shortage handling code.

Marcelo, now that you have the nice VM statistics
patch, do you have some numbers on how this patch
affects the system, or is this patch based on
guesswork ? ;)

Rik
--
Virtual memory is like a game you can't win;
However, without VM there's truly nothing to lose...

http://www.surriel.com/ http://distro.conectiva.com/

Send all your spam to [email protected] (spam digging piggy)

2001-07-18 13:28:13

by Rik van Riel

Subject: Re: Inclusion of zoned inactive/free shortage patch

On Tue, 17 Jul 2001, Linus Torvalds wrote:

> In that case, what's the argument for not just replacing the zone
> parameter with
>
> /* If we have enough free pages in this zone, don't bother */
> if (page->zone->nrpages > page->zone->high)
> return;

> Comments?

Won't work. If it did, it'd just bring us back to the
pathetic situation we had in 2.3.51, but with the
introduction of inactive_clean pages and an inactive
target, all this test would do is either prevent
us from ever making the inactive target or keep us
from getting the eviction balancing between zones
right (see 2.3.51).

regards,

Rik
--
Virtual memory is like a game you can't win;
However, without VM there's truly nothing to lose...

http://www.surriel.com/ http://distro.conectiva.com/

Send all your spam to [email protected] (spam digging piggy)

2001-07-18 16:32:17

by Linus Torvalds

Subject: Re: Inclusion of zoned inactive/free shortage patch


On Wed, 18 Jul 2001, Marcelo Tosatti wrote:
>
> I ended up using the "zone_t *zone" as a boolean and forgot to change it
> before sending the patch.

Ok.

My main worry really is that I absolutely detest special cases. Especially
special cases that just make the code uglier.

If it is right and necessary to _sometimes_ take zone inactive shortage
into account, why not do it always?

I realize that this makes a difference for how you'd do the test. If you
do it sometimes, you have something like

        if (shortage_only && !inactive_shortage(page->zone))
                return;

while if you decide that it is actually ok to always have this heuristic
you'd probably write it

        if (inactive_plenty(page->zone))
                return;

instead. See the difference? The first one says "this time we're only
interested in zones that have shortage". The second one says "in general,
if we have plenty of inactive pages in this zone, we don't want to bother
with it".

The reason I'd much prefer the latter is:
- code that doesn't have special cases is more likely to be correct and
have good behaviour over a wide variety of loads - simply because it
gets tested under _all_ loads, not just the loads that trigger the
special cases
- code like the above means that we can more gradually approach the state
of some zone shortage. We can do background shortage scanning, and
nicely handle the case where we're not actually _short_ on any zone,
but some zones are getting close to being short. Which should make the
load smoother.

> Because I tried to avoid strict perzone shortage handling, keeping the
> global scanning to have _some_ "fair" aging between the zones.

Sure. But at the same time, if some zone has tons of memory and another
zone doesn't, then it is ok to say "we can ignore the zone with lots of
memory for now".

Yes, it's "unfair". Yes, it will cause the tight zone to be aged out
quicker. But yes, that's actually what we want.

Think something more NUMA-like for example - imagine walking a VM tree
where the process has pages mapped from multiple nodes. At the same time,
because of node affinity, some nodes would end up being under higher
memory pressure because of the processes they are running. Do we want to
age the pages on those nodes faster? Sure.

And we do NOT want to get into the situation that one zone/node ends up
being close to the shortage line all the time, and then when it crosses
over we have a clear "behaviour change".

Changing behaviour like that is bad.

Linus

2001-07-18 18:24:02

by Mike Galbraith

Subject: Re: Inclusion of zoned inactive/free shortage patch

On Wed, 18 Jul 2001, Rik van Riel wrote:

> On Tue, 17 Jul 2001, Marcelo Tosatti wrote:
>
> > The following patch (against 2.4.6-ac2, already merged in 2.4.6-ac3) adds
> > specific perzone inactive/free shortage handling code.
>
> Marcelo, now that you have the nice VM statistics
> patch, do you have some numbers on how this patch
> affects the system, or is this patch based on
> guesswork ? ;)

Have you read Dirk's logs or read the pertinent threads at all?

-Mike

2001-07-18 18:39:34

by Rik van Riel

Subject: Re: Inclusion of zoned inactive/free shortage patch

On Wed, 18 Jul 2001, Mike Galbraith wrote:
> On Wed, 18 Jul 2001, Rik van Riel wrote:

> > Marcelo, now that you have the nice VM statistics
> > patch, do you have some numbers on how this patch
> > affects the system, or is this patch based on
> > guesswork ? ;)
>
> Have you read Dirk's logs or read the pertinent threads at all?

Yes. Read the one from "Mon, 16 Jul 2001 15:14:02 -0400 (EDT)", the
one where Dirk replies to his own email with "wishful thinking ;-(".

regards,

Rik
--
Virtual memory is like a game you can't win;
However, without VM there's truly nothing to lose...

http://www.surriel.com/ http://distro.conectiva.com/

Send all your spam to [email protected] (spam digging piggy)


2001-07-18 19:26:54

by Mike Galbraith

Subject: Re: Inclusion of zoned inactive/free shortage patch

On Wed, 18 Jul 2001, Rik van Riel wrote:

> On Wed, 18 Jul 2001, Mike Galbraith wrote:
> > On Wed, 18 Jul 2001, Rik van Riel wrote:
>
> > > Marcelo, now that you have the nice VM statistics
> > > patch, do you have some numbers on how this patch
> > > affects the system, or is this patch based on
> > > guesswork ? ;)
> >
> > Have you read Dirk's logs or read the pertinent threads at all?
>
> Yes. Read the one from "Mon, 16 Jul 2001 15:14:02 -0400 (EDT)", the
> one where Dirk replies to his own email with "wishful thinking ;-(".

You didn't pay enough attention. Marcelo is hot on the trail
of a problem.

-Mike

2001-07-18 20:02:36

by Rik van Riel

Subject: Re: Inclusion of zoned inactive/free shortage patch

On Wed, 18 Jul 2001, Mike Galbraith wrote:

> You didn't paying enough attention. Marcelo is hot on the trail
> of a problem.

Looks like it indeed; however, I think it's a shame
he isn't showing us the numbers from his nice VM
statistics patch.

Having those numbers, we could all help think about
a solution, while now we can do nothing but look at
the philosophical side of Marcelo's patch ;)

Rik
--
Virtual memory is like a game you can't win;
However, without VM there's truly nothing to lose...

http://www.surriel.com/ http://distro.conectiva.com/

Send all your spam to [email protected] (spam digging piggy)

2001-07-18 21:22:15

by Marcelo Tosatti

Subject: Re: Inclusion of zoned inactive/free shortage patch



On Wed, 18 Jul 2001, Rik van Riel wrote:

> On Tue, 17 Jul 2001, Marcelo Tosatti wrote:
>
> > The following patch (against 2.4.6-ac2, already merged in 2.4.6-ac3) adds
> > specific perzone inactive/free shortage handling code.
>
> Marcelo, now that you have the nice VM statistics
> patch, do you have some numbers on how this patch
> affects the system,

Yes.

With the old code, I've seen zone specific shortages which caused the
kernel to free/deactivate pages from all zones.

> or is this patch based on guesswork ? ;)

Even if I did not have the stats, it's senseless to free/deactivate pages
from zones which do not need it.

The old behaviour was fundamentally broken.

2001-07-18 21:38:01

by Rik van Riel

Subject: Re: Inclusion of zoned inactive/free shortage patch

On Wed, 18 Jul 2001, Marcelo Tosatti wrote:

> Even if I did not had the stats, its senseless to free/deactivate pages
> from zones which do not need to.
>
> The old behaviour was fundamentally broken.

Both behaviours are fundamentally broken. On the one hand
we WILL want to deactivate the least used pages, so that for
GFP_HIGHUSER allocations we know which zone to allocate
from.

On the other hand we want to avoid work in the pageout
code when it isn't needed.

regards,

Rik
--
Virtual memory is like a game you can't win;
However, without VM there's truly nothing to lose...

http://www.surriel.com/ http://distro.conectiva.com/

Send all your spam to [email protected] (spam digging piggy)

2001-07-18 22:17:29

by Marcelo Tosatti

Subject: Re: Inclusion of zoned inactive/free shortage patch



On Wed, 18 Jul 2001, Linus Torvalds wrote:

>
> On Wed, 18 Jul 2001, Marcelo Tosatti wrote:
> >
> > I ended up using the "zone_t *zone" as a boolean and forgot to change it
> > before sending the patch.
>
> Ok.
>
> My main worry really is that I absolutely detest special cases. Especially
> special cases that just make the code uglier.
>
> If it is right and necessary to _sometimes_ take zone inactive shortage
> into account, why not do it always?
>
> I realize that this makes a difference for how you'd do the test. If you
> do it sometimes, you have something like
>
> if (shortage_only && !inactive_shortage(page->zone))
> return;
>
> while if you decide that it is actually ok to always have this heuristic
> you'd probably write it
>
> if (inactive_plenty(page->zone))
> return;
>
> instead. See the difference? The first one says "this time we're only
> interested in zones that have shortage". The second one says "in general,
> if we have plenty of inactive pages in this zone, we don't want to bother
> with it".
>
> The reason I'd much prefer the latter is:
> - code that doesn't have special cases is more likely to be correct and
> have good behaviour over a wide variety of loads - simply because it
> gets tested under _all_ loads, not just the loads that trigger the
> special cases
> - code like the above means that we can more gradually approach the state
> of some zone shortage. We can do background shortage scanning, and
> nicely handle the case where we're not actually _short_ on any zone,
> but some zones are getting close to being short. Which should make the
> load smoother.
>
> > Because I tried to avoid strict perzone shortage handling, keeping the
> > global scanning to have _some_ "fair" aging between the zones.
>
> Sure. But at the same time, if some zone has tons of memory and another
> zone doesn't, then it is ok to say "we can ignore the zone with lots of
> memory for now".
>
> Yes, it's "unfair". Yes, it will cause the tight zone to be aged out
> quicker. But yes, that's actually what we want.
>
> Think something more NUMA-like for example - imagine walking a VM tree
> where the process has pages mapped from multiple nodes. At the same time,
> because of node affinity, some nodes would end up being under higher
> memory pressure because of the processes they are running. Do we want to
> age the pages on those nodes faster? Sure.
>
> And we do NOT want to get into the situation that one zone/node ends up
> being close to the shortage line all the time, and then when it crosses
> over we have a clear "behaviour change".
>
> Changing behaviour like that is bad.

Ok, I understand and I agree with doing _unconditional_
"zone_inactive_plenty()" instead of conditional
"zone_inactive_shortage()".

This way we do not get _strict_ zoned behaviour (with strict I mean only
doing scanning for zones which have a shortage), making the shortage
handling smoother and doing "fair" aging in cases where there are not
specific zones under pressure.


2001-07-18 22:24:28

by Linus Torvalds

Subject: Re: Inclusion of zoned inactive/free shortage patch


On Wed, 18 Jul 2001, Marcelo Tosatti wrote:
>
> Ok, I understand and I agree with doing _unconditional_
> "zone_inactive_plenty()" instead of conditional
> "zone_inactive_shortage()".
>
> This way we do not get _strict_ zoned behaviour (with strict I mean only
> doing scanning for zones which have a shortage), making the shortage
> handling smoother and doing "fair" aging in cases where there are not
> specific zones under pressure.

Cool.

Willing to write a patch and give it some preliminary testing? I also
agree with the patch Rik sent in about GFP_HIGHUSER, that's orthogonal
though (even if I suspect it could also have made the problem _appear_
much much more clearly).

I'd like to do a real pre7 one of these days (it's already growing big
enough, thank you), but I'd love to have this issue put at least somewhat
to rest.

Linus

2001-07-18 22:30:30

by Marcelo Tosatti

Subject: Re: Inclusion of zoned inactive/free shortage patch



On Wed, 18 Jul 2001, Linus Torvalds wrote:

>
> On Wed, 18 Jul 2001, Marcelo Tosatti wrote:
> >
> > Ok, I understand and I agree with doing _unconditional_
> > "zone_inactive_plenty()" instead of conditional
> > "zone_inactive_shortage()".
> >
> > This way we do not get _strict_ zoned behaviour (with strict I mean only
> > doing scanning for zones which have a shortage), making the shortage
> > handling smoother and doing "fair" aging in cases where there are not
> > specific zones under pressure.
>
> Cool.
>
> Willing to write a patch and give it some preliminary testing?

Sure. However it's not _that_ easy. We do have a global inactive target.

There is no perzone inactive shortage, which is needed to calculate
"zone_inactive_plenty()".

> I also agree with the patch Rik sent in about GFP_HIGHUSER, that's
> orthogonal though (even if I suspect it could also have made the
> problem _appear_ much much more clearly).

Right.

2001-07-18 22:51:34

by Linus Torvalds

Subject: Re: Inclusion of zoned inactive/free shortage patch


On Wed, 18 Jul 2001, Marcelo Tosatti wrote:
> >
> > Cool.
> >
> > Willing to write a patch and give it some preliminary testing?
>
> Sure. However its not _that_ easy. We do have a global inactive target.

Absolutely. But this is why it's easier to have the more relaxed
constraints - we have to make sure that the global inactive target is
clearly lower than the sum of the relaxed local targets.

That way, whenever there is a global inactive need, we _clearly_ have one
or more zones (usually _all_ of them with any reasonably balanced system:
and note how this whole logic will strive to _add_ balance) that will
trigger the test, and there is no worry that we get into the nasty case
where we suddenly start to try to overly cannibalize one zone horribly.

> There is no perzone inactive shortage, which is needed to calculate
> "zone_inactive_plenty()".

Right. But the global inactive shortage can certainly be a clue to how to
do this.

So the global shortage is effectively the MAX of

        (freepages.high + inactive_target) - nr_free_pages - inactive_clean - inactive_dirty

or the per-zone shortage.

So for this discussion we can ignore the per-zone shortage case (because
_obviously_ the per-zone "inactive_plenty()" cannot be a shortage of
inactive ;), and only concentrate on making sure that the sum of the
per-zone inactive_plenty decisions is noticeably more than the global
shortage (for example, by a factor of two, or something like that). So one
suggestion would be to take the same logic as the global shortage, but
apply it to just the local zone, and then multiply by two (as the slop to
make sure that we don't ever under-estimate).

So something like

inactive_plenty(zone)
{
        if (!zone->nrpages)
                return 0;
        shortage = zone->pages_high;
        shortage -= zone->inactive_dirty_pages;
        shortage -= zone->inactive_clean_pages;
        shortage -= zone->free_pages;

        /* zone inactive-target is 1/2 of the number of pages */
        return shortage < zone->nrpages / 2;
}

(Notice how the "global inactive target" is at most 1/4 of all the
available memory - so now we make the per-zone "inactive target" half of
the zone memory, which means that there's no way we can return "plenty of
inactive" from all zones when we're on global shortage).

Also note how being generous here means that we're not changing the
existing setup all that much - inactive_plenty() will start to refuse
zones only if there really is PLENTY of inactive pages, so we are also
being very conservative here - we're changing existing behaviour only in
cases where we clearly have a balancing problem.

And being conservative during a 2.4.x release is a good thing.

Linus

2001-07-18 22:53:24

by Daniel Phillips

Subject: Re: Inclusion of zoned inactive/free shortage patch

On Wednesday 18 July 2001 18:30, Linus Torvalds wrote:
> My main worry really is that I absolutely detest special cases.
> Especially special cases that just make the code uglier.
>
> If it is right and necessary to _sometimes_ take zone inactive
> shortage into account, why not do it always?
>
> I realize that this makes a difference for how you'd do the test. If
> you do it sometimes, you have something like
>
> if (shortage_only && !inactive_shortage(page->zone))
> return;
>
> while if you decide that it is actually ok to always have this
> heuristic you'd probably write it
>
> if (inactive_plenty(page->zone))
> return;
>
> instead. See the difference? The first one says "this time we're only
> interested in zones that have shortage". The second one says "in
> general, if we have plenty of inactive pages in this zone, we don't
> want to bother with it".

I see it as a continuum, IOW a signed value, so:

inactive_plenty() == -inactive_shortage()

And from each zone we want to deactivate in proportion to:

inactive_shortage_zone(zone) / inactive_shortage_total()

I don't really see much use for inactive_shortage_total() by itself,
except maybe deciding when to scan vs sitting idle.
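
Something like this, as a rough sketch (the two helpers are the
hypothetical ones named above):

/*
 * Give each zone a deactivation target proportional to its share
 * of the total inactive shortage.
 */
static int zone_deactivation_target(zone_t *zone, int global_target)
{
        int total = inactive_shortage_total();

        if (total <= 0)
                return 0;
        return global_target * inactive_shortage_zone(zone) / total;
}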

> The reason I'd much prefer the latter is:
> - code that doesn't have special cases is more likely to be correct
> and have good behaviour over a wide variety of loads - simply because
> it gets tested under _all_ loads, not just the loads that trigger the
> special cases
> - code like the above means that we can more gradually approach the
> state of some zone shortage. We can do background shortage scanning,
> and nicely handle the case where we're not actually _short_ on any
> zone, but some zones are getting close to being short. Which should
> make the load smoother.

Yes, and it can be even more like that, it's analog thinking instead
of digital.

> On Wed, 18 Jul 2001, Marcelo Tosatti wrote:
> > Because I tried to avoid strict perzone shortage handling, keeping
> > the global scanning to have _some_ "fair" aging between the zones.
>
> Sure. But at the same time, if some zone has tons of memory and
> another zone doesn't, then it is ok to say "we can ignore the zone
> with lots of memory for now".
>
> Yes, it's "unfair". Yes, it will cause the tight zone to be aged out
> quicker. But yes, that's actually what we want.

Yes, this whole thing is really an "aha", or maybe it's more
accurate to call it a "duh". It's actually hard to come up with a
case where you don't want the per-zone aging (and laundering for
that matter, but that sort-of comes for free with this). One case
is where you have some option about where to allocate pages - say
_highmem is tight but _normal still has lots of slack. It's not
necessarily better to favor deactivation of highmem in that case.
It won't hurt either, so, my question is - how could per-zone aging
ever lead to trouble?

> Think something more NUMA-like for example - imagine walking a VM
> tree where the process has pages mapped from multiple nodes. At the
> same time, because of node affinity, some nodes would end up being
> under higher memory pressure because of the processes they are
> running. Do we want to age the pages on those nodes faster? Sure.

Getting more analog all the time.

> And we do NOT want to get into the situation that one zone/node ends
> up being close to the shortage line all the time, and then when it
> crosses over we have a clear "behaviour change".
> Changing behaviour like that is bad.

Uhuh, uhuh.

--
Daniel

2001-07-18 23:40:00

by Daniel Phillips

Subject: Re: Inclusion of zoned inactive/free shortage patch

On Thursday 19 July 2001 00:59, Linus Torvalds wrote:
> On Thu, 19 Jul 2001, Daniel Phillips wrote:
> > I don't really see much use for inactive_shortage_total() by
> > itself, except maybe deciding when to scan vs sitting idle.
>
> Absolutely. But that's an important decision in itself. Getting that
> decision wrong means that we either scan too little (and which point
> the question of per-zone shortages becomes moot, because by the time
> we start scanning we're too deep in trouble to be able to do a good
> gradual job anyway). Or we scan too much, and then the per-zone
> shortage just means that we'll always have so much inactive stuff in
> all the zones that we'll continue scanning forever - because none of
> the zones (correctly) feel that they have any reason to actually free
> anything.

Yes, scanning too much is mainly a cpu waste, but it also has the bad
effect of degrading quality of the long term aging information
(because pages aren't fairly aged while they're on the inactive queues.)

I'm thinking about an alternative way of aging that lets ->age be a
signed value, then when you get a surge in demand for deactivation
you just raise the threshold at which deactivation takes place. This
would be in addition to scanning more, but should save a lot of cpu.
This is exactly at the time that it's good to save cpu. There's still
a lot of thinking to do about what the function of the current
exponential down-aging really is, and how to capture the good effects
with subtracts instead of shifts. Caveat: not 2.4 stuff, obviously.
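
As a very rough sketch of what I mean (definitely not 2.4 material,
and the threshold bookkeeping is hand-waved away):

/*
 * With a signed page->age, a surge in deactivation demand just
 * raises the threshold instead of forcing us to scan harder.
 */
static inline int should_deactivate(struct page *page, int deact_threshold)
{
        return page->age < deact_threshold;
}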

> So the global inactive_shortage() decision is certainly an important
> one: it should trigger early enough to matter, but not so early that
> we trigger it even when most local zones are really totally saturated
> and we really shouldn't be scanning at all.

Yes. The inactive shortage needs to be a function of the length of
the inactive_dirty queue rather than just the amount by which free
pages fall short of some fixed minimum. The target length of the
inactive_dirty queue in turn can be a function of the global free
shortage (which is where the minimum free numbers get used) and the
transfer rate of the disk(s). Again, experimental - without careful
work a feedback mechanism like this could oscillate wildly. It's most
probably the way forward in the long run though.
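
Hand-waving even more, the target could look something like this
(disk_write_rate() and FLUSH_INTERVAL are entirely hypothetical):

/*
 * Sketch: scale the inactive target with the global free shortage
 * and with what the disks can actually write back per interval,
 * instead of using a fixed minimum.
 */
int inactive_target(void)
{
        int target = 2 * free_shortage();

        target += disk_write_rate() * FLUSH_INTERVAL;
        return target;
}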

--
Daniel

2001-07-18 23:42:40

by Marcelo Tosatti

Subject: Re: Inclusion of zoned inactive/free shortage patch



On Wed, 18 Jul 2001, Linus Torvalds wrote:

>
> On Wed, 18 Jul 2001, Marcelo Tosatti wrote:
> > >
> > > Cool.
> > >
> > > Willing to write a patch and give it some preliminary testing?
> >
> > Sure. However its not _that_ easy. We do have a global inactive target.
>
> Absolutely. But this is why it's easier to have the more relaxed
> constraints - we have to make sure that the global inactive target is
> clearly lower than the sum of the relaxed local targets.
>
> That way, whenever there is a global inactive need, we _clearly_ have one
> or more zones (usually _all_ of them with any reasonably balanced system:
> and note how this whole logic will strive to _add_ balance) that will
> trigger the test, and there is no worry that we get into the nasty case
> where we suddenly start to try to overly cannibalize one zone horribly.
>
> > There is no perzone inactive shortage, which is needed to calculate
> > "zone_inactive_plenty()".
>
> Right. But the gobal inactivity shortage can certainly be a clue to how to
> do this.
>
> So when the global shortage is effectively
>
> MAX of
>
> (freepages.high + inactive_target) - nr_free_pages - inactive_clean - inactive_dirty
>
> or
>
> per-zone shortage.
>
> So for this discussion we can ignore the per-zone shortage case (because
> _obviously_ the per-zone "inactive_plenty()" cannot be a shortage of
> inactive ;), and only concentrate on making sure that the sum of the
> per-zone inactive_plenty decisions is noticeably more than the global
> shortage (for example, by a factor of two, or something like that). So one
> suggestion would be to take the same logic as the global shortage, but
> apply it to just the local zone, and then multiply by two (as the slop to
> make sure that we don't every under-estimate).
>
> So something like
>
> inactive_plenty(zone)
> {
> if (!zone->nrpages)
> return 0;
> shortage = zone->pages_high;
> shortage -= zone->inactive_dirty_pages;
> shortage -= zone->inactive_clean_pages;
> shortage -= zone->free_pages;
>
> /* zone inactive-target is 1/2 of the number of pages */
> return shortage < zone->nrpages / 2;
> }

Wait. Don't you mean:

/* True if we have enough inactive pages for this zone */

inactive_plenty(zone)
{
        if (!zone->nrpages)
                return 0;
        inactive = zone->inactive_dirty_pages;
        inactive += zone->inactive_clean_pages;
        inactive += zone->free_pages;

        return (inactive < zone->nrpages / 2);
}

?


2001-07-18 23:45:40

by Marcelo Tosatti

Subject: Re: Inclusion of zoned inactive/free shortage patch



On Wed, 18 Jul 2001, Marcelo Tosatti wrote:

>
>
> On Wed, 18 Jul 2001, Linus Torvalds wrote:
>
> >
> > On Wed, 18 Jul 2001, Marcelo Tosatti wrote:
> > > >
> > > > Cool.
> > > >
> > > > Willing to write a patch and give it some preliminary testing?
> > >
> > > Sure. However its not _that_ easy. We do have a global inactive target.
> >
> > Absolutely. But this is why it's easier to have the more relaxed
> > constraints - we have to make sure that the global inactive target is
> > clearly lower than the sum of the relaxed local targets.
> >
> > That way, whenever there is a global inactive need, we _clearly_ have one
> > or more zones (usually _all_ of them with any reasonably balanced system:
> > and note how this whole logic will strive to _add_ balance) that will
> > trigger the test, and there is no worry that we get into the nasty case
> > where we suddenly start to try to overly cannibalize one zone horribly.
> >
> > > There is no perzone inactive shortage, which is needed to calculate
> > > "zone_inactive_plenty()".
> >
> > Right. But the gobal inactivity shortage can certainly be a clue to how to
> > do this.
> >
> > So when the global shortage is effectively
> >
> > MAX of
> >
> > (freepages.high + inactive_target) - nr_free_pages - inactive_clean - inactive_dirty
> >
> > or
> >
> > per-zone shortage.
> >
> > So for this discussion we can ignore the per-zone shortage case (because
> > _obviously_ the per-zone "inactive_plenty()" cannot be a shortage of
> > inactive ;), and only concentrate on making sure that the sum of the
> > per-zone inactive_plenty decisions is noticeably more than the global
> > shortage (for example, by a factor of two, or something like that). So one
> > suggestion would be to take the same logic as the global shortage, but
> > apply it to just the local zone, and then multiply by two (as the slop to
> > make sure that we don't every under-estimate).
> >
> > So something like
> >
> > inactive_plenty(zone)
> > {
> > if (!zone->nrpages)
> > return 0;
> > shortage = zone->pages_high;
> > shortage -= zone->inactive_dirty_pages;
> > shortage -= zone->inactive_clean_pages;
> > shortage -= zone->free_pages;
> >
> > /* zone inactive-target is 1/2 of the number of pages */
> > return shortage < zone->nrpages / 2;
> > }
>
> Wait. Don't you mean:
>
> /* True if we have enough inactive pages for this zone */
>
> inactive_plenty(zone)
> {
> if (!zone->nrpages)
> return 0;
> inactive = zone->inactive_dirty_pages;
> inactive += zone->inactive_clean_pages;
> inactive += zone->free_pages
>
> return (inactive < zone->nrpages / 2);
^^
Err I mean >

> }
>
> ?

2001-07-19 01:04:04

by Linus Torvalds

Subject: Re: Inclusion of zoned inactive/free shortage patch


On Wed, 18 Jul 2001, Marcelo Tosatti wrote:
> >
> > Wait. Don't you mean:

Yes. Just ignore me when I show extreme signs of Alzheimer's.

Linus

2001-07-18 23:00:54

by Linus Torvalds

Subject: Re: Inclusion of zoned inactive/free shortage patch


On Thu, 19 Jul 2001, Daniel Phillips wrote:
>
> I don't really see much use for inactive_shortage_total() by itself,
> except maybe deciding when to scan vs sitting idle.

Absolutely. But that's an important decision in itself. Getting that
decision wrong means that we either scan too little (at which point the
question of per-zone shortages becomes moot, because by the time we start
scanning we're too deep in trouble to be able to do a good gradual job
anyway). Or we scan too much, and then the per-zone shortage just means
that we'll always have so much inactive stuff in all the zones that we'll
continue scanning forever - because none of the zones (correctly) feel
that they have any reason to actually free anything.

So the global inactive_shortage() decision is certainly an important one:
it should trigger early enough to matter, but not so early that we trigger
it even when most local zones are really totally saturated and we really
shouldn't be scanning at all.

Linus

2001-07-18 22:57:04

by Marcelo Tosatti

Subject: Re: Inclusion of zoned inactive/free shortage patch



On Wed, 18 Jul 2001, Rik van Riel wrote:

> On Wed, 18 Jul 2001, Mike Galbraith wrote:
>
> > You didn't paying enough attention. Marcelo is hot on the trail
> > of a problem.
>
> Looks like it indeed, however I think it's a shame
> he isn't showing us the numbers from his nice VM
> statistics patch.

MemTotal: 900012 kB
SwapTotal: 775152 kB

Running "fillmem 1024".

Note that in general, the DMA zone is under high shortage while the kernel
is working on the normal zone (of course, it's bigger).

That behaviour changes with the zoned approach: if there is no global
shortage but only a zone-specific shortage, the kernel _will_ do work on
the zone which needs it.

r b w swpd free buff cache si so bi bo in cs us sy id
1 0 0 426500 281628 852 432444 16 155 25 158 17 32 0 1 99
launder launder_w ref_inact alloc_r kswapd_w krec_w kflush_w
0 0 0 2 0 12 1
Zone fshort ishort scan clean skipl skipd launder react rescue
DMA 0 0 0 0 0 0 0 0 0
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
0 0 3 0 0 0 11 4 0
Zone fshort ishort scan clean skipl skipd launder react rescue
Normal 0 0 562 45 47 419 46 5 7
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
377 324 225 76 300 0 3 316 121

r b w swpd free buff cache si so bi bo in cs us sy id
1 0 0 335228 3056 852 341540 0 0 0 12 253 575 1 12 87
launder launder_w ref_inact alloc_r kswapd_w krec_w kflush_w
0 0 1 0 0 0 0
Zone fshort ishort scan clean skipl skipd launder react rescue
DMA 0 129 0 0 0 0 0 0 0
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
0 0 0 0 0 0 0 0 0
Zone fshort ishort scan clean skipl skipd launder react rescue
Normal 0 0 0 0 0 0 0 0 29
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
53 25 28 13 37 4 0 0 0


r b w swpd free buff cache si so bi bo in cs us sy id
1 0 0 509944 3056 272 510396 98 106 210 306 260 334 1 21 79
launder launder_w ref_inact alloc_r kswapd_w krec_w kflush_w
5 1 108 2 1 0 27
Zone fshort ishort scan clean skipl skipd launder react rescue
DMA 0 129 0 0 0 0 0 0 0
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
0 0 1642 0 0 0 0 1642 0
Zone fshort ishort scan clean skipl skipd launder react rescue
Normal 0 0 236302 17670 30 201475 16872 68 22
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
34648 33889 77114 636 33812 200 1 118400 42045

r b w swpd free buff cache si so bi bo in cs us sy id
0 1 2 471872 2544 216 462360 0 10654 2 10708 343 542 1 10 89
launder launder_w ref_inact alloc_r kswapd_w krec_w kflush_w
4 4 18 332 3 76 18
Zone fshort ishort scan clean skipl skipd launder react rescue
DMA 1 257 0 0 0 0 0 0 0
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
0 0 0 0 0 0 138 1642 1642
Zone fshort ishort scan clean skipl skipd launder react rescue
Normal 0 0 81667 17994 2331 41469 19618 403 0
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
28879 28867 92 3870 25003 6 329 16572 16491


r b w swpd free buff cache si so bi bo in cs us sy id
1 1 1 607020 2544 244 542816 34 30898 322 30926 527 7558 1 7 92
launder launder_w ref_inact alloc_r kswapd_w krec_w kflush_w
10 10 15 390 0 3542 151
Zone fshort ishort scan clean skipl skipd launder react rescue
DMA 1 257 0 0 0 0 0 0 0
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
0 0 0 0 0 0 3471 0 0
Zone fshort ishort scan clean skipl skipd launder react rescue
Normal 0 0 62712 7751 7720 38225 7996 958 3
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
25581 25441 216 6722 18837 23 777 17274 17199


Do you see a DMA free and inactive shortage (causing a lot of reclaim_page()
failures, the recfail field) while the kernel is aging/unmapping/laundering
normal zone pages?


r b w swpd free buff cache si so bi bo in cs us sy id
0 3 1 610412 2544 376 504444 186 30950 1244 31064 597 6933 0 6 94
launder launder_w ref_inact alloc_r kswapd_w krec_w kflush_w
22 22 15 425 0 3088 331
Zone fshort ishort scan clean skipl skipd launder react rescue
DMA 1 257 0 0 0 0 0 0 0
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
1642 1642 0 0 1642 0 2974 0 0
Zone fshort ishort scan clean skipl skipd launder react rescue
Normal 0 0 144379 6638 9992 117916 8302 1190 0
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
27446 27421 13227 8025 19421 1 820 13745 542


r b w swpd free buff cache si so bi bo in cs us sy id
1 1 2 610328 2544 472 440108 228 31238 846 31298 574 9233 0 8 92
launder launder_w ref_inact alloc_r kswapd_w krec_w kflush_w
17 17 18 462 0 4401 238
Zone fshort ishort scan clean skipl skipd launder react rescue
DMA 1 257 0 0 0 0 0 0 0
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
0 0 0 0 0 0 4179 0 0
Zone fshort ishort scan clean skipl skipd launder react rescue
Normal 0 0 107342 7447 9510 79205 8364 1258 2
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
28298 28068 19713 8903 19395 1 897 19629 149

r b w swpd free buff cache si so bi bo in cs us sy id
1 0 1 610328 2556 484 433224 0 26072 8 26186 578 1604 12 2 86
launder launder_w ref_inact alloc_r kswapd_w krec_w kflush_w
15 15 20 0 0 391 194
Zone fshort ishort scan clean skipl skipd launder react rescue
DMA 1 257 0 0 0 0 0 0 0
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
0 0 64 0 0 0 379 64 0
Zone fshort ishort scan clean skipl skipd launder react rescue
Normal 0 0 35576 7482 6884 14220 6866 0 0
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
19709 19653 19544 7007 12702 0 0 19518 30

r b w swpd free buff cache si so bi bo in cs us sy id
0 1 1 610332 2672 508 432924 0 4216 42 4222 326 738 10 3 87
launder launder_w ref_inact alloc_r kswapd_w krec_w kflush_w
5 5 7 0 0 23 90
Zone fshort ishort scan clean skipl skipd launder react rescue
DMA 1 257 0 0 0 0 0 0 0
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
1642 423 1219 0 1642 0 21 0 0
Zone fshort ishort scan clean skipl skipd launder react rescue
Normal 0 0 19051 1191 1491 12808 1255 2319 0
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
10232 6463 8453 3252 6981 0 0 4685 2

r b w swpd free buff cache si so bi bo in cs us sy id
0 2 1 745672 2548 328 569368 558 1934 564 1976 226 259 2 14 84
launder launder_w ref_inact alloc_r kswapd_w krec_w kflush_w
12 12 98 0 0 2 187
Zone fshort ishort scan clean skipl skipd launder react rescue
DMA 1 257 0 0 0 0 0 0 0
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
4925 4502 2129 0 4925 0 2 3347 1642
Zone fshort ishort scan clean skipl skipd launder react rescue
Normal 0 0 9070 2628 1105 4375 759 16 125
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
144232 106821 120225 3703 140524 5 0 134754 51940

r b w swpd free buff cache si so bi bo in cs us sy id
0 2 1 745676 2544 360 568628 1264 3630 1730 3692 234 608 1 1 99
launder launder_w ref_inact alloc_r kswapd_w krec_w kflush_w
7 7 2 0 0 305 118
Zone fshort ishort scan clean skipl skipd launder react rescue
DMA 1 257 0 0 0 0 0 0 0
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
0 0 0 0 0 0 199 0 0
Zone fshort ishort scan clean skipl skipd launder react rescue
Normal 0 0 40894 919 1454 37376 1056 0 1488
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
6194 6083 131 2106 4088 0 0 1691 1672

r b w swpd free buff cache si so bi bo in cs us sy id
0 1 1 745664 2548 348 568640 1064 4150 1068 4152 208 358 0 0 100
launder launder_w ref_inact alloc_r kswapd_w krec_w kflush_w
1 1 1 0 0 165 9
Zone fshort ishort scan clean skipl skipd launder react rescue
DMA 1 257 0 0 0 0 0 0 0
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
0 0 0 0 0 0 97 0 0
Zone fshort ishort scan clean skipl skipd launder react rescue
Normal 0 0 5250 929 1083 2182 1055 0 3506
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
4005 4001 12 912 3094 0 0 12 5

r b w swpd free buff cache si so bi bo in cs us sy id
0 2 1 745664 2544 348 568644 1250 3838 1250 3838 194 426 0 0 100
launder launder_w ref_inact alloc_r kswapd_w krec_w kflush_w
0 0 0 0 0 197 0
Zone fshort ishort scan clean skipl skipd launder react rescue
DMA 1 257 0 0 0 0 0 0 0
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
0 0 0 0 0 0 109 0 0
Zone fshort ishort scan clean skipl skipd launder react rescue
Normal 0 0 1919 8 952 0 967 0 494
agescan agedown ageup deact deactf_age deactf_ref recfail ptes pteu
0 0 0 0 0 0 0 0 0

2001-07-19 09:02:11

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: Inclusion of zoned inactive/free shortage patch



On Wed, 18 Jul 2001, Linus Torvalds wrote:

>
> On Wed, 18 Jul 2001, Marcelo Tosatti wrote:
> > >
> > > Wait. Don't you mean:
>
> Yes. Just ignore me when I show extreme signs of Alzheimers.

Ok.

Well, here is a patch on top of -ac5 (which already includes the first
zone-based approach patch).

I changed inactive_plenty() to use "zone->size / 3" instead of
"zone->size / 2".

Under _my_ tests, using half of the perzone total pages as the inactive
target was too high.

I also changed refill_inactive_scan() to return only the number of pages
deactivated from the zone we are scanning (if perzone scanning is being
done at all, of course). This avoids miscounting the number of deactivated
pages.
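
As a rough user-space model of that accounting change (simplified stand-in
types and a flat page array instead of the kernel's LRU list; this is not
the kernel code itself), the counting logic amounts to:

/*
 * Toy model of the refill_inactive_scan() accounting fix: pages are
 * still deactivated as before, but only the ones belonging to the
 * zone we were asked to refill count toward the target.
 */
#include <stdio.h>

struct zone { const char *name; };
struct page { struct zone *zone; int active; };

static int refill_scan(struct zone *zone, struct page *pages,
                       int npages, int target)
{
        int nr_deactivated = 0;

        for (int i = 0; i < npages; i++) {
                if (!pages[i].active)
                        continue;
                pages[i].active = 0;                    /* deactivate the page */
                if (!zone || zone == pages[i].zone)     /* count only our zone */
                        nr_deactivated++;
                if (target && nr_deactivated >= target)
                        break;
        }
        return nr_deactivated;
}

int main(void)
{
        struct zone dma = { "DMA" }, normal = { "Normal" };
        struct page pages[] = {
                { &normal, 1 }, { &dma, 1 }, { &normal, 1 }, { &dma, 1 },
        };

        /* Ask for two pages deactivated from the DMA zone. */
        printf("counted for DMA: %d\n", refill_scan(&dma, pages, 4, 2));
        return 0;
}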

I'm still using a zone pointer as a boolean in try_to_swap_out(). Reason:
it's 6am, I've already generated the patch, and the cab is down there
waiting.


Comments?

diff -Nur --exclude-from=/home/marcelo/exclude linux.orig/include/linux/swap.h linux/include/linux/swap.h
--- linux.orig/include/linux/swap.h Thu Jul 19 05:40:34 2001
+++ linux/include/linux/swap.h Wed Jul 18 23:19:05 2001
@@ -131,6 +131,7 @@

extern unsigned int zone_free_shortage(zone_t *zone);
extern unsigned int zone_inactive_shortage(zone_t *zone);
+extern unsigned int zone_inactive_plenty(zone_t *zone);

/* linux/mm/page_io.c */
extern void rw_swap_page(int, struct page *);
diff -Nur --exclude-from=/home/marcelo/exclude linux.orig/mm/page_alloc.c linux/mm/page_alloc.c
--- linux.orig/mm/page_alloc.c Thu Jul 19 05:40:34 2001
+++ linux/mm/page_alloc.c Thu Jul 19 05:41:39 2001
@@ -707,6 +707,20 @@
return sum;
}

+unsigned int zone_inactive_plenty(zone_t *zone)
+{
+ int inactive;
+
+ if (!zone->size)
+ return 0;
+
+ inactive = zone->inactive_dirty_pages;
+ inactive += zone->inactive_clean_pages;
+ inactive += zone->free_pages;
+
+ return (inactive > (zone->size / 3));
+
+}
unsigned int zone_inactive_shortage(zone_t *zone)
{
int sum = 0;
diff -Nur --exclude-from=/home/marcelo/exclude linux.orig/mm/vmscan.c linux/mm/vmscan.c
--- linux.orig/mm/vmscan.c Thu Jul 19 05:40:34 2001
+++ linux/mm/vmscan.c Thu Jul 19 05:48:18 2001
@@ -46,7 +46,7 @@
* touch pages from zones which don't have a
* shortage.
*/
- if (zone && !zone_inactive_shortage(page->zone))
+ if (zone_inactive_plenty(page->zone))
return;

/* Don't look at this pte if it's been accessed recently. */
@@ -637,8 +637,7 @@
* loads, flush out the dirty pages before we have to wait on
* IO.
*/
- if (CAN_DO_IO && !launder_loop && (free_shortage()
- || (zone && zone_free_shortage(zone)))) {
+ if (CAN_DO_IO && !launder_loop && total_free_shortage()) {
launder_loop = 1;
/* If we cleaned pages, never do synchronous IO. */
if (cleaned_pages)
@@ -718,11 +717,11 @@
}

/*
- * If we are doing zone-specific scanning, ignore
- * pages from zones without shortage.
+ * Do not deactivate pages from zones which
+ * have plenty of inactive pages.
*/

- if (zone && !zone_inactive_shortage(page->zone)) {
+ if (zone_inactive_plenty(page->zone)) {
page_active = 1;
goto skip_page;
}
@@ -756,12 +755,13 @@
* to the other end of the list. Otherwise we exit if
* we have done enough work.
*/
-skip_page:
if (page_active || PageActive(page)) {
+skip_page:
list_del(page_lru);
list_add(page_lru, &active_list);
} else {
- nr_deactivated++;
+ if (!zone || (zone && (zone == page->zone)))
+ nr_deactivated++;
if (target && nr_deactivated >= target)
break;
}

2001-07-19 16:29:44

by Daniel Phillips

[permalink] [raw]
Subject: Re: Inclusion of zoned inactive/free shortage patch

On Thursday 19 July 2001 01:42, Daniel Phillips wrote:
> Yes. The inactive shortage needs to be a function of the length of
> the inactive_dirty queue rather than just the amount that free pages
> is less than some fixed minimum.

Oops, it already is, good :-]

> The target length of the
> inactive_dirty queue in turn can be a function of the global free
> shortage (which is where the minimum free numbers get used)

ditto, it's already that way...

> and the transfer rate of the disk(s).

This we don't do, and afaics, this is the only way to get stability
across a really wide range of loads, and on system configurations we
can't possibly pre-tune for.

> Again, experimental - without careful
> work a feedback mechanism like this could oscillate wildly. It's
> most probably the way forward in the long run though.
>
> --
> Daniel
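
None of the posted patches implement the disk-rate feedback sketched
above. A purely hypothetical user-space illustration of the idea (the
function name, the 2-second window and the 7/8 smoothing factor are all
invented here) could look like:

/*
 * Hypothetical sketch of a disk-rate-driven inactive target: size the
 * inactive_dirty target to cover a few seconds of writeback at the
 * currently observed disk throughput, and smooth the measurement so
 * the target cannot swing wildly from one sample to the next.
 */
#include <stdio.h>

#define SECONDS_OF_IO   2       /* cover ~2s of writeback (made-up figure) */

static unsigned long smoothed_rate;     /* pages/s, exponentially smoothed */

static unsigned long inactive_target(unsigned long free_shortage,
                                     unsigned long measured_rate)
{
        /* Exponential moving average: 7/8 old estimate + 1/8 new sample. */
        smoothed_rate = (smoothed_rate * 7 + measured_rate) / 8;

        /* Global free shortage plus what the disks can write back. */
        return free_shortage + smoothed_rate * SECONDS_OF_IO;
}

int main(void)
{
        /* A few hypothetical samples of pages written back per second. */
        unsigned long samples[] = { 1000, 8000, 8000, 500 };

        for (int i = 0; i < 4; i++)
                printf("inactive target = %lu pages\n",
                       inactive_target(1024, samples[i]));
        return 0;
}

The smoothing is what keeps the target from oscillating, which is exactly
the risk pointed out above.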

2001-07-19 17:13:47

by Linus Torvalds

[permalink] [raw]
Subject: Re: Inclusion of zoned inactive/free shortage patch


On Thu, 19 Jul 2001, Marcelo Tosatti wrote:
>
> Well, here is a patch on top of -ac5 (which already includes the first
> zone-based approach patch).

Looks ok.

I'd like to see what the patch looks like on top of a virgin tree, as it
should now be noticeably smaller (no need to pass extra parameters, etc.).

> I changed inactive_plenty() to use "zone->size / 3" instead of
> "zone->size / 2".
>
> Under _my_ tests using half of the perzone total pages as the inactive
> target was too high.

This is one of the reasons I'd like to see the virgin patch - if the "/2"
is too high, then that should mean that the behaviour is basically
unchanged from before, right? Which would be a good sign that this kicks
in gently - and I agree that "/3" sounds saner (or even "/4" - but we
should double-check that the global inactive function is guaranteed to
never trigger with all zones close to the "/4" target if so).
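
A back-of-the-envelope check of that double-check (the zone sizes and the
global target below are invented numbers, and the real global inactive
target calculation is not reproduced here): if every zone is capped at
size/N of inactive+free pages, the system as a whole is capped too, so the
global shortage test can only go quiet if its target fits under the sum of
the per-zone caps.

/*
 * Rough sanity check for the "/2" vs "/3" vs "/4" question: with every
 * zone capped at size/div inactive+free pages, can the sum of the caps
 * still satisfy a given global inactive target?  All numbers below are
 * hypothetical, for illustration only.
 */
#include <stdio.h>

int main(void)
{
        /* Hypothetical DMA, Normal and HighMem zone sizes, in pages. */
        unsigned long zone_size[] = { 4096, 225280, 786432 };
        unsigned long global_target = 200000;   /* hypothetical global target */

        for (int div = 2; div <= 4; div++) {
                unsigned long capped_sum = 0;

                for (int i = 0; i < 3; i++)
                        capped_sum += zone_size[i] / div;

                printf("/%d: per-zone caps sum to %lu pages -> %s\n",
                       div, capped_sum,
                       capped_sum >= global_target ?
                       "global target reachable" :
                       "global shortage would never go away");
        }
        return 0;
}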

I haven't checked what your changes to "total_free_shortage()" are in the
-ac tree, so I don't know what the effect of that would be.

Linus

2001-07-21 04:26:20

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: Inclusion of zoned inactive/free shortage patch



On Thu, 19 Jul 2001, Linus Torvalds wrote:

>
> On Thu, 19 Jul 2001, Marcelo Tosatti wrote:
> >
> > Well, here is a patch on top of -ac5 (which already includes the first
> > zone-based approach patch).
>
> Looks ok.
>
> I'd like to see what the patch looks like on top of a virgin tree, as it
> should now be noticeably smaller (no need to pass extra parameters, etc.).
>
> > I changed inactive_plenty() to use "zone->size / 3" instead of
> > "zone->size / 2".
> >
> > Under _my_ tests using half of the perzone total pages as the inactive
> > target was too high.
>
> This is one of the reasons I'd like to see the virgin patch - if the "/2"
> is too high, then that should mean that the behaviour is basically
> unchanged from before, right?

Under normal balance conditions (i.e. no zone-specific shortage), yes, the
behaviour of swap_out() is the same.

But refill_inactive_scan() is not the same: now we'll _never_ have any
zone with more than 1/3 of its "lruable" pages on the inactive lists, due
to the "unlimited" background aging. That is a good thing, of course.

> Which would be a good sign that this kicks in gently - and I agree
> that "/3" sounds saner (or even "/4" - but we should double-check that
> the global inactive function is guaranteed to never trigger with all
> zones close to the "/4" target if so).

Well, under my heavy VM tests on highmem machines this test does trigger
quite often, which is expected.

Here it goes. (against pre9)

diff --exclude-from=/home/marcelo/exclude -Nur linux.orig/include/linux/swap.h linux/include/linux/swap.h
--- linux.orig/include/linux/swap.h Sat Jul 21 01:06:53 2001
+++ linux/include/linux/swap.h Sat Jul 21 01:16:59 2001
@@ -121,9 +121,15 @@
extern wait_queue_head_t kreclaimd_wait;
extern int page_launder(int, int);
extern int free_shortage(void);
+extern int total_free_shortage(void);
extern int inactive_shortage(void);
+extern int total_inactive_shortage(void);
extern void wakeup_kswapd(void);
extern int try_to_free_pages(unsigned int gfp_mask);
+
+extern unsigned int zone_free_shortage(zone_t *zone);
+extern unsigned int zone_inactive_shortage(zone_t *zone);
+extern unsigned int zone_inactive_plenty(zone_t *zone);

/* linux/mm/page_io.c */
extern void rw_swap_page(int, struct page *);
diff --exclude-from=/home/marcelo/exclude -Nur linux.orig/mm/page_alloc.c linux/mm/page_alloc.c
--- linux.orig/mm/page_alloc.c Sat Jul 21 01:06:53 2001
+++ linux/mm/page_alloc.c Sat Jul 21 01:11:32 2001
@@ -448,7 +448,7 @@
* to give up than to deadlock the kernel looping here.
*/
if (gfp_mask & __GFP_WAIT) {
- if (!order || free_shortage()) {
+ if (!order || total_free_shortage()) {
int progress = try_to_free_pages(gfp_mask);
if (progress || (gfp_mask & __GFP_FS))
goto try_again;
@@ -621,6 +621,53 @@
return pages;
}
#endif
+
+unsigned int zone_free_shortage(zone_t *zone)
+{
+ int sum = 0;
+
+ if (!zone->size)
+ goto ret;
+
+ if (zone->inactive_clean_pages + zone->free_pages
+ < zone->pages_min) {
+ sum += zone->pages_min;
+ sum -= zone->free_pages;
+ sum -= zone->inactive_clean_pages;
+ }
+ret:
+ return sum;
+}
+
+unsigned int zone_inactive_plenty(zone_t *zone)
+{
+ int inactive;
+
+ if (!zone->size)
+ return 0;
+
+ inactive = zone->inactive_dirty_pages;
+ inactive += zone->inactive_clean_pages;
+ inactive += zone->free_pages;
+
+ return (inactive > (zone->size / 3));
+
+}
+unsigned int zone_inactive_shortage(zone_t *zone)
+{
+ int sum = 0;
+
+ if (!zone->size)
+ goto ret;
+
+ sum = zone->pages_high;
+ sum -= zone->inactive_dirty_pages;
+ sum -= zone->inactive_clean_pages;
+ sum -= zone->free_pages;
+
+ret:
+ return (sum > 0 ? sum : 0);
+}

/*
* Show free area list (used inside shift_scroll-lock stuff)
diff --exclude-from=/home/marcelo/exclude -Nur linux.orig/mm/vmscan.c linux/mm/vmscan.c
--- linux.orig/mm/vmscan.c Sat Jul 21 01:06:53 2001
+++ linux/mm/vmscan.c Sat Jul 21 01:24:33 2001
@@ -41,6 +41,14 @@
pte_t pte;
swp_entry_t entry;

+ /*
+ * Do not touch pages from zones which
+ * already have plenty of inactive
+ * pages.
+ */
+ if (zone_inactive_plenty(page->zone))
+ return;
+
/* Don't look at this pte if it's been accessed recently. */
if (ptep_test_and_clear_young(page_table)) {
page->age += PAGE_AGE_ADV;
@@ -426,7 +434,7 @@
#define MAX_LAUNDER (4 * (1 << page_cluster))
#define CAN_DO_FS (gfp_mask & __GFP_FS)
#define CAN_DO_IO (gfp_mask & __GFP_IO)
-int page_launder(int gfp_mask, int sync)
+int do_page_launder(zone_t *zone, int gfp_mask, int sync)
{
int launder_loop, maxscan, cleaned_pages, maxlaunder;
struct list_head * page_lru;
@@ -461,6 +469,17 @@
continue;
}

+ /*
+ * If we are doing zone-specific laundering,
+ * avoid touching pages from zones which do
+ * not have a free shortage.
+ */
+ if (zone && !zone_free_shortage(page->zone)) {
+ list_del(page_lru);
+ list_add(page_lru, &inactive_dirty_list);
+ continue;
+ }
+
/*
* The page is locked. IO in progress?
* Move it to the back of the list.
@@ -574,8 +593,13 @@
* If we're freeing buffer cache pages, stop when
* we've got enough free memory.
*/
- if (freed_page && !free_shortage())
- break;
+ if (freed_page) {
+ if (zone) {
+ if (!zone_free_shortage(zone))
+ break;
+ } else if (!free_shortage())
+ break;
+ }
continue;
} else if (page->mapping && !PageDirty(page)) {
/*
@@ -613,7 +637,7 @@
* loads, flush out the dirty pages before we have to wait on
* IO.
*/
- if (CAN_DO_IO && !launder_loop && free_shortage()) {
+ if (CAN_DO_IO && !launder_loop && total_free_shortage()) {
launder_loop = 1;
/* If we cleaned pages, never do synchronous IO. */
if (cleaned_pages)
@@ -629,6 +653,33 @@
return cleaned_pages;
}

+int page_launder(int gfp_mask, int sync)
+{
+ int type = 0, ret = 0;
+ pg_data_t *pgdat = pgdat_list;
+ /*
+ * First do a global scan if there is a
+ * global shortage.
+ */
+ if (free_shortage())
+ ret += do_page_launder(NULL, gfp_mask, sync);
+
+ /*
+ * Then check if any specific zone
+ * needs laundering.
+ */
+ for (type = 0; type < MAX_NR_ZONES; type++) {
+ zone_t *zone = pgdat->node_zones + type;
+
+ if (zone_free_shortage(zone))
+ ret += do_page_launder(zone, gfp_mask, sync);
+ }
+
+ return ret;
+}
+
+
+
/**
* refill_inactive_scan - scan the active list and find pages to deactivate
* @priority: the priority at which to scan
@@ -637,7 +688,7 @@
* This function will scan a portion of the active list to find
* unused pages, those pages will then be moved to the inactive list.
*/
-int refill_inactive_scan(unsigned int priority, int target)
+int refill_inactive_scan(zone_t *zone, unsigned int priority, int target)
{
struct list_head * page_lru;
struct page * page;
@@ -665,6 +716,16 @@
continue;
}

+ /*
+ * Do not deactivate pages from zones which
+ * have plenty of inactive pages.
+ */
+
+ if (zone_inactive_plenty(page->zone)) {
+ page_active = 1;
+ goto skip_page;
+ }
+
/* Do aging on the pages. */
if (PageTestandClearReferenced(page)) {
age_page_up_nolock(page);
@@ -695,10 +756,12 @@
* we have done enough work.
*/
if (page_active || PageActive(page)) {
+skip_page:
list_del(page_lru);
list_add(page_lru, &active_list);
} else {
- nr_deactivated++;
+ if (!zone || (zone && (zone == page->zone)))
+ nr_deactivated++;
if (target && nr_deactivated >= target)
break;
}
@@ -709,19 +772,32 @@
}

/*
- * Check if there are zones with a severe shortage of free pages,
- * or if all zones have a minor shortage.
+ * Check if we are low on free pages globally.
*/
int free_shortage(void)
{
- pg_data_t *pgdat = pgdat_list;
- int sum = 0;
int freeable = nr_free_pages() + nr_inactive_clean_pages();
int freetarget = freepages.high;

/* Are we low on free pages globally? */
if (freeable < freetarget)
return freetarget - freeable;
+ return 0;
+}
+
+/*
+ *
+ * Check if there are zones with a severe shortage of free pages,
+ * or if all zones have a minor shortage.
+ */
+int total_free_shortage(void)
+{
+ int sum = 0;
+ pg_data_t *pgdat = pgdat_list;
+
+ /* Do we have a global free shortage? */
+ if((sum = free_shortage()))
+ return sum;

/* If not, are we very low on any particular zone? */
do {
@@ -739,15 +815,15 @@
} while (pgdat);

return sum;
+
}

/*
- * How many inactive pages are we short?
+ * How many inactive pages are we short globally?
*/
int inactive_shortage(void)
{
int shortage = 0;
- pg_data_t *pgdat = pgdat_list;

/* Is the inactive dirty list too small? */

@@ -759,10 +835,20 @@

if (shortage > 0)
return shortage;
+ return 0;
+}
+/*
+ * Are we low on inactive pages globally or in any zone?
+ */
+int total_inactive_shortage(void)
+{
+ int shortage = 0;
+ pg_data_t *pgdat = pgdat_list;

- /* If not, do we have enough per-zone pages on the inactive list? */
+ if((shortage = inactive_shortage()))
+ return shortage;

- shortage = 0;
+ shortage = 0;

do {
int i;
@@ -802,7 +888,7 @@
* when called from a user process.
*/
#define DEF_PRIORITY (6)
-static int refill_inactive(unsigned int gfp_mask, int user)
+static int refill_inactive_global(unsigned int gfp_mask, int user)
{
int count, start_count, maxtry;

@@ -826,7 +912,7 @@
/* Walk the VM space for a bit.. */
swap_out(DEF_PRIORITY, gfp_mask);

- count -= refill_inactive_scan(DEF_PRIORITY, count);
+ count -= refill_inactive_scan(NULL, DEF_PRIORITY, count);
if (count <= 0)
goto done;

@@ -839,6 +925,59 @@
return (count < start_count);
}

+static int refill_inactive_zone(zone_t *zone, unsigned int gfp_mask, int user)
+{
+ int count, start_count, maxtry;
+
+ count = start_count = zone_inactive_shortage(zone);
+
+ maxtry = (1 << DEF_PRIORITY);
+
+ do {
+ swap_out(DEF_PRIORITY, gfp_mask);
+
+ count -= refill_inactive_scan(zone, DEF_PRIORITY, count);
+
+ if (count <= 0)
+ goto done;
+
+ if (--maxtry <= 0)
+ return 0;
+
+ } while(zone_inactive_shortage(zone));
+done:
+ return (count < start_count);
+}
+
+
+static int refill_inactive(unsigned int gfp_mask, int user)
+{
+ int type = 0, ret = 0;
+ pg_data_t *pgdat = pgdat_list;
+ /*
+ * First do a global scan if there is a
+ * global shortage.
+ */
+ if (inactive_shortage())
+ ret += refill_inactive_global(gfp_mask, user);
+
+ /*
+ * Then check if there is any specific zone
+ * with a shortage and try to refill it if
+ * so.
+ */
+ for (type = 0; type < MAX_NR_ZONES; type++) {
+ zone_t *zone = pgdat->node_zones + type;
+
+ if (zone_inactive_shortage(zone))
+ ret += refill_inactive_zone(zone, gfp_mask, user);
+ }
+
+ return ret;
+}
+
+#define DEF_PRIORITY (6)
+
static int do_try_to_free_pages(unsigned int gfp_mask, int user)
{
int ret = 0;
@@ -851,8 +990,10 @@
* before we get around to moving them to the other
* list, so this is a relatively cheap operation.
*/
- if (free_shortage()) {
- ret += page_launder(gfp_mask, user);
+
+ ret += page_launder(gfp_mask, user);
+
+ if (total_free_shortage()) {
shrink_dcache_memory(DEF_PRIORITY, gfp_mask);
shrink_icache_memory(DEF_PRIORITY, gfp_mask);
}
@@ -861,8 +1002,7 @@
* If needed, we move pages from the active list
* to the inactive list.
*/
- if (inactive_shortage())
- ret += refill_inactive(gfp_mask, user);
+ ret += refill_inactive(gfp_mask, user);

/*
* Reclaim unused slab cache if memory is low.
@@ -917,7 +1057,7 @@
static long recalc = 0;

/* If needed, try to free some memory. */
- if (inactive_shortage() || free_shortage())
+ if (total_inactive_shortage() || total_free_shortage())
do_try_to_free_pages(GFP_KSWAPD, 0);

/* Once a second ... */
@@ -928,7 +1068,7 @@
recalculate_vm_stats();

/* Do background page aging. */
- refill_inactive_scan(DEF_PRIORITY, 0);
+ refill_inactive_scan(NULL, DEF_PRIORITY, 0);
}

run_task_queue(&tq_disk);
@@ -944,7 +1084,7 @@
* We go to sleep for one second, but if it's needed
* we'll be woken up earlier...
*/
- if (!free_shortage() || !inactive_shortage()) {
+ if (!total_free_shortage() || !total_inactive_shortage()) {
interruptible_sleep_on_timeout(&kswapd_wait, HZ);
/*
* If we couldn't free enough memory, we see if it was