2007-01-15 09:39:53

by Roy Huang

[permalink] [raw]
Subject: [PATCH] Provide an interface to limit total page cache.

This patch provides an interface to limit the total page cache via
/proc/sys/vm/pagecache_ratio. The default value is 90 percent. Any
feedback is appreciated.

-Roy

diff -urp a/include/linux/pagemap.h b/include/linux/pagemap.h
--- a/include/linux/pagemap.h 2006-11-30 05:57:37.000000000 +0800
+++ b/include/linux/pagemap.h 2007-01-15 17:03:09.000000000 +0800
@@ -12,6 +12,12 @@
#include <asm/uaccess.h>
#include <linux/gfp.h>

+extern int pagecache_ratio;
+extern long pagecache_limit;
+
+int pagecache_ratio_sysctl_handler(struct ctl_table *, int,
+ struct file *, void __user *, size_t *, loff_t *);
+
/*
* Bits in mapping->flags. The lower __GFP_BITS_SHIFT bits are the page
* allocation mode flags.
diff -urp a/include/linux/sysctl.h b/include/linux/sysctl.h
--- a/include/linux/sysctl.h 2007-01-15 17:18:46.000000000 +0800
+++ b/include/linux/sysctl.h 2007-01-15 17:03:09.000000000 +0800
@@ -202,6 +202,7 @@ enum
VM_PANIC_ON_OOM=33, /* panic at out-of-memory */
VM_VDSO_ENABLED=34, /* map VDSO into new processes? */
VM_MIN_SLAB=35, /* Percent pages ignored by zone reclaim */
+ VM_PAGECACHE_RATIO=36, /* Percent memory is used as page cache */
};


diff -urp a/kernel/sysctl.c b/kernel/sysctl.c
--- a/kernel/sysctl.c 2007-01-15 17:18:46.000000000 +0800
+++ b/kernel/sysctl.c 2007-01-15 17:03:09.000000000 +0800
@@ -1035,6 +1035,15 @@ static ctl_table vm_table[] = {
.extra1 = &zero,
},
#endif
+ {
+ .ctl_name = VM_PAGECACHE_RATIO,
+ .procname = "pagecache_ratio",
+ .data = &pagecache_ratio,
+ .maxlen = sizeof(pagecache_ratio),
+ .mode = 0644,
+ .proc_handler = &pagecache_ratio_sysctl_handler,
+ .strategy = &sysctl_intvec,
+ },
{ .ctl_name = 0 }
};

diff -urp a/mm/filemap.c b/mm/filemap.c
--- a/mm/filemap.c 2007-01-15 17:18:46.000000000 +0800
+++ b/mm/filemap.c 2007-01-15 17:03:09.000000000 +0800
@@ -30,6 +30,7 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/cpuset.h>
+#include <linux/sysctl.h>
#include "filemap.h"
#include "internal.h"

@@ -108,6 +109,48 @@ generic_file_direct_IO(int rw, struct ki
*/

/*
+ * Start release pagecache (via kswapd) at the percentage.
+ */
+int pagecache_ratio __read_mostly = 90;
+
+long pagecache_limit = 0;
+
+int setup_pagecache_limit(void)
+{
+ pagecache_limit = pagecache_ratio * nr_free_pagecache_pages() / 100;
+ return 0;
+}
+
+int pagecache_ratio_sysctl_handler(ctl_table *table, int write,
+ struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+ proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+ setup_pagecache_limit();
+ return 0;
+}
+
+static inline int balance_pagecache(void)
+{
+ if (global_page_state(NR_FILE_PAGES) > pagecache_limit) {
+ int nid, j;
+ pg_data_t *pgdat;
+ struct zone *zone;
+
+ for_each_online_node(nid) {
+ pgdat = NODE_DATA(nid);
+ for (j = 0; j < MAX_NR_ZONES; j++) {
+ zone = pgdat->node_zones + j;
+ wakeup_kswapd(zone, 0);
+ }
+ }
+ }
+
+ return 0;
+}
+
+module_init(setup_pagecache_limit)
+
+/*
* Remove a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
* is safe. The caller must hold a write_lock on the mapping's tree_lock.
@@ -1085,6 +1128,8 @@ out:
page_cache_release(cached_page);
if (filp)
file_accessed(filp);
+
+ balance_pagecache();
}
EXPORT_SYMBOL(do_generic_mapping_read);

@@ -2212,6 +2257,8 @@ zero_length_segment:
status = filemap_write_and_wait(mapping);

pagevec_lru_add(&lru_pvec);
+ balance_pagecache();
+
return written ? written : status;
}
EXPORT_SYMBOL(generic_file_buffered_write);
diff -urp a/mm/vmscan.c b/mm/vmscan.c
--- a/mm/vmscan.c 2007-01-15 17:18:46.000000000 +0800
+++ b/mm/vmscan.c 2007-01-15 17:03:09.000000000 +0800
@@ -1316,6 +1316,7 @@ static int kswapd(void *p)
order = 0;
for ( ; ; ) {
unsigned long new_order;
+ long over_limit;

try_to_freeze();

@@ -1335,6 +1336,9 @@ static int kswapd(void *p)
finish_wait(&pgdat->kswapd_wait, &wait);

balance_pgdat(pgdat, order);
+ over_limit = global_page_state(NR_FILE_PAGES) - pagecache_limit;
+ if (over_limit > 0)
+ shrink_all_memory(over_limit);
}
return 0;
}
@@ -1350,8 +1354,10 @@ void wakeup_kswapd(struct zone *zone, in
return;

pgdat = zone->zone_pgdat;
- if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
- return;
+ if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) {
+ if (global_page_state(NR_FILE_PAGES) < pagecache_limit)
+ return;
+ }
if (pgdat->kswapd_max_order < order)
pgdat->kswapd_max_order = order;
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
@@ -1361,7 +1367,6 @@ void wakeup_kswapd(struct zone *zone, in
wake_up_interruptible(&pgdat->kswapd_wait);
}

-#ifdef CONFIG_PM
/*
* Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
* from LRU lists system-wide, for given pass and priority, and returns the
@@ -1510,7 +1515,6 @@ out:

return ret;
}
-#endif

/* It's optimal to keep kswapds on the same CPUs as their memory, but
not required for correctness. So if the last cpu in a node goes


2007-01-15 11:01:25

by Balbir Singh

[permalink] [raw]
Subject: Re: [PATCH] Provide an interface to limit total page cache.

On 1/15/07, Roy Huang <[email protected]> wrote:
> A patch provide a interface to limit total page cache in
> /proc/sys/vm/pagecache_ratio. The default value is 90 percent. Any
> feedback is appreciated.
>

[snip]

wakeup_kswapd and shrink_all_memory use swappiness to determine what to reclaim
(mapped pages or page cache). This patch does not ensure that only page cache
is reclaimed/limited. If the swappiness value is high, mapped pages will be hit.

One could get similar functionality by implementing resource management.

Resource management splits tasks into groups and manages resources for the
groups rather than for the whole system. Such a facility will come with a
resource controller for memory (split into finer-grained rss/page cache/mlock'ed
memory, etc.), one for cpu, etc.

Balbir

2007-01-15 11:57:53

by Vaidyanathan Srinivasan

[permalink] [raw]
Subject: Re: [PATCH] Provide an interface to limit total page cache.


Roy Huang wrote:
> A patch provide a interface to limit total page cache in
> /proc/sys/vm/pagecache_ratio. The default value is 90 percent. Any
> feedback is appreciated.

[snip]

I tried to run your patch on PPC64 SMP machine, unfortunately kswapd
crashes the kernel when the pagecache limit is exceeded!

->dd if=/dev/zero of=/tmp/foo bs=1M count=1200
cpu 0x0: Vector: 300 (Data Access) at [c0000000012d7ad0]
pc: c0000000000976ac: .kswapd+0x3a4/0x4f0
lr: c0000000000976ac: .kswapd+0x3a4/0x4f0
sp: c0000000012d7d50
msr: 8000000000009032
dar: 0
dsisr: 42000000
current = 0xc00000000fed7040
paca = 0xc00000000063fb80
pid = 134, comm = kswapd0
------------[ cut here ]------------
enter ? for help
[c0000000012d7ee0] c000000000069150 .kthread+0x124/0x174
[c0000000012d7f90] c0000000000247b4 .kernel_thread+0x4c/0x68
0:mon>

Steps to recreate fail:

# sync
# echo 1 > /proc/sys/vm/drop_caches
MemTotal: 1014584 kB
MemFree: 905536 kB
Buffers: 3232 kB
Cached: 57628 kB
SwapCached: 0 kB
Active: 47664 kB
Inactive: 33160 kB
SwapTotal: 1526164 kB
SwapFree: 1526164 kB
Dirty: 108 kB
Writeback: 0 kB
AnonPages: 19976 kB
Mapped: 15084 kB
Slab: 19724 kB
SReclaimable: 8536 kB
SUnreclaim: 11188 kB
PageTables: 972 kB
NFS_Unstable: 0 kB
Bounce: 0 kB
CommitLimit: 2033456 kB
Committed_AS: 87884 kB
VmallocTotal: 8589934592 kB
VmallocUsed: 2440 kB
VmallocChunk: 8589932152 kB
HugePages_Total: 0
HugePages_Free: 0
HugePages_Rsvd: 0
Hugepagesize: 16384 kB

# echo 50 > /proc/sys/vm/pagecache_ratio
# dd if=/dev/zero of=/tmp/foo bs=1M count=1200

Basically fill pagecache with overlimit dirty file pages and check
if the reclaim happened and the limit was not exceeded.

--Vaidy



2007-01-16 02:34:55

by Roy Huang

[permalink] [raw]
Subject: Re: [PATCH] Provide an interface to limit total page cache.

Hi Balbir,

Thanks for your comment.

On 1/15/07, Balbir Singh <[email protected]> wrote:

> wakeup_kswapd and shrink_all_memory use swappiness to determine what to reclaim
> (mapped pages or page cache). This patch does not ensure that only
> page cache is
> reclaimed/limited. If the swappiness value is high, mapped pages will be hit.
>
You are right, it is possible to release mapped pages. This can be
avoided by adding a field to "struct scan_control" that determines whether
mapped pages will be released.

> One could get similar functionality by implementing resource management.
>
> Resource management splits tasks into groups and does management of
> resources for the
> groups rather than the whole system. Such a facility will come with a
> resource controller for
> memory (split into finer grain rss/page cache/mlock'ed memory, etc),
> one for cpu, etc.
Is there any more detailed information about the resource controller?
Even if there is a resource controller for tasks, it is still
possible for all memory to be eaten up by the page cache.
>
> Balbir
>

2007-01-16 02:40:55

by Roy Huang

[permalink] [raw]
Subject: Re: [PATCH] Provide an interface to limit total page cache.

The possible cause is either a bug in the kswapd thread, or that
shrink_all_memory cannot be called from the kswapd thread.

On 1/15/07, Vaidyanathan Srinivasan <[email protected]> wrote:
>
> Roy Huang wrote:
> > A patch provide a interface to limit total page cache in
> > /proc/sys/vm/pagecache_ratio. The default value is 90 percent. Any
> > feedback is appreciated.
>
> [snip]
>
> I tried to run your patch on PPC64 SMP machine, unfortunately kswapd
> crashes the kernel when the pagecache limit is exceeded!
>
> ->dd if=/dev/zero of=/tmp/foo bs=1M count=1200
> cpu 0x0: Vector: 300 (Data Access) at [c0000000012d7ad0]
> pc: c0000000000976ac: .kswapd+0x3a4/0x4f0
> lr: c0000000000976ac: .kswapd+0x3a4/0x4f0
> sp: c0000000012d7d50
> msr: 8000000000009032
> dar: 0
> dsisr: 42000000
> current = 0xc00000000fed7040
> paca = 0xc00000000063fb80
> pid = 134, comm = kswapd0
> ------------[ cut here ]------------
> enter ? for help
> [c0000000012d7ee0] c000000000069150 .kthread+0x124/0x174
> [c0000000012d7f90] c0000000000247b4 .kernel_thread+0x4c/0x68
> 0:mon>
>
> Steps to recreate fail:
>
> # sync
> # echo 1 > /proc/sys/vm/drop_caches
> MemTotal: 1014584 kB
> MemFree: 905536 kB
> Buffers: 3232 kB
> Cached: 57628 kB
> SwapCached: 0 kB
> Active: 47664 kB
> Inactive: 33160 kB
> SwapTotal: 1526164 kB
> SwapFree: 1526164 kB
> Dirty: 108 kB
> Writeback: 0 kB
> AnonPages: 19976 kB
> Mapped: 15084 kB
> Slab: 19724 kB
> SReclaimable: 8536 kB
> SUnreclaim: 11188 kB
> PageTables: 972 kB
> NFS_Unstable: 0 kB
> Bounce: 0 kB
> CommitLimit: 2033456 kB
> Committed_AS: 87884 kB
> VmallocTotal: 8589934592 kB
> VmallocUsed: 2440 kB
> VmallocChunk: 8589932152 kB
> HugePages_Total: 0
> HugePages_Free: 0
> HugePages_Rsvd: 0
> Hugepagesize: 16384 kB
>
> # echo 50 > /proc/sys/vm/pagecache_ratio
> # dd if=/dev/zero of=/tmp/foo bs=1M count=1200
>
> Basically fill pagecache with overlimit dirty file pages and check
> if the reclaim happened and the limit was not exceeded.
>
> --Vaidy
>
>
>
>

2007-01-16 09:57:45

by Balbir Singh

[permalink] [raw]
Subject: Re: [PATCH] Provide an interface to limit total page cache.

Roy Huang wrote:
> Hi Balbir,
>
> Thanks for your comment.
>
> On 1/15/07, Balbir Singh <[email protected]> wrote:
>
>> wakeup_kswapd and shrink_all_memory use swappiness to determine what to reclaim
>> (mapped pages or page cache). This patch does not ensure that only
>> page cache is
>> reclaimed/limited. If the swappiness value is high, mapped pages will be hit.
>>
> You are right, it is possible to release mapped pages. It can be
> avoided by add a field in "struct scan_control" to determine whether
> mapped pages will be released.
>

Yes that could be done. I have been trying to figure out if there is a good
reason why the LRU is common for both mapped and pagecache. Does it make
sense to split them up? I am still digging through lkml archives to see
if I can find something.

>> One could get similar functionality by implementing resource management.
>>
>> Resource management splits tasks into groups and does management of
>> resources for the
>> groups rather than the whole system. Such a facility will come with a
>> resource controller for
>> memory (split into finer grain rss/page cache/mlock'ed memory, etc),
>> one for cpu, etc.
> I s there any more information in detail about resource controller?
> Even there is a resource controller for tasks, all memory is also
> possbile to be eaten up by page cache.


Yes, please see the discussions on lkml on resource management, ckrm,
beancounters and containers.

http://lwn.net/Articles/206697/ RFC for memory controller, might be a good
starting point

--

Balbir Singh,
Linux Technology Center,
IBM Software Labs

2007-01-17 14:56:03

by Vaidyanathan Srinivasan

[permalink] [raw]
Subject: Re: [PATCH] Provide an interface to limit total page cache.


Hi Roy,

I have added a different pagecache reclaim logic around your
sysctl interface. This would ensure that only pagecache pages are
reclaimed if the limit is exceeded.

--Vaidy

Pagecache pages in memory can be limited to a percentage of total
RAM using this patch.

New sysctl entry /proc/sys/vm/pagecache_ratio has been added that
holds the total percentage of RAM that the user wants as pagecache.
The default percentage is 90.

Depending on the work load, any percentage value can be set to derive
optimum overall performance. Minimum is 5 and max is 100.

balance_pagecache() routine is called on file backed access and the
current pagecache_limit is checked against utilisation.

If the limit is exceeded, shrink_all_pagecache_memory() is called,
which walks the LRU list and removes unmapped pagecache pages. A new
scan_control field (reclaim_pagecache_only) has been added to steer the
decisions in shrink_page_list() and shrink_active_list().

Pages counted under pagecache limit are file pages that are not mapped.
Shared memory is mapped and not counted in the limit.

Test:

echo 40 > /proc/sys/vm/pagecache_ratio
(that is around 400MB on a 1GB RAM machine)

dd if=/dev/zero of=/tmp/foo bs=1M count=1024

cat /proc/meminfo
The "Cached: xxx" count should hit the set limit and not consume all
available memory.

Any feedback is appreciated.

Signed-off-by: Roy Huang <[email protected]>
Signed-off-by: Vaidyanathan Srinivasan <[email protected]>
---
include/linux/pagemap.h | 6 +++
include/linux/sysctl.h | 1
kernel/sysctl.c | 9 +++++
mm/filemap.c | 65 +++++++++++++++++++++++++++++++++++++++
mm/vmscan.c | 79 +++++++++++++++++++++++++++++++++++++++++++++---
5 files changed, 156 insertions(+), 4 deletions(-)

--- linux-2.6.20-rc5.orig/include/linux/pagemap.h
+++ linux-2.6.20-rc5/include/linux/pagemap.h
@@ -12,6 +12,12 @@
#include <asm/uaccess.h>
#include <linux/gfp.h>

+extern int pagecache_ratio;
+extern unsigned int pagecache_limit;
+
+extern int pagecache_ratio_sysctl_handler(struct ctl_table *, int,
+ struct file *, void __user *, size_t *, loff_t *);
+
/*
* Bits in mapping->flags. The lower __GFP_BITS_SHIFT bits are the page
* allocation mode flags.
--- linux-2.6.20-rc5.orig/include/linux/sysctl.h
+++ linux-2.6.20-rc5/include/linux/sysctl.h
@@ -202,6 +202,7 @@ enum
VM_PANIC_ON_OOM=33, /* panic at out-of-memory */
VM_VDSO_ENABLED=34, /* map VDSO into new processes? */
VM_MIN_SLAB=35, /* Percent pages ignored by zone reclaim */
+ VM_PAGECACHE_RATIO=36, /* Percent memory is used as page cache */
};


--- linux-2.6.20-rc5.orig/kernel/sysctl.c
+++ linux-2.6.20-rc5/kernel/sysctl.c
@@ -1035,6 +1035,15 @@ static ctl_table vm_table[] = {
.extra1 = &zero,
},
#endif
+ {
+ .ctl_name = VM_PAGECACHE_RATIO,
+ .procname = "pagecache_ratio",
+ .data = &pagecache_ratio,
+ .maxlen = sizeof(pagecache_ratio),
+ .mode = 0644,
+ .proc_handler = &pagecache_ratio_sysctl_handler,
+ .strategy = &sysctl_intvec,
+ },
{ .ctl_name = 0 }
};

--- linux-2.6.20-rc5.orig/mm/filemap.c
+++ linux-2.6.20-rc5/mm/filemap.c
@@ -30,6 +30,7 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/cpuset.h>
+#include <linux/sysctl.h>
#include "filemap.h"
#include "internal.h"

@@ -108,6 +109,66 @@ generic_file_direct_IO(int rw, struct ki
*/

/*
+ * Start release pagecache (via kswapd) at the percentage.
+ */
+int pagecache_ratio __read_mostly = 90;
+
+unsigned int pagecache_limit = 0;
+
+#define PAGECACHE_RECLAIM_THRESHOLD 64 /* Call reclaim after exceeding
+ the limit by this threshold */
+
+int setup_pagecache_limit(void)
+{
+ if (pagecache_ratio > 100)
+ pagecache_ratio = 100;
+ if (pagecache_ratio < 5)
+ pagecache_ratio = 5;
+ pagecache_limit = pagecache_ratio * nr_free_pagecache_pages() / 100;
+ return 0;
+}
+
+int pagecache_ratio_sysctl_handler(ctl_table *table, int write,
+ struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+ proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+ setup_pagecache_limit();
+ return 0;
+}
+
+extern unsigned long shrink_all_pagecache_memory(unsigned long nr_pages);
+
+int check_pagecache_overlimit(void)
+{
+ unsigned long current_pagecache;
+ int nr_pages = 0;
+
+ current_pagecache = global_page_state(NR_FILE_PAGES) -
+ global_page_state(NR_FILE_MAPPED);
+ /* NR_FILE_PAGES includes shared memory, swap cache and
+ * buffers. Hence exclude NR_FILE_MAPPED, since we would
+ * not reclaim mapped pages. Unmapped pagecache pages
+ * is what we really want to target */
+ if ( pagecache_limit && current_pagecache > pagecache_limit)
+ nr_pages = current_pagecache - pagecache_limit;
+
+ return nr_pages;
+}
+
+static inline int balance_pagecache(void)
+{
+ unsigned long nr_pages;
+ unsigned long ret;
+ nr_pages = check_pagecache_overlimit();
+ /* Don't call reclaim for each page */
+ if (nr_pages > PAGECACHE_RECLAIM_THRESHOLD)
+ ret = shrink_all_pagecache_memory(nr_pages);
+ return 0;
+}
+
+__initcall(setup_pagecache_limit);
+
+/*
* Remove a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
* is safe. The caller must hold a write_lock on the mapping's tree_lock.
@@ -1085,6 +1146,8 @@ out:
page_cache_release(cached_page);
if (filp)
file_accessed(filp);
+
+ balance_pagecache();
}
EXPORT_SYMBOL(do_generic_mapping_read);

@@ -2212,6 +2275,8 @@ zero_length_segment:
status = filemap_write_and_wait(mapping);

pagevec_lru_add(&lru_pvec);
+ balance_pagecache();
+
return written ? written : status;
}
EXPORT_SYMBOL(generic_file_buffered_write);
--- linux-2.6.20-rc5.orig/mm/vmscan.c
+++ linux-2.6.20-rc5/mm/vmscan.c
@@ -45,6 +45,9 @@

#include "internal.h"

+
+extern int check_pagecache_overlimit(void);
+
struct scan_control {
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
@@ -66,6 +69,10 @@ struct scan_control {
int swappiness;

int all_unreclaimable;
+
+ int reclaim_pagecache_only; /* Set when called from
+ pagecache controller */
+
};

/*
@@ -470,7 +477,15 @@ static unsigned long shrink_page_list(st
goto keep;

VM_BUG_ON(PageActive(page));
-
+ /* Take it easy if we are doing only pagecache pages */
+ if (sc->reclaim_pagecache_only) {
+ /* Check if this is a pagecache page they are not mapped */
+ if (page_mapped(page))
+ goto keep_locked;
+ /* Check if pagecache limit is exceeded */
+ if (!check_pagecache_overlimit())
+ goto keep_locked;
+ }
sc->nr_scanned++;

if (!sc->may_swap && page_mapped(page))
@@ -518,7 +533,8 @@ static unsigned long shrink_page_list(st
}

if (PageDirty(page)) {
- if (referenced)
+ /* Reclaim even referenced pagecache pages if over limit */
+ if (!check_pagecache_overlimit() && referenced)
goto keep_locked;
if (!may_enter_fs)
goto keep_locked;
@@ -832,6 +848,14 @@ force_reclaim_mapped:
cond_resched();
page = lru_to_page(&l_hold);
list_del(&page->lru);
+ /* While reclaiming pagecache make it easy */
+ if (sc->reclaim_pagecache_only) {
+ if (page_mapped(page) || !check_pagecache_overlimit()) {
+ list_add(&page->lru, &l_active);
+ continue;
+ }
+ }
+
if (page_mapped(page)) {
if (!reclaim_mapped ||
(total_swap_pages == 0 && PageAnon(page)) ||
@@ -1027,6 +1051,7 @@ unsigned long try_to_free_pages(struct z
.swap_cluster_max = SWAP_CLUSTER_MAX,
.may_swap = 1,
.swappiness = vm_swappiness,
+ .reclaim_pagecache_only = 0,
};

count_vm_event(ALLOCSTALL);
@@ -1131,6 +1156,7 @@ static unsigned long balance_pgdat(pg_da
.may_swap = 1,
.swap_cluster_max = SWAP_CLUSTER_MAX,
.swappiness = vm_swappiness,
+ .reclaim_pagecache_only = 0,
};
/*
* temp_priority is used to remember the scanning priority at which
@@ -1361,7 +1387,6 @@ void wakeup_kswapd(struct zone *zone, in
wake_up_interruptible(&pgdat->kswapd_wait);
}

-#ifdef CONFIG_PM
/*
* Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
* from LRU lists system-wide, for given pass and priority, and returns the
@@ -1436,6 +1461,7 @@ unsigned long shrink_all_memory(unsigned
.swap_cluster_max = nr_pages,
.may_writepage = 1,
.swappiness = vm_swappiness,
+ .reclaim_pagecache_only = 0,
};

current->reclaim_state = &reclaim_state;
@@ -1510,7 +1536,52 @@ out:

return ret;
}
-#endif
+
+unsigned long shrink_all_pagecache_memory(unsigned long nr_pages)
+{
+ unsigned long ret = 0;
+ int pass;
+ struct reclaim_state reclaim_state;
+ struct scan_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .may_swap = 0,
+ .swap_cluster_max = nr_pages,
+ .may_writepage = 1,
+ .swappiness = 0, /* Do not swap, only pagecache reclaim */
+ .reclaim_pagecache_only = 1, /* Flag it */
+ };
+
+ current->reclaim_state = &reclaim_state;
+
+ /*
+ * We try to shrink LRUs in 5 passes:
+ * 0 = Reclaim from inactive_list only
+ * 1 = Reclaim from active list but don't reclaim mapped
+ * 2 = 2nd pass of type 1
+ * 3 = Reclaim mapped (normal reclaim)
+ * 4 = 2nd pass of type 3
+ */
+ for (pass = 0; pass < 5; pass++) {
+ int prio;
+
+ for (prio = DEF_PRIORITY; prio >= 0; prio--) {
+ unsigned long nr_to_scan = nr_pages - ret;
+ sc.nr_scanned = 0;
+ ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
+ if (ret >= nr_pages)
+ goto out;
+
+ if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
+ congestion_wait(WRITE, HZ / 10);
+ }
+ }
+
+
+out:
+ current->reclaim_state = NULL;
+
+ return ret;
+}

/* It's optimal to keep kswapds on the same CPUs as their memory, but
not required for correctness. So if the last cpu in a node goes

2007-01-18 07:56:24

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] Provide an interface to limit total page cache.

"Roy Huang" <[email protected]> writes:

> A patch provide a interface to limit total page cache in
> /proc/sys/vm/pagecache_ratio. The default value is 90 percent. Any
> feedback is appreciated.

Anything except a default value of 100% will change the behavior
and probably reduce the performance on most systems.

> -Roy
>
> diff -urp a/include/linux/sysctl.h b/include/linux/sysctl.h
> --- a/include/linux/sysctl.h 2007-01-15 17:18:46.000000000 +0800
> +++ b/include/linux/sysctl.h 2007-01-15 17:03:09.000000000 +0800
> @@ -202,6 +202,7 @@ enum
> VM_PANIC_ON_OOM=33, /* panic at out-of-memory */
> VM_VDSO_ENABLED=34, /* map VDSO into new processes? */
> VM_MIN_SLAB=35, /* Percent pages ignored by zone reclaim */
> + VM_PAGECACHE_RATIO=36, /* Percent memory is used as page cache */
> };
>
>
> diff -urp a/kernel/sysctl.c b/kernel/sysctl.c
> --- a/kernel/sysctl.c 2007-01-15 17:18:46.000000000 +0800
> +++ b/kernel/sysctl.c 2007-01-15 17:03:09.000000000 +0800
> @@ -1035,6 +1035,15 @@ static ctl_table vm_table[] = {
> .extra1 = &zero,
> },
> #endif
> + {
> + .ctl_name = VM_PAGECACHE_RATIO,
> + .procname = "pagecache_ratio",
> + .data = &pagecache_ratio,
> + .maxlen = sizeof(pagecache_ratio),
> + .mode = 0644,
> + .proc_handler = &pagecache_ratio_sysctl_handler,
> + .strategy = &sysctl_intvec,
> + },
> { .ctl_name = 0 }
> };

This is broken.

You have allocated a binary number for use with sys_sysctl but
did not test it.

If you need a special proc_handler to take action when the
value is changed you need a special strategy routine.

So since you aren't going to test the binary interface and don't
care about it please don't allocate a number for it and just
use CTL_UNNUMBERED.

And of course please read the top of linux/sysctl.h

Thank you.

Eric

2007-01-18 14:00:21

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH] Provide an interface to limit total page cache.

Hi!

> A patch provide a interface to limit total page cache in
> /proc/sys/vm/pagecache_ratio. The default value is 90
> percent. Any
> feedback is appreciated.

Are you sure a percentage is the right thing to use? 1% of a 200GB machine is
2GB... the granularity seems too coarse here. KB? Parts per million?

Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html