diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.0-test9-mm4/fs/buffer.c linux-2.6.0-test9-setup_perzone_counters/fs/buffer.c
--- linux-2.6.0-test9-mm4/fs/buffer.c Wed Nov 19 15:22:46 2003
+++ linux-2.6.0-test9-setup_perzone_counters/fs/buffer.c Wed Nov 19 15:26:22 2003
@@ -888,7 +888,7 @@ int __set_page_dirty_buffers(struct page
spin_lock(&mapping->page_lock);
if (page->mapping) { /* Race with truncate? */
if (!mapping->backing_dev_info->memory_backed)
- inc_page_state(nr_dirty);
+ inc_perzone_page_state(nr_dirty, page);
list_del(&page->list);
list_add(&page->list, &mapping->dirty_pages);
}
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.0-test9-mm4/fs/nfs/write.c linux-2.6.0-test9-setup_perzone_counters/fs/nfs/write.c
--- linux-2.6.0-test9-mm4/fs/nfs/write.c Sat Oct 25 11:44:12 2003
+++ linux-2.6.0-test9-setup_perzone_counters/fs/nfs/write.c Thu Nov 20 13:40:18 2003
@@ -368,6 +368,7 @@ nfs_mark_request_dirty(struct nfs_page *
nfs_list_add_request(req, &nfsi->dirty);
nfsi->ndirty++;
spin_unlock(&nfs_wreq_lock);
+ /* FIXME - NFS perzone page accounting broken! */
inc_page_state(nr_dirty);
mark_inode_dirty(inode);
}
@@ -396,6 +397,7 @@ nfs_mark_request_commit(struct nfs_page
nfs_list_add_request(req, &nfsi->commit);
nfsi->ncommit++;
spin_unlock(&nfs_wreq_lock);
+ /* FIXME - NFS perzone page accounting broken! */
inc_page_state(nr_unstable);
mark_inode_dirty(inode);
}
@@ -464,7 +466,8 @@ nfs_scan_dirty(struct inode *inode, stru
int res;
res = nfs_scan_list(&nfsi->dirty, dst, file, idx_start, npages);
nfsi->ndirty -= res;
- sub_page_state(nr_dirty,res);
+ /* FIXME - NFS perzone page accounting broken! */
+ mod_page_state(nr_dirty, 0UL - res);
if ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty))
printk(KERN_ERR "NFS: desynchronized value of nfs_i.ndirty.\n");
return res;
@@ -1006,7 +1009,6 @@ nfs_commit_done(struct rpc_task *task)
{
struct nfs_write_data *data = (struct nfs_write_data *)task->tk_calldata;
struct nfs_page *req;
- int res = 0;
dprintk("NFS: %4d nfs_commit_done (status %d)\n",
task->tk_pid, task->tk_status);
@@ -1040,10 +1042,10 @@ nfs_commit_done(struct rpc_task *task)
dprintk(" mismatch\n");
nfs_mark_request_dirty(req);
next:
+ /* FIXME - NFS perzone page accounting broken! */
+ dec_page_state(nr_unstable);
nfs_unlock_request(req);
- res++;
}
- sub_page_state(nr_unstable,res);
}
#endif
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.0-test9-mm4/include/asm-arm26/rmap.h linux-2.6.0-test9-setup_perzone_counters/include/asm-arm26/rmap.h
--- linux-2.6.0-test9-mm4/include/asm-arm26/rmap.h Sat Oct 25 11:43:29 2003
+++ linux-2.6.0-test9-setup_perzone_counters/include/asm-arm26/rmap.h Wed Nov 19 15:26:22 2003
@@ -16,14 +16,14 @@ static inline void pgtable_add_rmap(stru
{
page->mapping = (void *)mm;
page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1);
- inc_page_state(nr_page_table_pages);
+ inc_perzone_page_state(nr_page_table_pages, page);
}
static inline void pgtable_remove_rmap(struct page *page)
{
page->mapping = NULL;
page->index = 0;
- dec_page_state(nr_page_table_pages);
+ dec_perzone_page_state(nr_page_table_pages, page);
}
static inline struct mm_struct * ptep_to_mm(pte_t * ptep)
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.0-test9-mm4/include/asm-generic/rmap.h linux-2.6.0-test9-setup_perzone_counters/include/asm-generic/rmap.h
--- linux-2.6.0-test9-mm4/include/asm-generic/rmap.h Sat Oct 25 11:43:56 2003
+++ linux-2.6.0-test9-setup_perzone_counters/include/asm-generic/rmap.h Wed Nov 19 15:26:22 2003
@@ -37,14 +37,14 @@ static inline void pgtable_add_rmap(stru
#endif
page->mapping = (void *)mm;
page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1);
- inc_page_state(nr_page_table_pages);
+ inc_perzone_page_state(nr_page_table_pages, page);
}
static inline void pgtable_remove_rmap(struct page * page)
{
page->mapping = NULL;
page->index = 0;
- dec_page_state(nr_page_table_pages);
+ dec_perzone_page_state(nr_page_table_pages, page);
}
static inline struct mm_struct * ptep_to_mm(pte_t * ptep)
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.0-test9-mm4/include/linux/mmzone.h linux-2.6.0-test9-setup_perzone_counters/include/linux/mmzone.h
--- linux-2.6.0-test9-mm4/include/linux/mmzone.h Wed Nov 19 15:22:48 2003
+++ linux-2.6.0-test9-setup_perzone_counters/include/linux/mmzone.h Fri Nov 21 14:21:34 2003
@@ -55,6 +55,21 @@ struct per_cpu_pageset {
} ____cacheline_aligned_in_smp;
/*
+ * Per-zone page accounting. One instance per-zone per-CPU. This structure
+ * should mirror the fields of struct page_state (in linux/page-flags.h)
+ * that are used by balance_dirty_pages_ratelimited (currently all the 'nr_'
+ * fields). Only unsigned longs are allowed.
+ */
+struct perzone_page_state {
+ unsigned long nr_dirty;
+ unsigned long nr_writeback;
+ unsigned long nr_unstable;
+ unsigned long nr_page_table_pages;
+ unsigned long nr_mapped;
+ unsigned long nr_slab;
+} ____cacheline_aligned_in_smp;
+
+/*
* On machines where it is needed (eg PCs) we divide physical memory
* into multiple physical zones. On a PC we have 3 zones:
*
@@ -140,6 +155,8 @@ struct zone {
struct per_cpu_pageset pageset[NR_CPUS];
+ struct perzone_page_state page_state[NR_CPUS];
+
/*
* Discontig memory support fields.
*/
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.0-test9-mm4/include/linux/page-flags.h linux-2.6.0-test9-setup_perzone_counters/include/linux/page-flags.h
--- linux-2.6.0-test9-mm4/include/linux/page-flags.h Sat Oct 25 11:44:06 2003
+++ linux-2.6.0-test9-setup_perzone_counters/include/linux/page-flags.h Fri Nov 21 15:12:42 2003
@@ -88,7 +88,6 @@ struct page_state {
unsigned long nr_page_table_pages;/* Pages used for pagetables */
unsigned long nr_mapped; /* mapped into pagetables */
unsigned long nr_slab; /* In slab */
-#define GET_PAGE_STATE_LAST nr_slab
/*
* The below are zeroed by get_page_state(). Use get_full_page_state()
@@ -117,12 +116,16 @@ struct page_state {
unsigned long allocstall; /* direct reclaim calls */
unsigned long pgrotated; /* pages rotated to tail of the LRU */
} ____cacheline_aligned;
-
DECLARE_PER_CPU(struct page_state, page_states);
+extern void get_page_state_zone(struct page_state *ret, struct zone *zone);
extern void get_page_state(struct page_state *ret);
extern void get_full_page_state(struct page_state *ret);
+/*
+ * Use these macros to modify the page statistics that don't start with 'nr_'
+ * which are maintained solely on a per-cpu basis.
+ */
#define mod_page_state(member, delta) \
do { \
unsigned long flags; \
@@ -130,10 +133,22 @@ extern void get_full_page_state(struct p
__get_cpu_var(page_states).member += (delta); \
local_irq_restore(flags); \
} while (0)
-
#define inc_page_state(member) mod_page_state(member, 1UL)
#define dec_page_state(member) mod_page_state(member, 0UL - 1)
-#define sub_page_state(member,delta) mod_page_state(member, 0UL - (delta))
+
+/*
+ * Use these macros to modify the 'nr_' page statistics which are maintained
+ * on a per-zone per-cpu basis.
+ */
+#define mod_perzone_page_state(member, page, delta) \
+ do { \
+ unsigned long flags; \
+ local_irq_save(flags); \
+ page_zone(page)->page_state[smp_processor_id()].member += (delta); \
+ local_irq_restore(flags); \
+ } while (0)
+#define inc_perzone_page_state(member, page) mod_perzone_page_state(member, page, 1UL)
+#define dec_perzone_page_state(member, page) mod_perzone_page_state(member, page, 0UL - 1)
/*
@@ -217,7 +232,7 @@ extern void get_full_page_state(struct p
do { \
if (!test_and_set_bit(PG_writeback, \
&(page)->flags)) \
- inc_page_state(nr_writeback); \
+ inc_perzone_page_state(nr_writeback, page); \
} while (0)
#define TestSetPageWriteback(page) \
({ \
@@ -225,14 +240,14 @@ extern void get_full_page_state(struct p
ret = test_and_set_bit(PG_writeback, \
&(page)->flags); \
if (!ret) \
- inc_page_state(nr_writeback); \
+ inc_perzone_page_state(nr_writeback, page); \
ret; \
})
#define ClearPageWriteback(page) \
do { \
if (test_and_clear_bit(PG_writeback, \
&(page)->flags)) \
- dec_page_state(nr_writeback); \
+ dec_perzone_page_state(nr_writeback, page); \
} while (0)
#define TestClearPageWriteback(page) \
({ \
@@ -240,7 +255,7 @@ extern void get_full_page_state(struct p
ret = test_and_clear_bit(PG_writeback, \
&(page)->flags); \
if (ret) \
- dec_page_state(nr_writeback); \
+ dec_perzone_page_state(nr_writeback, page); \
ret; \
})
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.0-test9-mm4/mm/page-writeback.c linux-2.6.0-test9-setup_perzone_counters/mm/page-writeback.c
--- linux-2.6.0-test9-mm4/mm/page-writeback.c Wed Nov 19 15:22:49 2003
+++ linux-2.6.0-test9-setup_perzone_counters/mm/page-writeback.c Wed Nov 19 15:26:22 2003
@@ -524,7 +524,7 @@ int __set_page_dirty_nobuffers(struct pa
if (page->mapping) { /* Race with truncate? */
BUG_ON(page->mapping != mapping);
if (!mapping->backing_dev_info->memory_backed)
- inc_page_state(nr_dirty);
+ inc_perzone_page_state(nr_dirty, page);
list_del(&page->list);
list_add(&page->list, &mapping->dirty_pages);
}
@@ -569,7 +569,7 @@ int test_clear_page_dirty(struct page *p
struct address_space *mapping = page->mapping;
if (mapping && !mapping->backing_dev_info->memory_backed)
- dec_page_state(nr_dirty);
+ dec_perzone_page_state(nr_dirty, page);
return 1;
}
return 0;
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.0-test9-mm4/mm/page_alloc.c linux-2.6.0-test9-setup_perzone_counters/mm/page_alloc.c
--- linux-2.6.0-test9-mm4/mm/page_alloc.c Wed Nov 19 15:22:49 2003
+++ linux-2.6.0-test9-setup_perzone_counters/mm/page_alloc.c Fri Nov 21 14:27:51 2003
@@ -859,11 +859,47 @@ EXPORT_SYMBOL(nr_pagecache);
DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
#endif
-void __get_page_state(struct page_state *ret, int nr)
+/*
+ * Get the zone-specific page stats for @zone, summed across all cpus.
+ * Make sure you zero @ret before passing it in!
+ */
+void get_page_state_zone(struct page_state *ret, struct zone *zone)
{
int cpu = 0;
+ for (; cpu < NR_CPUS; cpu++) {
+ ret->nr_dirty += zone->page_state[cpu].nr_dirty;
+ ret->nr_writeback += zone->page_state[cpu].nr_writeback;
+ ret->nr_unstable += zone->page_state[cpu].nr_unstable;
+ ret->nr_page_table_pages+= zone->page_state[cpu].nr_page_table_pages;
+ ret->nr_mapped += zone->page_state[cpu].nr_mapped;
+ ret->nr_slab += zone->page_state[cpu].nr_slab;
+ }
+}
+
+/*
+ * Get the zone-specific page stats, summed across all zones/cpus.
+ */
+void get_page_state(struct page_state *ret)
+{
+ struct zone *zone;
+
memset(ret, 0, sizeof(*ret));
+ for_each_zone(zone) {
+ get_page_state_zone(ret, zone);
+ }
+}
+
+/*
+ * Get system-wide page stats, summed across all cpus.
+ */
+void get_full_page_state(struct page_state *ret)
+{
+ int cpu = 0;
+
+ /* Get the per-zone stats */
+ get_page_state(ret);
+
while (cpu < NR_CPUS) {
unsigned long *in, *out, off;
@@ -877,26 +913,11 @@ void __get_page_state(struct page_state
if (cpu < NR_CPUS && cpu_online(cpu))
prefetch(&per_cpu(page_states, cpu));
out = (unsigned long *)ret;
- for (off = 0; off < nr; off++)
+ for (off = 0; off < sizeof(*ret)/sizeof(unsigned long); off++)
*out++ += *in++;
}
}
-void get_page_state(struct page_state *ret)
-{
- int nr;
-
- nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
- nr /= sizeof(unsigned long);
-
- __get_page_state(ret, nr + 1);
-}
-
-void get_full_page_state(struct page_state *ret)
-{
- __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
-}
-
void get_zone_counts(unsigned long *active,
unsigned long *inactive, unsigned long *free)
{
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.0-test9-mm4/mm/rmap.c linux-2.6.0-test9-setup_perzone_counters/mm/rmap.c
--- linux-2.6.0-test9-mm4/mm/rmap.c Sat Oct 25 11:44:44 2003
+++ linux-2.6.0-test9-setup_perzone_counters/mm/rmap.c Wed Nov 19 15:26:22 2003
@@ -178,7 +178,7 @@ page_add_rmap(struct page *page, pte_t *
if (page->pte.direct == 0) {
page->pte.direct = pte_paddr;
SetPageDirect(page);
- inc_page_state(nr_mapped);
+ inc_perzone_page_state(nr_mapped, page);
goto out;
}
@@ -272,7 +272,7 @@ void page_remove_rmap(struct page *page,
}
out:
if (!page_mapped(page))
- dec_page_state(nr_mapped);
+ dec_perzone_page_state(nr_mapped, page);
out_unlock:
pte_chain_unlock(page);
return;
@@ -453,7 +453,7 @@ int try_to_unmap(struct page * page)
}
out:
if (!page_mapped(page))
- dec_page_state(nr_mapped);
+ dec_perzone_page_state(nr_mapped, page);
return ret;
}
diff -Nurp --exclude-from=/home/mcd/.dontdiff linux-2.6.0-test9-mm4/mm/slab.c linux-2.6.0-test9-setup_perzone_counters/mm/slab.c
--- linux-2.6.0-test9-mm4/mm/slab.c Wed Nov 19 15:22:49 2003
+++ linux-2.6.0-test9-setup_perzone_counters/mm/slab.c Wed Nov 19 15:26:22 2003
@@ -832,9 +832,9 @@ static inline void kmem_freepages(kmem_c
while (i--) {
if (!TestClearPageSlab(page))
BUG();
+ dec_perzone_page_state(nr_slab, page);
page++;
}
- sub_page_state(nr_slab, nr_freed);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += nr_freed;
free_pages((unsigned long)addr, cachep->gfporder);
@@ -1620,7 +1620,7 @@ static int cache_grow (kmem_cache_t * ca
do {
SET_PAGE_CACHE(page, cachep);
SET_PAGE_SLAB(page, slabp);
- inc_page_state(nr_slab);
+ inc_perzone_page_state(nr_slab, page);
page++;
} while (--i);
Matthew Dobson <[email protected]> wrote:
>
> Currently the VM decides to start doing background writeback of pages if
> 10% of the systems pages are dirty, and starts doing synchronous
> writeback of pages if 40% are dirty. This is great for smaller memory
> systems, but in larger memory systems (>2GB or so), a process can dirty
> ALL of lowmem (ZONE_NORMAL, 896MB) without hitting the 40% dirty page
> ratio needed to force the process to do writeback.
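For scale, the arithmetic behind the ">2GB or so" figure, as a stand-alone
illustration (the 896MB ia32 ZONE_NORMAL and the 40% default are the only
inputs):

#include <stdio.h>

int main(void)
{
	unsigned long lowmem_mb = 896;	/* ia32 ZONE_NORMAL */
	unsigned long dirty_ratio = 40;	/* synchronous writeback threshold, % */

	/* total memory at which 40% of it equals all of lowmem */
	printf("crossover: %lu MB total\n", lowmem_mb * 100 / dirty_ratio);
	/* prints 2240 -- above ~2.2GB, all of lowmem can be dirty without
	 * the 40% threshold ever firing */
	return 0;
}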
Yes, it has been that way for a year or so. I was wondering if anyone
would hit any problems in practice. Have you hit any problem in practice?
I agree that the per-zonification of this part of the VM/VFS makes some
sense, although not _complete_ sense, because as you've seen, we need to
perform writeout against all zones' pages if _any_ zone exceeds dirty
limits. This could do nasty things on a 1G highmem machine, due to the
tiny highmem zone. So maybe that zone should not trigger writeback.
However the simplest fix is of course to decrease the default value of the
dirty thresholds - put them back to the 2.4 levels. It all depends upon
the nature of the problems which you have been observing?
>> Currently the VM decides to start doing background writeback of pages if
>> 10% of the systems pages are dirty, and starts doing synchronous
>> writeback of pages if 40% are dirty. This is great for smaller memory
>> systems, but in larger memory systems (>2GB or so), a process can dirty
>> ALL of lowmem (ZONE_NORMAL, 896MB) without hitting the 40% dirty page
>> ratio needed to force the process to do writeback.
>
> Yes, it has been that way for a year or so. I was wondering if anyone
> would hit any problems in practice. Have you hit any problem in practice?
>
> I agree that the per-zonification of this part of the VM/VFS makes some
> sense, although not _complete_ sense, because as you've seen, we need to
> perform writeout against all zones' pages if _any_ zone exceeds dirty
> limits. This could do nasty things on a 1G highmem machine, due to the
> tiny highmem zone. So maybe that zone should not trigger writeback.
>
> However the simplest fix is of course to decrease the default value of the
> dirty thresholds - put them back to the 2.4 levels. It all depends upon
> the nature of the problems which you have been observing?
I'm not sure that'll fix the problem for NUMA boxes, which is where we
started. When any node fills up completely with dirty pages (which would
only require one process doing a streaming write, e.g. an ftp download),
it seems we'll get into trouble. If we change the thresholds from 40% to
20%, that just means you need a slightly larger system to trigger it,
it never fixes the problem ;-(
M.
"Martin J. Bligh" <[email protected]> wrote:
>
> >> Currently the VM decides to start doing background writeback of pages if
> >> 10% of the systems pages are dirty, and starts doing synchronous
> >> writeback of pages if 40% are dirty. This is great for smaller memory
> >> systems, but in larger memory systems (>2GB or so), a process can dirty
> >> ALL of lowmem (ZONE_NORMAL, 896MB) without hitting the 40% dirty page
> >> ratio needed to force the process to do writeback.
> >
> > Yes, it has been that way for a year or so. I was wondering if anyone
> > would hit any problems in practice. Have you hit any problem in practice?
> >
> > I agree that the per-zonification of this part of the VM/VFS makes some
> > sense, although not _complete_ sense, because as you've seen, we need to
> > perform writeout against all zones' pages if _any_ zone exceeds dirty
> > limits. This could do nasty things on a 1G highmem machine, due to the
> > tiny highmem zone. So maybe that zone should not trigger writeback.
> >
> > However the simplest fix is of course to decrease the default value of the
> > dirty thresholds - put them back to the 2.4 levels. It all depends upon
> > the nature of the problems which you have been observing?
>
> I'm not sure that'll fix the problem for NUMA boxes, which is where we
> started.
What problems?
> When any node fills up completely with dirty pages (which would
> only require one process doing a streaming write, e.g. an ftp download),
> it seems we'll get into trouble.
What trouble?
> If we change the thresholds from 40% to
> 20%, that just means you need a slightly larger system to trigger it,
> it never fixes the problem ;-(
What problem?
If we make the dirty threshold a proportion of the initial amount of free
memory in ZONE_NORMAL, as is done in 2.4, it will not be possible to fill
any node with dirty pages.
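To put rough numbers on the two schemes (illustrative arithmetic only; the 30%
ZONE_NORMAL figure for 2.4 is the one quoted later in the thread):

#include <stdio.h>

int main(void)
{
	unsigned long total_mb = 64UL * 1024;	/* a 64GB ia32 box */
	unsigned long lowmem_mb = 896;		/* ZONE_NORMAL */

	/* 2.4-style: a proportion (30%) of ZONE_NORMAL */
	printf("2.4-style limit: %lu MB\n", lowmem_mb * 30 / 100);
	/* 2.5-style: 40% of total memory */
	printf("2.5-style limit: %lu MB\n", total_mb * 40 / 100);
	return 0;
}

That is roughly 268MB versus 25.6GB of allowed dirty memory on the same box,
which is the gap Martin objects to below.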
>> >> Currently the VM decides to start doing background writeback of pages if
>> >> 10% of the systems pages are dirty, and starts doing synchronous
>> >> writeback of pages if 40% are dirty. This is great for smaller memory
>> >> systems, but in larger memory systems (>2GB or so), a process can dirty
>> >> ALL of lowmem (ZONE_NORMAL, 896MB) without hitting the 40% dirty page
>> >> ratio needed to force the process to do writeback.
>> >
>> > Yes, it has been that way for a year or so. I was wondering if anyone
>> > would hit any problems in practice. Have you hit any problem in practice?
>> >
>> > I agree that the per-zonification of this part of the VM/VFS makes some
>> > sense, although not _complete_ sense, because as you've seen, we need to
>> > perform writeout against all zones' pages if _any_ zone exceeds dirty
>> > limits. This could do nasty things on a 1G highmem machine, due to the
>> > tiny highmem zone. So maybe that zone should not trigger writeback.
>> >
>> > However the simplest fix is of course to decrease the default value of the
>> > dirty thresholds - put them back to the 2.4 levels. It all depends upon
>> > the nature of the problems which you have been observing?
>>
>> I'm not sure that'll fix the problem for NUMA boxes, which is where we
>> started.
>
> What problems?
>
>> When any node fills up completely with dirty pages (which would
>> only require one process doing a streaming write, e.g. an ftp download),
>> it seems we'll get into trouble.
>
> What trouble?
Well ... not so sure of this as I once was ... so be gentle with me ;-)
But if the system has been running for a while, memory is full of pagecache,
etc. We try to allocate from the local node, fail, and fall back to the
other nodes, which are all full as well. Then we wake up kswapd, but all
pages in this node are dirty, so we block for ages on writeout, making
memory allocation really latent and slow (which was presumably what
balance_dirty_pages was there to solve in the first place).
> If we make the dirty threshold a proportion of the initial amount of free
> memory in ZONE_NORMAL, as is done in 2.4, it will not be possible to fill
> any node with dirty pages.
True. But that seems a bit extreme for a system with 64GB of RAM, and only
896MB in ZONE_NORMAL ;-) Doesn't really seem like the right way to fix it.
M.
"Martin J. Bligh" <[email protected]> wrote:
>
> > What trouble?
>
> Well ... not so sure of this as I once was ... so be gentle with me ;-)
> But if the system has been running for a while, memory is full of pagecache,
> etc. We try to allocate from the local node, fail, and fall back to the
> other nodes, which are all full as well. Then we wake up kswapd, but all
> pages in this node are dirty, so we block for ages on writeout, making
> memory allocation really latent and slow (which was presumably what
> balance_dirty_pages was there to solve in the first place).
It is possible. You'd be pretty unlucky to dirty so much lowmem when there
is such a huge amount of highmem floating about, but yes, if you tried hard
enough...
I have a feeling that some observed problem must have prompted this coding
frenzy from Matthew. Surely some problem was observed, and this patch
fixed it up??
> > If we make the dirty threshold a proportion of the initial amount of free
> > memory in ZONE_NORMAL, as is done in 2.4, it will not be possible to fill
> > any node with dirty pages.
>
> True. But that seems a bit extreme for a system with 64GB of RAM, and only
> 896MB in ZONE_NORMAL ;-) Doesn't really seem like the right way to fix it.
>
Increasing /proc/sys/vm/lower_zone_protection can be used to teach the VM
to not use lowmem for pagecache. Does this solve the elusive problem too?
>> Well ... not so sure of this as I once was ... so be gentle with me ;-)
>> But if the system has been running for a while, memory is full of pagecache,
>> etc. We try to allocate from the local node, fail, and fall back to the
>> other nodes, which are all full as well. Then we wake up kswapd, but all
>> pages in this node are dirty, so we block for ages on writeout, making
>> memory allocation really latent and slow (which was presumably what
>> balance_dirty_pages was there to solve in the first place).
>
> It is possible. You'd be pretty unlucky to dirty so much lowmem when there
> is such a huge amount of highmem floating about, but yes, if you tried hard
> enough...
I'm not really worried about lowmem vs highmem - that was almost an
afterthought. I'm more worried about the NUMA bit - it's easy to fill
one node's memory completely with dirty pages by just a writer running
on that node.
> I have a feeling that some observed problem must have prompted this coding
> frenzy from Matthew. Surely some problem was observed, and this patch
> fixed it up??
No, just an observation whilst looking at balance_dirty_pages, that it's
not working as intended on NUMA. It's just easy to goad Matt into a frenzy,
I guess ;-) ;-)
"dd if=/dev/zero of=foo" would trigger it, I'd think. Watching the IO
rate, it should go weird after RAM is full (on a 3 or more node system,
so there's < 40% of RAM for each node). Yeah, I know you're going to give
me crap for not actually trying it ... and rightly so ... but it just
seemed so obvious ... ;-)
>> > If we make the dirty threshold a proportion of the initial amount of free
>> > memory in ZONE_NORMAL, as is done in 2.4, it will not be possible to fill
>> > any node with dirty pages.
>>
>> True. But that seems a bit extreme for a system with 64GB of RAM, and only
>> 896MB in ZONE_NORMAL ;-) Doesn't really seem like the right way to fix it.
>>
>
> Increasing /proc/sys/vm/lower_zone_protection can be used to teach the VM
> to not use lowmem for pagecache. Does this solve the elusive problem too?
Don't think so - see comment above re NUMA.
M.
"Martin J. Bligh" <[email protected]> wrote:
>
> "dd if=/dev/zero of=foo" would trigger it, I'd think. Watching the IO
> rate, it should go weird after RAM is full (on a 3 or more node system,
> so there's < 40% of RAM for each node).
We should just prod kswapd into cleansing the relevant zone(s) and go do
allocation from the next one.
> Yeah, I know you're going to give me crap for not actually trying it
How much would you like?
(Wanders off, wondering how to fix a problem which cannot be
demonstrated).
"Martin J. Bligh" <[email protected]> wrote:
>
> "dd if=/dev/zero of=foo" would trigger it, I'd think. Watching the IO
> rate, it should go weird after RAM is full (on a 3 or more node system,
> so there's < 40% of RAM for each node).
Also, note that page_writeback_init() will not allow 40% of memory to be
dirtied on such a system. It is set much lower, partly to avoid an
explosion of unreclaimable buffer_heads.
bk revtool sez:
- Allowing 40% of physical memory to be dirtied on massive ia32 boxes
is unreasonable. It pins too many buffer_heads and contributes to
page reclaim latency.
The patch changes the initial value of
/proc/sys/vm/dirty_background_ratio, dirty_async_ratio and (the
presently non-functional) dirty_sync_ratio so that they are reduced
when the highmem:lowmem ratio exceeds 4:1.
These ratios are scaled so that as the highmem:lowmem ratio goes
beyond 4:1, the maximum amount of allowed dirty memory ceases to
increase. It is clamped at the amount of memory which a 4:1 machine
is allowed to use.
- Aggressive reduction in the dirty memory threshold at which
background writeback cuts in. 2.4 uses 30% of ZONE_NORMAL. 2.5 uses
40% of total memory. This patch changes it to 10% of total memory
(if total memory <= 4G. Even less otherwise - see above).
This means that:
- Much more writeback is performed by pdflush.
- When the application is generating dirty data at a moderate
rate, background writeback cuts in much earlier, so memory is
cleaned more promptly.
- Reduces the risk of user applications getting stalled by writeback.
- Will damage dbench numbers. It turns out that the damage is
fairly small, and dbench isn't a worthwhile workload for
optimisation.
- Moderate reduction in the dirty level at which the write(2) caller
is forced to perform writeback (throttling). Was 40% of total
memory. Is now 30% of total memory (if total memory <= 4G, less
otherwise).
This is to reduce page reclaim latency, and generally because
allowing processes to flood the machine with dirty data is a bad
thing in mixed workloads.
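For reference, the scaling described above amounts to something like the
following sketch (written to match the changelog's description, not the actual
page_writeback_init() code; the names are illustrative):

/*
 * Once highmem:lowmem exceeds 4:1, shrink the dirty ratios so that the
 * absolute amount of dirtyable memory stops growing -- it stays clamped
 * at what a 4:1 machine would be allowed.
 */
static void scale_dirty_ratios(unsigned long total_pages,
			       unsigned long lowmem_pages,
			       int *background_ratio, int *async_ratio)
{
	/* equals 100 at exactly 4:1 (total = 5 * lowmem), smaller beyond it */
	unsigned long correction = (100UL * 5 * lowmem_pages) / total_pages;

	if (correction < 100) {
		*background_ratio = *background_ratio * correction / 100;
		*async_ratio = *async_ratio * correction / 100;
	}
}

Beyond 4:1 the product of the scaled ratio and total memory stays constant,
which is the clamping behaviour the changelog describes.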