Currently the slab allocator uses a page_state counter called nr_slab.
The VM swap prefetch code assumes that this describes the number of pages
used on a node by the slab allocator. However, that is not really true.
Currently nr_slab is the total number of pages allocated, which may
be local or remote pages. Remote allocations may artificially inflate
nr_slab and therefore disable swap prefetching.
This patch splits the counter into nr_local_slab, which reflects
slab pages allocated from the local zones (this number is useful
at least as guidance for the VM), and nr_remote_slab, which counts
the remotely allocated pages.
However, there is currently no counter reflecting the number of pages
used in a zone/node for the slab although the proc statistics give
an impression otherwise.
We cannot update counters from other nodes since these counters are per cpu.
A counter could be put into struct zone; however, that would require
taking a spinlock for each update.
Signed-off-by: Christoph Lameter <[email protected]>
Index: linux-2.6.14-mm1/mm/slab.c
===================================================================
--- linux-2.6.14-mm1.orig/mm/slab.c 2005-11-10 12:59:49.000000000 -0800
+++ linux-2.6.14-mm1/mm/slab.c 2005-11-10 13:00:11.000000000 -0800
@@ -1201,7 +1201,12 @@ static void *kmem_getpages(kmem_cache_t
i = (1 << cachep->gfporder);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
atomic_add(i, &slab_reclaim_pages);
- add_page_state(nr_slab, i);
+
+ if (page_to_nid(page) == numa_node_id())
+ add_page_state(nr_local_slab, i);
+ else
+ add_page_state(nr_remote_slab, i);
+
while (i--) {
SetPageSlab(page);
page++;
@@ -1223,7 +1228,10 @@ static void kmem_freepages(kmem_cache_t
BUG();
page++;
}
- sub_page_state(nr_slab, nr_freed);
+ if (page_to_nid(page) == numa_node_id())
+ sub_page_state(nr_local_slab, nr_freed);
+ else
+ sub_page_state(nr_remote_slab, nr_freed);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += nr_freed;
free_pages((unsigned long)addr, cachep->gfporder);
Index: linux-2.6.14-mm1/drivers/base/node.c
===================================================================
--- linux-2.6.14-mm1.orig/drivers/base/node.c 2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14-mm1/drivers/base/node.c 2005-11-10 13:00:11.000000000 -0800
@@ -55,8 +55,10 @@ static ssize_t node_read_meminfo(struct
ps.nr_writeback = 0;
if ((long)ps.nr_mapped < 0)
ps.nr_mapped = 0;
- if ((long)ps.nr_slab < 0)
- ps.nr_slab = 0;
+ if ((long)ps.nr_local_slab < 0)
+ ps.nr_local_slab = 0;
+ if ((long)ps.nr_remote_slab < 0)
+ ps.nr_remote_slab = 0;
n = sprintf(buf, "\n"
"Node %d MemTotal: %8lu kB\n"
@@ -71,7 +73,8 @@ static ssize_t node_read_meminfo(struct
"Node %d Dirty: %8lu kB\n"
"Node %d Writeback: %8lu kB\n"
"Node %d Mapped: %8lu kB\n"
- "Node %d Slab: %8lu kB\n",
+ "Node %d LocalSlab: %8lu kB\n"
+ "Node %d RemoteSlab: %8lu kB\n",
nid, K(i.totalram),
nid, K(i.freeram),
nid, K(i.totalram - i.freeram),
@@ -84,7 +87,8 @@ static ssize_t node_read_meminfo(struct
nid, K(ps.nr_dirty),
nid, K(ps.nr_writeback),
nid, K(ps.nr_mapped),
- nid, K(ps.nr_slab));
+ nid, K(ps.nr_local_slab),
+ nid, K(ps.nr_remote_slab));
n += hugetlb_report_node_meminfo(nid, buf + n);
return n;
}
Index: linux-2.6.14-mm1/fs/proc/proc_misc.c
===================================================================
--- linux-2.6.14-mm1.orig/fs/proc/proc_misc.c 2005-11-09 10:47:35.000000000 -0800
+++ linux-2.6.14-mm1/fs/proc/proc_misc.c 2005-11-10 13:00:11.000000000 -0800
@@ -168,7 +168,8 @@ static int meminfo_read_proc(char *page,
"Dirty: %8lu kB\n"
"Writeback: %8lu kB\n"
"Mapped: %8lu kB\n"
- "Slab: %8lu kB\n"
+ "LocalSlab: %8lu kB\n"
+ "RemoteSlab: %8lu kB\n"
"CommitLimit: %8lu kB\n"
"Committed_AS: %8lu kB\n"
"PageTables: %8lu kB\n"
@@ -191,7 +192,8 @@ static int meminfo_read_proc(char *page,
K(ps.nr_dirty),
K(ps.nr_writeback),
K(ps.nr_mapped),
- K(ps.nr_slab),
+ K(ps.nr_local_slab),
+ K(ps.nr_remote_slab),
K(allowed),
K(committed),
K(ps.nr_page_table_pages),
Index: linux-2.6.14-mm1/mm/swap_prefetch.c
===================================================================
--- linux-2.6.14-mm1.orig/mm/swap_prefetch.c 2005-11-10 11:33:03.000000000 -0800
+++ linux-2.6.14-mm1/mm/swap_prefetch.c 2005-11-10 13:00:11.000000000 -0800
@@ -327,7 +327,7 @@ static int prefetch_suitable(void)
* >2/3 of the ram is mapped or swapcache, we need some free for
* pagecache
*/
- limit = ps.nr_mapped + ps.nr_slab + pending_writes +
+ limit = ps.nr_mapped + ps.nr_local_slab + pending_writes +
total_swapcache_pages;
if (limit > mapped_limit)
goto out;
Index: linux-2.6.14-mm1/mm/page_alloc.c
===================================================================
--- linux-2.6.14-mm1.orig/mm/page_alloc.c 2005-11-09 10:47:37.000000000 -0800
+++ linux-2.6.14-mm1/mm/page_alloc.c 2005-11-10 13:00:11.000000000 -0800
@@ -1423,14 +1423,15 @@ void show_free_areas(void)
K(nr_free_highpages()));
printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
- "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
+ "unstable:%lu free:%u localslab:%lu remoteslab:%lu mapped:%lu pagetables:%lu\n",
active,
inactive,
ps.nr_dirty,
ps.nr_writeback,
ps.nr_unstable,
nr_free_pages(),
- ps.nr_slab,
+ ps.nr_local_slab,
+ ps.nr_remote_slab,
ps.nr_mapped,
ps.nr_page_table_pages);
@@ -2312,7 +2313,8 @@ static char *vmstat_text[] = {
"nr_unstable",
"nr_page_table_pages",
"nr_mapped",
- "nr_slab",
+ "nr_local_slab",
+ "nr_remote_slab",
"pgpgin",
"pgpgout",
Index: linux-2.6.14-mm1/include/linux/page-flags.h
===================================================================
--- linux-2.6.14-mm1.orig/include/linux/page-flags.h 2005-11-09 10:47:29.000000000 -0800
+++ linux-2.6.14-mm1/include/linux/page-flags.h 2005-11-10 13:00:11.000000000 -0800
@@ -87,8 +87,9 @@ struct page_state {
unsigned long nr_unstable; /* NFS unstable pages */
unsigned long nr_page_table_pages;/* Pages used for pagetables */
unsigned long nr_mapped; /* mapped into pagetables */
- unsigned long nr_slab; /* In slab */
-#define GET_PAGE_STATE_LAST nr_slab
+ unsigned long nr_local_slab; /* local slab pages */
+ unsigned long nr_remote_slab; /* remote slab pages */
+#define GET_PAGE_STATE_LAST nr_remote_slab
/*
* The below are zeroed by get_page_state(). Use get_full_page_state()
Hi Christoph
On Fri, 11 Nov 2005 08:55, Christoph Lameter wrote:
> Currently the slab allocator uses a page_state counter called nr_slab.
> The VM swap prefetch code assumes that this describes the number of pages
> used on a node by the slab allocator. However, that is not really true.
>
> Currently nr_slab is the number of total pages allocated which may
> be local or remote pages. Remote allocations may artificially inflate
> nr_slab and therefore disable swap prefetching.
Thanks for pointing this out.
> This patch splits the counter into the nr_local_slab which reflects
> slab pages allocated from the local zones (and this number is useful
> at least as a guidance for the VM) and the remotely allocated pages.
How large a contribution is the remote slab size likely to be? Would this
information be useful to anyone potentially in future code besides swap
prefetch? The nature of prefetch is that this is only a fairly coarse measure
of how full the vm is with data we don't want to displace. Thus it is also
not important that it is very accurate.
Unless the remote slab size can be a very large contribution, or having local
and remote slab sizes is useful potentially to some other code I'm inclined
to say this is unnecessary. A simple comment would suffice, something like "the
nr_slab estimation is artificially elevated by remote slab pages on numa,
however this contribution is not important to the accuracy of this
algorithm". Of course it is nice to be more accurate and if you think
worthwhile then we can do this - I'll be happy to be guided by your
judgement.
As a side note I doubt any serious size numa hardware will ever be idle enough
by swap prefetch standards to even start prefetching swap pages. If you think
hardware of this sort is likely to benefit from swap prefetch then perhaps we
should look at relaxing the conditions under which prefetching occurs.
Cheers,
Con
On Fri, 11 Nov 2005, Con Kolivas wrote:
> > This patch splits the counter into the nr_local_slab which reflects
> > slab pages allocated from the local zones (and this number is useful
> > at least as a guidance for the VM) and the remotely allocated pages.
>
> How large a contribution is the remote slab size likely to be? Would this
> information be useful to anyone potentially in future code besides swap
> prefetch? The nature of prefetch is that this is only a fairly coarse measure
> of how full the vm is with data we don't want to displace. Thus it is also
> not important that it is very accurate.
The size of the remote cache depends on many factors. The application can
influence that by setting memory policies.
> Unless the remote slab size can be a very large contribution, or having local
Yes, it can be quite large. In some of my tests with applications, remote
slab pages make up 100%. This is typical if the application sets the policy in such a way
that all allocations are off node or if the kernel has to allocate memory
on a certain node for a device.
> and remote slab sizes is useful potentially to some other code I'm inclined
> to say this is unnecessary. A simple comment saying something like "the
> nr_slab estimation is artificially elevated by remote slab pages on numa,
> however this contribution is not important to the accuracy of this
> algorithm". Of course it is nice to be more accurate and if you think
> worthwhile then we can do this - I'll be happy to be guided by your
> judgement.
> As a side note I doubt any serious size numa hardware will ever be idle enough
> by swap prefetch standards to even start prefetching swap pages. If you think
> hardware of this sort is likely to benefit from swap prefetch then perhaps we
> should look at relaxing the conditions under which prefetching occurs.
Small scale NUMA machines may benefit from swap prefetch but on larger
machines people usually try to avoid swap altogether.
On Fri, 11 Nov 2005 10:13, Christoph Lameter wrote:
> On Fri, 11 Nov 2005, Con Kolivas wrote:
> > > This patch splits the counter into the nr_local_slab which reflects
> > > slab pages allocated from the local zones (and this number is useful
> > > at least as a guidance for the VM) and the remotely allocated pages.
> >
> > How large a contribution is the remote slab size likely to be? Would this
> > information be useful to anyone potentially in future code besides swap
> > prefetch? The nature of prefetch is that this is only a fairly coarse
> > measure of how full the vm is with data we don't want to displace. Thus
> > it is also not important that it is very accurate.
>
> The size of the remote cache depends on many factors. The application can
> influence that by setting memory policies.
>
> > Unless the remote slab size can be a very large contribution, or having
> > local
>
> Yes it can be quite large. On some of my tests with applications these are
> 100%. This is typical if the application sets the policy in such a way
> that all allocations are off node or if the kernel has to allocate memory
> on a certain node for a device.
Great. Thanks for the information; on that basis I would prefer to see this
patch go in.
> > As a side note I doubt any serious size numa hardware will ever be idle
> > enough by swap prefetch standards to even start prefetching swap pages.
> > If you think hardware of this sort is likely to benefit from swap
> > prefetch then perhaps we should look at relaxing the conditions under
> > which prefetching occurs.
>
> Small scale NUMA machines may benefit from swap prefetch but on larger
> machines people usually try to avoid swap altogether.
Then I won't alter the when-to-prefetch algorithm.
Thanks!
Con
On Fri, 11 Nov 2005 10:13 am, Christoph Lameter wrote:
> On Fri, 11 Nov 2005, Con Kolivas wrote:
> > > This patch splits the counter into the nr_local_slab which reflects
> > > slab pages allocated from the local zones (and this number is useful
> > > at least as a guidance for the VM) and the remotely allocated pages.
> >
> > How large a contribution is the remote slab size likely to be? Would this
> > information be useful to anyone potentially in future code besides swap
> > prefetch? The nature of prefetch is that this is only a fairly coarse
> > measure of how full the vm is with data we don't want to displace. Thus
> > it is also not important that it is very accurate.
>
> The size of the remote cache depends on many factors. The application can
> influence that by setting memory policies.
>
> > Unless the remote slab size can be a very large contribution, or having
> > local
>
> Yes it can be quite large. On some of my tests with applications these are
> 100%. This is typical if the application sets the policy in such a way
> that all allocations are off node or if the kernel has to allocate memory
> on a certain node for a device.
One last thing. Swap prefetch works off the accounting of total memory and is
only a single kernel thread rather than a thread per cpu or per pgdat unlike
kswapd. Currently it just cares about total slab data and total ram.
Depending on where this thread is scheduled (which node) your accounting
change will alter the behaviour of it. Does this affect the relevance of this
patch to you?
Cheers,
Con
On Fri, 11 Nov 2005, Con Kolivas wrote:
> One last thing. Swap prefetch works off the accounting of total memory and is
> only a single kernel thread rather than a thread per cpu or per pgdat unlike
> kswapd. Currently it just cares about total slab data and total ram.
> Depending on where this thread is scheduled (which node) your accounting
> change will alter the behaviour of it. Does this affect the relevance of this
> patch to you?
Yes, if it's a truly global value then we would not need the patch.
But then the prefetch code would have to add up the nr_slab fields of
all processors and use that result for comparison. If you do this in a
node-specific fashion then the problem comes up again.