2005-03-10 20:49:35

by Christoph Lameter

[permalink] [raw]
Subject: [PATCH] add a clear_pages function to clear pages of higher order

The zeroing of a page of an arbitrary order in page_alloc.c and in hugetlb.c may benefit from a
clear_page that is capable of zeroing multiple pages at once. The following patch adds
a function "clear_pages" that is capable of clearing multiple contiguous pages at once.

This used to be part of the prezeroing patchset but there may be benefits
to huge pages and regular kernel code as well. Also Mel Gorman's patchset
to reduce fragmentation and introduce prezeroing in a different way may
benefit from this patch. The patch only provides a clear_pages function
for ia32, ia64, x86_64 and sparc64 (all tested). Other platforms may
provide a clear_pages function by defining __HAVE_ARCH_CLEAR_PAGES.

Patch against 2.6.11-bk6

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.11/mm/page_alloc.c
===================================================================
--- linux-2.6.11.orig/mm/page_alloc.c 2005-03-10 10:57:06.000000000 -0800
+++ linux-2.6.11/mm/page_alloc.c 2005-03-10 10:57:10.000000000 -0800
@@ -628,11 +628,19 @@ void fastcall free_cold_page(struct page
free_hot_cold_page(page, 1);
}

-static inline void prep_zero_page(struct page *page, int order, int gfp_flags)
+void prep_zero_page(struct page *page, unsigned int order, unsigned int gfp_flags)
{
int i;

BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
+
+#ifdef __HAVE_ARCH_CLEAR_PAGES
+ if (!PageHighMem(page)) {
+ clear_pages(page_address(page), order);
+ return;
+ }
+#endif
+
for(i = 0; i < (1 << order); i++)
clear_highpage(page + i);
}
Index: linux-2.6.11/mm/hugetlb.c
===================================================================
--- linux-2.6.11.orig/mm/hugetlb.c 2005-03-01 23:38:12.000000000 -0800
+++ linux-2.6.11/mm/hugetlb.c 2005-03-10 10:57:10.000000000 -0800
@@ -78,7 +78,6 @@ void free_huge_page(struct page *page)
struct page *alloc_huge_page(void)
{
struct page *page;
- int i;

spin_lock(&hugetlb_lock);
page = dequeue_huge_page();
@@ -89,8 +88,7 @@ struct page *alloc_huge_page(void)
spin_unlock(&hugetlb_lock);
set_page_count(page, 1);
page[1].mapping = (void *)free_huge_page;
- for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
- clear_highpage(&page[i]);
+ prep_zero_page(page, HUGETLB_PAGE_ORDER, GFP_HIGHUSER);
return page;
}

Index: linux-2.6.11/include/asm-ia64/page.h
===================================================================
--- linux-2.6.11.orig/include/asm-ia64/page.h 2005-03-01 23:37:48.000000000 -0800
+++ linux-2.6.11/include/asm-ia64/page.h 2005-03-10 10:57:10.000000000 -0800
@@ -56,8 +56,10 @@
# ifdef __KERNEL__
# define STRICT_MM_TYPECHECKS

-extern void clear_page (void *page);
+extern void clear_pages (void *page, int order);
extern void copy_page (void *to, void *from);
+#define clear_page(__page) clear_pages(__page, 0)
+#define __HAVE_ARCH_CLEAR_PAGES

/*
* clear_user_page() and copy_user_page() can't be inline functions because
Index: linux-2.6.11/arch/ia64/kernel/ia64_ksyms.c
===================================================================
--- linux-2.6.11.orig/arch/ia64/kernel/ia64_ksyms.c 2005-03-01 23:38:08.000000000 -0800
+++ linux-2.6.11/arch/ia64/kernel/ia64_ksyms.c 2005-03-10 10:57:10.000000000 -0800
@@ -38,7 +38,7 @@ EXPORT_SYMBOL(__down_trylock);
EXPORT_SYMBOL(__up);

#include <asm/page.h>
-EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(clear_pages);

#ifdef CONFIG_VIRTUAL_MEM_MAP
#include <linux/bootmem.h>
Index: linux-2.6.11/arch/ia64/lib/clear_page.S
===================================================================
--- linux-2.6.11.orig/arch/ia64/lib/clear_page.S 2005-03-01 23:37:47.000000000 -0800
+++ linux-2.6.11/arch/ia64/lib/clear_page.S 2005-03-10 10:57:10.000000000 -0800
@@ -7,6 +7,7 @@
* 1/06/01 davidm Tuned for Itanium.
* 2/12/02 kchen Tuned for both Itanium and McKinley
* 3/08/02 davidm Some more tweaking
+ * 12/10/04 clameter Make it work on pages of order size
*/
#include <linux/config.h>

@@ -29,27 +30,33 @@
#define dst4 r11

#define dst_last r31
+#define totsize r14

-GLOBAL_ENTRY(clear_page)
+GLOBAL_ENTRY(clear_pages)
.prologue
- .regstk 1,0,0,0
- mov r16 = PAGE_SIZE/L3_LINE_SIZE-1 // main loop count, -1=repeat/until
+ .regstk 2,0,0,0
+ mov r16 = PAGE_SIZE/L3_LINE_SIZE // main loop count
+ mov totsize = PAGE_SIZE
.save ar.lc, saved_lc
mov saved_lc = ar.lc
-
+ ;;
.body
+ adds dst1 = 16, in0
mov ar.lc = (PREFETCH_LINES - 1)
mov dst_fetch = in0
- adds dst1 = 16, in0
adds dst2 = 32, in0
+ shl r16 = r16, in1
+ shl totsize = totsize, in1
;;
.fetch: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
adds dst3 = 48, in0 // executing this multiple times is harmless
br.cloop.sptk.few .fetch
+ add r16 = -1,r16
+ add dst_last = totsize, dst_fetch
+ adds dst4 = 64, in0
;;
- addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch
mov ar.lc = r16 // one L3 line per iteration
- adds dst4 = 64, in0
+ adds dst_last = -PREFETCH_LINES*L3_LINE_SIZE, dst_last
;;
#ifdef CONFIG_ITANIUM
// Optimized for Itanium
@@ -74,4 +81,4 @@ GLOBAL_ENTRY(clear_page)
;;
mov ar.lc = saved_lc // restore lc
br.ret.sptk.many rp
-END(clear_page)
+END(clear_pages)
Index: linux-2.6.11/include/asm-i386/page.h
===================================================================
--- linux-2.6.11.orig/include/asm-i386/page.h 2005-03-01 23:37:49.000000000 -0800
+++ linux-2.6.11/include/asm-i386/page.h 2005-03-10 10:57:10.000000000 -0800
@@ -18,7 +18,7 @@

#include <asm/mmx.h>

-#define clear_page(page) mmx_clear_page((void *)(page))
+#define clear_pages(page, order) mmx_clear_page((void *)(page),order)
#define copy_page(to,from) mmx_copy_page(to,from)

#else
@@ -28,11 +28,13 @@
* Maybe the K6-III ?
*/

-#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
+#define clear_pages(page, order) memset((void *)(page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE)

#endif

+#define __HAVE_ARCH_CLEAR_PAGES
+#define clear_page(page) clear_pages(page, 0)
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

Index: linux-2.6.11/include/asm-i386/mmx.h
===================================================================
--- linux-2.6.11.orig/include/asm-i386/mmx.h 2005-03-01 23:38:09.000000000 -0800
+++ linux-2.6.11/include/asm-i386/mmx.h 2005-03-10 10:57:10.000000000 -0800
@@ -8,7 +8,7 @@
#include <linux/types.h>

extern void *_mmx_memcpy(void *to, const void *from, size_t size);
-extern void mmx_clear_page(void *page);
+extern void mmx_clear_page(void *page, int order);
extern void mmx_copy_page(void *to, void *from);

#endif
Index: linux-2.6.11/arch/i386/lib/mmx.c
===================================================================
--- linux-2.6.11.orig/arch/i386/lib/mmx.c 2005-03-01 23:38:09.000000000 -0800
+++ linux-2.6.11/arch/i386/lib/mmx.c 2005-03-10 10:57:10.000000000 -0800
@@ -128,7 +128,7 @@ void *_mmx_memcpy(void *to, const void *
* other MMX using processors do not.
*/

-static void fast_clear_page(void *page)
+static void fast_clear_page(void *page, int order)
{
int i;

@@ -138,7 +138,7 @@ static void fast_clear_page(void *page)
" pxor %%mm0, %%mm0\n" : :
);

- for(i=0;i<4096/64;i++)
+ for(i=0;i<((4096/64) << order);i++)
{
__asm__ __volatile__ (
" movntq %%mm0, (%0)\n"
@@ -257,7 +257,7 @@ static void fast_copy_page(void *to, voi
* Generic MMX implementation without K7 specific streaming
*/

-static void fast_clear_page(void *page)
+static void fast_clear_page(void *page, int order)
{
int i;

@@ -267,7 +267,7 @@ static void fast_clear_page(void *page)
" pxor %%mm0, %%mm0\n" : :
);

- for(i=0;i<4096/128;i++)
+ for(i=0;i<((4096/128) << order);i++)
{
__asm__ __volatile__ (
" movq %%mm0, (%0)\n"
@@ -359,23 +359,23 @@ static void fast_copy_page(void *to, voi
* Favour MMX for page clear and copy.
*/

-static void slow_zero_page(void * page)
+static void slow_clear_page(void * page, int order)
{
int d0, d1;
__asm__ __volatile__( \
"cld\n\t" \
"rep ; stosl" \
: "=&c" (d0), "=&D" (d1)
- :"a" (0),"1" (page),"0" (1024)
+ :"a" (0),"1" (page),"0" (1024 << order)
:"memory");
}
-
-void mmx_clear_page(void * page)
+
+void mmx_clear_page(void * page, int order)
{
if(unlikely(in_interrupt()))
- slow_zero_page(page);
+ slow_clear_page(page, order);
else
- fast_clear_page(page);
+ fast_clear_page(page, order);
}

static void slow_copy_page(void *to, void *from)
Index: linux-2.6.11/include/asm-x86_64/page.h
===================================================================
--- linux-2.6.11.orig/include/asm-x86_64/page.h 2005-03-01 23:37:47.000000000 -0800
+++ linux-2.6.11/include/asm-x86_64/page.h 2005-03-10 10:57:10.000000000 -0800
@@ -32,8 +32,10 @@
#ifdef __KERNEL__
#ifndef __ASSEMBLY__

-void clear_page(void *);
+void clear_pages(void *, int);
void copy_page(void *, void *);
+#define __HAVE_ARCH_CLEAR_PAGES
+#define clear_page(__page) clear_pages(__page, 0)

#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
Index: linux-2.6.11/arch/x86_64/kernel/x8664_ksyms.c
===================================================================
--- linux-2.6.11.orig/arch/x86_64/kernel/x8664_ksyms.c 2005-03-01 23:37:49.000000000 -0800
+++ linux-2.6.11/arch/x86_64/kernel/x8664_ksyms.c 2005-03-10 10:57:10.000000000 -0800
@@ -108,7 +108,7 @@ EXPORT_SYMBOL(pci_mem_start);
#endif

EXPORT_SYMBOL(copy_page);
-EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(clear_pages);

EXPORT_SYMBOL(cpu_pda);
#ifdef CONFIG_SMP
Index: linux-2.6.11/arch/x86_64/lib/clear_page.S
===================================================================
--- linux-2.6.11.orig/arch/x86_64/lib/clear_page.S 2005-03-01 23:38:08.000000000 -0800
+++ linux-2.6.11/arch/x86_64/lib/clear_page.S 2005-03-10 10:57:10.000000000 -0800
@@ -1,12 +1,16 @@
/*
* Zero a page.
* rdi page
+ * rsi order
*/
- .globl clear_page
+ .globl clear_pages
.p2align 4
-clear_page:
+clear_pages:
+ movl $4096/64,%eax
+ movl %esi, %ecx
+ shll %cl, %eax
+ movl %eax, %ecx
xorl %eax,%eax
- movl $4096/64,%ecx
.p2align 4
.Lloop:
decl %ecx
@@ -23,7 +27,7 @@ clear_page:
jnz .Lloop
nop
ret
-clear_page_end:
+clear_pages_end:

/* C stepping K8 run faster using the string instructions.
It is also a lot simpler. Use this when possible */
@@ -32,19 +36,22 @@ clear_page_end:

.section .altinstructions,"a"
.align 8
- .quad clear_page
- .quad clear_page_c
+ .quad clear_pages
+ .quad clear_pages_c
.byte X86_FEATURE_K8_C
- .byte clear_page_end-clear_page
- .byte clear_page_c_end-clear_page_c
+ .byte clear_pages_end-clear_pages
+ .byte clear_pages_c_end-clear_pages_c
.previous

.section .altinstr_replacement,"ax"
-clear_page_c:
- movl $4096/8,%ecx
+clear_pages_c:
+ movl $4096/8,%eax
+ movl %esi, %ecx
+ shll %cl, %eax
+ movl %eax, %ecx
xorl %eax,%eax
rep
stosq
ret
-clear_page_c_end:
+clear_pages_c_end:
.previous
Index: linux-2.6.11/arch/sparc64/lib/clear_page.S
===================================================================
--- linux-2.6.11.orig/arch/sparc64/lib/clear_page.S 2005-03-01 23:38:17.000000000 -0800
+++ linux-2.6.11/arch/sparc64/lib/clear_page.S 2005-03-10 10:57:10.000000000 -0800
@@ -28,9 +28,12 @@
.text

.globl _clear_page
-_clear_page: /* %o0=dest */
+_clear_page: /* %o0=dest, %o1=order */
+ sethi %hi(PAGE_SIZE/64), %o2
+ clr %o4
+ or %o2, %lo(PAGE_SIZE/64), %o2
ba,pt %xcc, clear_page_common
- clr %o4
+ sllx %o2, %o1, %o1

/* This thing is pretty important, it shows up
* on the profiles via do_anonymous_page().
@@ -69,16 +72,16 @@ clear_user_page: /* %o0=dest, %o1=vaddr
flush %g6
wrpr %o4, 0x0, %pstate

+ sethi %hi(PAGE_SIZE/64), %o1
mov 1, %o4
+ or %o1, %lo(PAGE_SIZE/64), %o1

clear_page_common:
VISEntryHalf
membar #StoreLoad | #StoreStore | #LoadStore
fzero %f0
- sethi %hi(PAGE_SIZE/64), %o1
mov %o0, %g1 ! remember vaddr for tlbflush
fzero %f2
- or %o1, %lo(PAGE_SIZE/64), %o1
faddd %f0, %f2, %f4
fmuld %f0, %f2, %f6
faddd %f0, %f2, %f8
Index: linux-2.6.11/include/asm-sparc64/page.h
===================================================================
--- linux-2.6.11.orig/include/asm-sparc64/page.h 2005-03-01 23:38:07.000000000 -0800
+++ linux-2.6.11/include/asm-sparc64/page.h 2005-03-10 10:57:10.000000000 -0800
@@ -14,8 +14,10 @@

#ifndef __ASSEMBLY__

-extern void _clear_page(void *page);
-#define clear_page(X) _clear_page((void *)(X))
+extern void _clear_page(void *page, int order);
+#define clear_page(X) _clear_page((void *)(X), 0)
+#define clear_pages _clear_page
+
struct page;
extern void clear_user_page(void *addr, unsigned long vaddr, struct page *page);
#define copy_page(X,Y) memcpy((void *)(X), (void *)(Y), PAGE_SIZE)
Index: linux-2.6.11/include/linux/gfp.h
===================================================================
--- linux-2.6.11.orig/include/linux/gfp.h 2005-03-01 23:37:50.000000000 -0800
+++ linux-2.6.11/include/linux/gfp.h 2005-03-10 10:57:10.000000000 -0800
@@ -131,4 +131,5 @@ extern void FASTCALL(free_cold_page(stru

void page_alloc_init(void);

+void prep_zero_page(struct page *, unsigned int order, unsigned int gfp_flags);
#endif /* __LINUX_GFP_H */


2005-03-10 21:42:27

by Dave Hansen

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

On Thu, 2005-03-10 at 12:35 -0800, Christoph Lameter wrote:
> +#ifdef __HAVE_ARCH_CLEAR_PAGES
> + if (!PageHighMem(page)) {
> + clear_pages(page_address(page), order);
> + return;
> + }
> +#endif
> +
> for(i = 0; i < (1 << order); i++)
> clear_highpage(page + i);
> }
...
> --- linux-2.6.11.orig/include/asm-ia64/page.h 2005-03-01 23:37:48.000000000 -0800
> +++ linux-2.6.11/include/asm-ia64/page.h 2005-03-10 10:57:10.000000000 -0800
> @@ -56,8 +56,10 @@
> # ifdef __KERNEL__
> # define STRICT_MM_TYPECHECKS
>
> -extern void clear_page (void *page);
> +extern void clear_pages (void *page, int order);
> extern void copy_page (void *to, void *from);
> +#define clear_page(__page) clear_pages(__page, 0)
> +#define __HAVE_ARCH_CLEAR_PAGES

Although this is a simple instance, could this please be done in a
Kconfig file? If that #define happens inside of other #ifdefs, it can
be quite hard to decipher the special .config incantation to get it set.
On the other hand, if the dependencies are spelled out in a Kconfig
entry...

BTW, I tried applying this to 2.6.11-bk6, and it rejected:
...
patching file include/asm-i386/page.h
Hunk #2 FAILED at 28.
1 out of 2 hunks FAILED -- saving rejects to file
include/asm-i386/page.h.rej
...

There were some more rejects as well. Were there some other patches
applied first?

-- Dave

2005-03-10 22:56:01

by Christoph Lameter

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

On Thu, 10 Mar 2005, Dave Hansen wrote:

> > +extern void clear_pages (void *page, int order);
> > extern void copy_page (void *to, void *from);
> > +#define clear_page(__page) clear_pages(__page, 0)
> > +#define __HAVE_ARCH_CLEAR_PAGES
>
> Although this is a simple instance, could this please be done in a
> Kconfig file? If that #define happens inside of other #ifdefs, it can
> be quite hard to decipher the special .config incantation to get it set.
> On the other hand, if the dependencies are spelled out in a Kconfig
> entry...

Ok will do.

> BTW, I tried applying this to 2.6.11-bk6, and it rejected:
> ...
> patching file include/asm-i386/page.h
> Hunk #2 FAILED at 28.
> 1 out of 2 hunks FAILED -- saving rejects to file
> include/asm-i386/page.h.rej
> ...
>
> There were some more rejects as well. Were there some other patches
> applied first?

Patches work fine here.

2005-03-11 01:06:37

by Christoph Lameter

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

Changelog:
- use Kconfig and CONFIG_CLEAR_PAGES

The zeroing of a page of an arbitrary order in page_alloc.c and in hugetlb.c may benefit from a
clear_page that is capable of zeroing multiple pages at once. The following patch adds
a function "clear_pages" that is capable of clearing multiple contiguous pages at once.

Patch against 2.6.11-bk6

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.11/mm/page_alloc.c
===================================================================
--- linux-2.6.11.orig/mm/page_alloc.c 2005-03-10 14:42:43.000000000 -0800
+++ linux-2.6.11/mm/page_alloc.c 2005-03-10 15:01:53.000000000 -0800
@@ -628,11 +628,19 @@ void fastcall free_cold_page(struct page
free_hot_cold_page(page, 1);
}

-static inline void prep_zero_page(struct page *page, int order, int gfp_flags)
+void prep_zero_page(struct page *page, unsigned int order, unsigned int gfp_flags)
{
int i;

BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
+
+#ifdef CONFIG_CLEAR_PAGES
+ if (!PageHighMem(page)) {
+ clear_pages(page_address(page), order);
+ return;
+ }
+#endif
+
for(i = 0; i < (1 << order); i++)
clear_highpage(page + i);
}
Index: linux-2.6.11/mm/hugetlb.c
===================================================================
--- linux-2.6.11.orig/mm/hugetlb.c 2005-03-01 23:38:12.000000000 -0800
+++ linux-2.6.11/mm/hugetlb.c 2005-03-10 15:01:53.000000000 -0800
@@ -78,7 +78,6 @@ void free_huge_page(struct page *page)
struct page *alloc_huge_page(void)
{
struct page *page;
- int i;

spin_lock(&hugetlb_lock);
page = dequeue_huge_page();
@@ -89,8 +88,7 @@ struct page *alloc_huge_page(void)
spin_unlock(&hugetlb_lock);
set_page_count(page, 1);
page[1].mapping = (void *)free_huge_page;
- for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
- clear_highpage(&page[i]);
+ prep_zero_page(page, HUGETLB_PAGE_ORDER, GFP_HIGHUSER);
return page;
}

Index: linux-2.6.11/include/asm-ia64/page.h
===================================================================
--- linux-2.6.11.orig/include/asm-ia64/page.h 2005-03-01 23:37:48.000000000 -0800
+++ linux-2.6.11/include/asm-ia64/page.h 2005-03-10 15:02:47.000000000 -0800
@@ -56,8 +56,9 @@
# ifdef __KERNEL__
# define STRICT_MM_TYPECHECKS

-extern void clear_page (void *page);
+extern void clear_pages (void *page, int order);
extern void copy_page (void *to, void *from);
+#define clear_page(__page) clear_pages(__page, 0)

/*
* clear_user_page() and copy_user_page() can't be inline functions because
Index: linux-2.6.11/arch/ia64/kernel/ia64_ksyms.c
===================================================================
--- linux-2.6.11.orig/arch/ia64/kernel/ia64_ksyms.c 2005-03-01 23:38:08.000000000 -0800
+++ linux-2.6.11/arch/ia64/kernel/ia64_ksyms.c 2005-03-10 15:01:53.000000000 -0800
@@ -38,7 +38,7 @@ EXPORT_SYMBOL(__down_trylock);
EXPORT_SYMBOL(__up);

#include <asm/page.h>
-EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(clear_pages);

#ifdef CONFIG_VIRTUAL_MEM_MAP
#include <linux/bootmem.h>
Index: linux-2.6.11/arch/ia64/lib/clear_page.S
===================================================================
--- linux-2.6.11.orig/arch/ia64/lib/clear_page.S 2005-03-01 23:37:47.000000000 -0800
+++ linux-2.6.11/arch/ia64/lib/clear_page.S 2005-03-10 15:01:53.000000000 -0800
@@ -7,6 +7,7 @@
* 1/06/01 davidm Tuned for Itanium.
* 2/12/02 kchen Tuned for both Itanium and McKinley
* 3/08/02 davidm Some more tweaking
+ * 12/10/04 clameter Make it work on pages of order size
*/
#include <linux/config.h>

@@ -29,27 +30,33 @@
#define dst4 r11

#define dst_last r31
+#define totsize r14

-GLOBAL_ENTRY(clear_page)
+GLOBAL_ENTRY(clear_pages)
.prologue
- .regstk 1,0,0,0
- mov r16 = PAGE_SIZE/L3_LINE_SIZE-1 // main loop count, -1=repeat/until
+ .regstk 2,0,0,0
+ mov r16 = PAGE_SIZE/L3_LINE_SIZE // main loop count
+ mov totsize = PAGE_SIZE
.save ar.lc, saved_lc
mov saved_lc = ar.lc
-
+ ;;
.body
+ adds dst1 = 16, in0
mov ar.lc = (PREFETCH_LINES - 1)
mov dst_fetch = in0
- adds dst1 = 16, in0
adds dst2 = 32, in0
+ shl r16 = r16, in1
+ shl totsize = totsize, in1
;;
.fetch: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
adds dst3 = 48, in0 // executing this multiple times is harmless
br.cloop.sptk.few .fetch
+ add r16 = -1,r16
+ add dst_last = totsize, dst_fetch
+ adds dst4 = 64, in0
;;
- addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch
mov ar.lc = r16 // one L3 line per iteration
- adds dst4 = 64, in0
+ adds dst_last = -PREFETCH_LINES*L3_LINE_SIZE, dst_last
;;
#ifdef CONFIG_ITANIUM
// Optimized for Itanium
@@ -74,4 +81,4 @@ GLOBAL_ENTRY(clear_page)
;;
mov ar.lc = saved_lc // restore lc
br.ret.sptk.many rp
-END(clear_page)
+END(clear_pages)
Index: linux-2.6.11/include/asm-i386/page.h
===================================================================
--- linux-2.6.11.orig/include/asm-i386/page.h 2005-03-01 23:37:49.000000000 -0800
+++ linux-2.6.11/include/asm-i386/page.h 2005-03-10 15:02:59.000000000 -0800
@@ -18,7 +18,7 @@

#include <asm/mmx.h>

-#define clear_page(page) mmx_clear_page((void *)(page))
+#define clear_pages(page, order) mmx_clear_page((void *)(page),order)
#define copy_page(to,from) mmx_copy_page(to,from)

#else
@@ -28,11 +28,12 @@
* Maybe the K6-III ?
*/

-#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
+#define clear_pages(page, order) memset((void *)(page), 0, PAGE_SIZE << (order))
#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE)

#endif

+#define clear_page(page) clear_pages(page, 0)
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)

Index: linux-2.6.11/include/asm-i386/mmx.h
===================================================================
--- linux-2.6.11.orig/include/asm-i386/mmx.h 2005-03-01 23:38:09.000000000 -0800
+++ linux-2.6.11/include/asm-i386/mmx.h 2005-03-10 15:01:53.000000000 -0800
@@ -8,7 +8,7 @@
#include <linux/types.h>

extern void *_mmx_memcpy(void *to, const void *from, size_t size);
-extern void mmx_clear_page(void *page);
+extern void mmx_clear_page(void *page, int order);
extern void mmx_copy_page(void *to, void *from);

#endif
Index: linux-2.6.11/arch/i386/lib/mmx.c
===================================================================
--- linux-2.6.11.orig/arch/i386/lib/mmx.c 2005-03-01 23:38:09.000000000 -0800
+++ linux-2.6.11/arch/i386/lib/mmx.c 2005-03-10 15:01:53.000000000 -0800
@@ -128,7 +128,7 @@ void *_mmx_memcpy(void *to, const void *
* other MMX using processors do not.
*/

-static void fast_clear_page(void *page)
+static void fast_clear_page(void *page, int order)
{
int i;

@@ -138,7 +138,7 @@ static void fast_clear_page(void *page)
" pxor %%mm0, %%mm0\n" : :
);

- for(i=0;i<4096/64;i++)
+ for(i=0;i<((4096/64) << order);i++)
{
__asm__ __volatile__ (
" movntq %%mm0, (%0)\n"
@@ -257,7 +257,7 @@ static void fast_copy_page(void *to, voi
* Generic MMX implementation without K7 specific streaming
*/

-static void fast_clear_page(void *page)
+static void fast_clear_page(void *page, int order)
{
int i;

@@ -267,7 +267,7 @@ static void fast_clear_page(void *page)
" pxor %%mm0, %%mm0\n" : :
);

- for(i=0;i<4096/128;i++)
+ for(i=0;i<((4096/128) << order);i++)
{
__asm__ __volatile__ (
" movq %%mm0, (%0)\n"
@@ -359,23 +359,23 @@ static void fast_copy_page(void *to, voi
* Favour MMX for page clear and copy.
*/

-static void slow_zero_page(void * page)
+static void slow_clear_page(void * page, int order)
{
int d0, d1;
__asm__ __volatile__( \
"cld\n\t" \
"rep ; stosl" \
: "=&c" (d0), "=&D" (d1)
- :"a" (0),"1" (page),"0" (1024)
+ :"a" (0),"1" (page),"0" (1024 << order)
:"memory");
}
-
-void mmx_clear_page(void * page)
+
+void mmx_clear_page(void * page, int order)
{
if(unlikely(in_interrupt()))
- slow_zero_page(page);
+ slow_clear_page(page, order);
else
- fast_clear_page(page);
+ fast_clear_page(page, order);
}

static void slow_copy_page(void *to, void *from)
Index: linux-2.6.11/include/asm-x86_64/page.h
===================================================================
--- linux-2.6.11.orig/include/asm-x86_64/page.h 2005-03-01 23:37:47.000000000 -0800
+++ linux-2.6.11/include/asm-x86_64/page.h 2005-03-10 15:03:10.000000000 -0800
@@ -32,8 +32,9 @@
#ifdef __KERNEL__
#ifndef __ASSEMBLY__

-void clear_page(void *);
+void clear_pages(void *, int);
void copy_page(void *, void *);
+#define clear_page(__page) clear_pages(__page, 0)

#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
Index: linux-2.6.11/arch/x86_64/kernel/x8664_ksyms.c
===================================================================
--- linux-2.6.11.orig/arch/x86_64/kernel/x8664_ksyms.c 2005-03-01 23:37:49.000000000 -0800
+++ linux-2.6.11/arch/x86_64/kernel/x8664_ksyms.c 2005-03-10 15:01:53.000000000 -0800
@@ -108,7 +108,7 @@ EXPORT_SYMBOL(pci_mem_start);
#endif

EXPORT_SYMBOL(copy_page);
-EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(clear_pages);

EXPORT_SYMBOL(cpu_pda);
#ifdef CONFIG_SMP
Index: linux-2.6.11/arch/x86_64/lib/clear_page.S
===================================================================
--- linux-2.6.11.orig/arch/x86_64/lib/clear_page.S 2005-03-01 23:38:08.000000000 -0800
+++ linux-2.6.11/arch/x86_64/lib/clear_page.S 2005-03-10 15:01:53.000000000 -0800
@@ -1,12 +1,16 @@
/*
* Zero a page.
* rdi page
+ * rsi order
*/
- .globl clear_page
+ .globl clear_pages
.p2align 4
-clear_page:
+clear_pages:
+ movl $4096/64,%eax
+ movl %esi, %ecx
+ shll %cl, %eax
+ movl %eax, %ecx
xorl %eax,%eax
- movl $4096/64,%ecx
.p2align 4
.Lloop:
decl %ecx
@@ -23,7 +27,7 @@ clear_page:
jnz .Lloop
nop
ret
-clear_page_end:
+clear_pages_end:

/* C stepping K8 run faster using the string instructions.
It is also a lot simpler. Use this when possible */
@@ -32,19 +36,22 @@ clear_page_end:

.section .altinstructions,"a"
.align 8
- .quad clear_page
- .quad clear_page_c
+ .quad clear_pages
+ .quad clear_pages_c
.byte X86_FEATURE_K8_C
- .byte clear_page_end-clear_page
- .byte clear_page_c_end-clear_page_c
+ .byte clear_pages_end-clear_pages
+ .byte clear_pages_c_end-clear_pages_c
.previous

.section .altinstr_replacement,"ax"
-clear_page_c:
- movl $4096/8,%ecx
+clear_pages_c:
+ movl $4096/8,%eax
+ movl %esi, %ecx
+ shll %cl, %eax
+ movl %eax, %ecx
xorl %eax,%eax
rep
stosq
ret
-clear_page_c_end:
+clear_pages_c_end:
.previous
Index: linux-2.6.11/arch/sparc64/lib/clear_page.S
===================================================================
--- linux-2.6.11.orig/arch/sparc64/lib/clear_page.S 2005-03-01 23:38:17.000000000 -0800
+++ linux-2.6.11/arch/sparc64/lib/clear_page.S 2005-03-10 15:01:53.000000000 -0800
@@ -28,9 +28,12 @@
.text

.globl _clear_page
-_clear_page: /* %o0=dest */
+_clear_page: /* %o0=dest, %o1=order */
+ sethi %hi(PAGE_SIZE/64), %o2
+ clr %o4
+ or %o2, %lo(PAGE_SIZE/64), %o2
ba,pt %xcc, clear_page_common
- clr %o4
+ sllx %o2, %o1, %o1

/* This thing is pretty important, it shows up
* on the profiles via do_anonymous_page().
@@ -69,16 +72,16 @@ clear_user_page: /* %o0=dest, %o1=vaddr
flush %g6
wrpr %o4, 0x0, %pstate

+ sethi %hi(PAGE_SIZE/64), %o1
mov 1, %o4
+ or %o1, %lo(PAGE_SIZE/64), %o1

clear_page_common:
VISEntryHalf
membar #StoreLoad | #StoreStore | #LoadStore
fzero %f0
- sethi %hi(PAGE_SIZE/64), %o1
mov %o0, %g1 ! remember vaddr for tlbflush
fzero %f2
- or %o1, %lo(PAGE_SIZE/64), %o1
faddd %f0, %f2, %f4
fmuld %f0, %f2, %f6
faddd %f0, %f2, %f8
Index: linux-2.6.11/include/asm-sparc64/page.h
===================================================================
--- linux-2.6.11.orig/include/asm-sparc64/page.h 2005-03-01 23:38:07.000000000 -0800
+++ linux-2.6.11/include/asm-sparc64/page.h 2005-03-10 15:03:43.000000000 -0800
@@ -14,8 +14,10 @@

#ifndef __ASSEMBLY__

-extern void _clear_page(void *page);
-#define clear_page(X) _clear_page((void *)(X))
+extern void _clear_page(void *page, int order);
+#define clear_page(X) _clear_page((void *)(X), 0)
+#define clear_pages _clear_page
+
struct page;
extern void clear_user_page(void *addr, unsigned long vaddr, struct page *page);
#define copy_page(X,Y) memcpy((void *)(X), (void *)(Y), PAGE_SIZE)
Index: linux-2.6.11/include/linux/gfp.h
===================================================================
--- linux-2.6.11.orig/include/linux/gfp.h 2005-03-01 23:37:50.000000000 -0800
+++ linux-2.6.11/include/linux/gfp.h 2005-03-10 15:01:53.000000000 -0800
@@ -131,4 +131,5 @@ extern void FASTCALL(free_cold_page(stru

void page_alloc_init(void);

+void prep_zero_page(struct page *, unsigned int order, unsigned int gfp_flags);
#endif /* __LINUX_GFP_H */
Index: linux-2.6.11/arch/x86_64/Kconfig
===================================================================
--- linux-2.6.11.orig/arch/x86_64/Kconfig 2005-03-10 14:42:41.000000000 -0800
+++ linux-2.6.11/arch/x86_64/Kconfig 2005-03-10 15:01:53.000000000 -0800
@@ -78,6 +78,10 @@ config GENERIC_IOMAP
bool
default y

+config CLEAR_PAGES
+ bool
+ default y
+
source "init/Kconfig"


Index: linux-2.6.11/arch/i386/Kconfig
===================================================================
--- linux-2.6.11.orig/arch/i386/Kconfig 2005-03-10 14:42:41.000000000 -0800
+++ linux-2.6.11/arch/i386/Kconfig 2005-03-10 15:01:53.000000000 -0800
@@ -33,6 +33,10 @@ config GENERIC_IOMAP
bool
default y

+config CLEAR_PAGES
+ bool
+ default y
+
source "init/Kconfig"

menu "Processor type and features"
Index: linux-2.6.11/arch/ia64/Kconfig
===================================================================
--- linux-2.6.11.orig/arch/ia64/Kconfig 2005-03-01 23:38:26.000000000 -0800
+++ linux-2.6.11/arch/ia64/Kconfig 2005-03-10 15:01:53.000000000 -0800
@@ -46,6 +46,10 @@ config GENERIC_IOMAP
bool
default y

+config CLEAR_PAGES
+ bool
+ default y
+
choice
prompt "System type"
default IA64_GENERIC
Index: linux-2.6.11/arch/sparc64/Kconfig
===================================================================
--- linux-2.6.11.orig/arch/sparc64/Kconfig 2005-03-01 23:38:25.000000000 -0800
+++ linux-2.6.11/arch/sparc64/Kconfig 2005-03-10 15:02:16.000000000 -0800
@@ -16,6 +16,10 @@ config TIME_INTERPOLATION
bool
default y

+config CLEAR_PAGES
+ bool
+ default y
+
source "init/Kconfig"

config SYSVIPC_COMPAT

2005-03-11 08:08:41

by Denis Vlasenko

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

On Friday 11 March 2005 03:03, Christoph Lameter wrote:
> Changelog:
> - use Kconfig and CONFIG_CLEAR_PAGES
>
> The zeroing of a page of a arbitrary order in page_alloc.c and in hugetlb.c may benefit from a
> clear_page that is capable of zeroing multiple pages at once. The following patch adds
> a function "clear_pages" that is capable of clearing multiple continuous pages at once.
>
> Patch against 2.6.11-bk6
>
> Signed-off-by: Christoph Lameter <[email protected]>
[snip]
> -clear_page_end:
> +clear_pages_end:
>
> /* C stepping K8 run faster using the string instructions.
> It is also a lot simpler. Use this when possible */

Andi Kleen (iirc) says that non-temporal stores seem to be
a big win in microbenchmarks (and I second that), but they are
a net loss when we are going to use the zeroed page just after
zeroing. He recommends avoiding non-temporal stores.

With this new page prezeroing infrastructure, that argument
most likely is not right anymore. Especially clearing of
high-order pages definitely will benefit from NT stores
because they do not kill L1 data cache in the process.

I don't have a K8 and therefore cannot be 100% sure, but
I really doubt that the K8 optimizes "rep stosq" into _NT_ stores.

Andi?
--
vda

2005-03-17 01:35:11

by Christoph Lameter

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

On Fri, 11 Mar 2005, Denis Vlasenko wrote:

> Andi Kleen (iirc) says that non-temporal stores seem to be
> big win in microbenchmarks (and I second that), but they are
> a net loss when we are going to use zeroed page just after
> zeroing. He recommends avoid using non-temporal stores
>
> With this new page prezeroing infrastructure, that argument
> most likely is not right anymore. Especially clearing of
> high-order pages definitely will benefit from NT stores
> because they do not kill L1 data cache in the process.
>
> I don't have K8 and therefore cannot be 100% sure, but
> I really doubt that K8 optimize "rep stosq" into _NT_ stores.

Hmm. That would be interesting to know and may be necessary to justify
the continued existence of this patch. I tried to get some numbers on
the performance wins for zeroing larger pages with the patch as is (no
NT stores) and came up with:

Processor Performance Increase
----------------------------------------------------------------
Itanium 2 1.3Ghz M1/R5 1.5%
AMD Athlon 64 3200+ i386 mode 3%
AMD Athlon 64 3200+ x86_64 mode 3.3%

(this is if the zeroing engine is the cpu of course. Prezeroing
may be done through some DMA gizmo independent of the cpu)

Itanium has more extensive optimization capabilities and
seems to be able to better cope with the loop logic for regular
clear_page. Thus the improvement is even less on Itanium.

Numbers were obtained with the following patch, which exposes zeroing
performance data in /proc/meminfo (just divide Cycles by
Pages for clear_page and clear_pages):

Index: linux-2.6.11/mm/page_alloc.c
===================================================================
--- linux-2.6.11.orig/mm/page_alloc.c 2005-03-16 17:12:51.000000000 -0800
+++ linux-2.6.11/mm/page_alloc.c 2005-03-16 17:17:28.000000000 -0800
@@ -633,13 +633,33 @@ void fastcall free_cold_page(struct page
free_hot_cold_page(page, 1);
}

-static inline void prep_zero_page(struct page *page, int order, int gfp_flags)
+void prep_zero_page(struct page *page, unsigned int order, unsigned int gfp_flags)
{
int i;
+ unsigned long t1;

BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
+
+#ifdef CONFIG_CLEAR_PAGES
+ if (!PageHighMem(page) && order>4) {
+ unsigned long t;
+
+ t1=get_cycles();
+ clear_pages(page_address(page), order);
+ t = get_cycles() - t1;
+ add_page_state(clear_pages_cycles, t);
+ add_page_state(clear_pages_order, 1 << order);
+ inc_page_state(clear_pages_nr);
+ return;
+ }
+#endif
+
+ t1=get_cycles();
for(i = 0; i < (1 << order); i++)
clear_highpage(page + i);
+ add_page_state(clear_page_cycles, get_cycles() - t1);
+ add_page_state(clear_page_order, 1 << order);
+ inc_page_state(clear_page_nr);
}

/*
Index: linux-2.6.11/include/linux/page-flags.h
===================================================================
--- linux-2.6.11.orig/include/linux/page-flags.h 2005-03-16 17:12:51.000000000 -0800
+++ linux-2.6.11/include/linux/page-flags.h 2005-03-16 17:13:02.000000000 -0800
@@ -131,6 +131,13 @@ struct page_state {
unsigned long allocstall; /* direct reclaim calls */

unsigned long pgrotated; /* pages rotated to tail of the LRU */
+
+ unsigned long clear_page_nr; /* Nr of clear_page request */
+ unsigned long clear_page_cycles; /* Cycles spent in clear_page */
+ unsigned long clear_page_order; /* Sum of orders */
+ unsigned long clear_pages_nr; /* Nr of clear_pages requests */
+ unsigned long clear_pages_cycles; /* Nr of cycles in clear_pages */
+ unsigned long clear_pages_order; /* Sum of orders */
};

extern void get_page_state(struct page_state *ret);
Index: linux-2.6.11/fs/proc/proc_misc.c
===================================================================
--- linux-2.6.11.orig/fs/proc/proc_misc.c 2005-03-16 17:12:50.000000000 -0800
+++ linux-2.6.11/fs/proc/proc_misc.c 2005-03-16 17:22:18.000000000 -0800
@@ -127,7 +127,7 @@ static int meminfo_read_proc(char *page,
unsigned long allowed;
struct vmalloc_info vmi;

- get_page_state(&ps);
+ get_full_page_state(&ps);
get_zone_counts(&active, &inactive, &free);

/*
@@ -168,7 +168,13 @@ static int meminfo_read_proc(char *page,
"PageTables: %8lu kB\n"
"VmallocTotal: %8lu kB\n"
"VmallocUsed: %8lu kB\n"
- "VmallocChunk: %8lu kB\n",
+ "VmallocChunk: %8lu kB\n"
+ "ClearPage # %8lu\n"
+ "ClearPage Pgs %8lu\n"
+ "ClearPage Cyc %8lu\n"
+ "ClearPages # %8lu\n"
+ "ClearPages Pg %8lu\n"
+ "ClearPages Cy %8lu\n",
K(i.totalram),
K(i.freeram),
K(i.bufferram),
@@ -191,7 +197,13 @@ static int meminfo_read_proc(char *page,
K(ps.nr_page_table_pages),
(unsigned long)VMALLOC_TOTAL >> 10,
vmi.used >> 10,
- vmi.largest_chunk >> 10
+ vmi.largest_chunk >> 10,
+ ps.clear_page_nr,
+ ps.clear_page_order,
+ ps.clear_page_cycles,
+ ps.clear_pages_nr,
+ ps.clear_pages_order,
+ ps.clear_pages_cycles
);

len += hugetlb_report_meminfo(page + len);

2005-03-18 09:58:46

by Denis Vlasenko

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

On Thursday 17 March 2005 03:33, Christoph Lameter wrote:
> On Fri, 11 Mar 2005, Denis Vlasenko wrote:
>
> > Andi Kleen (iirc) says that non-temporal stores seem to be
> > big win in microbenchmarks (and I second that), but they are
> > a net loss when we are going to use zeroed page just after
> > zeroing. He recommends avoid using non-temporal stores
> >
> > With this new page prezeroing infrastructure, that argument
> > most likely is not right anymore. Especially clearing of
> > high-order pages definitely will benefit from NT stores
> > because they do not kill L1 data cache in the process.
> >
> > I don't have K8 and therefore cannot be 100% sure, but
> > I really doubt that K8 optimize "rep stosq" into _NT_ stores.
>
> Hmm. That would be interesting to know and may be necessary to justify
> the continued existence of this patch. I tried to get some numbers on
> the performance wins for zeroing larger pages with the patch as is (no
> NT stores) and came up with:
>
> Processor Performance Increase
> ----------------------------------------------------------------
> Itanium 2 1.3Ghz M1/R5 1.5%
> AMD Athlon 64 3200+ i386 mode 3%
> AMD Athlon 64 3200+ x86_64 mode 3.3%
>
> (this is if the zeroing engine is the cpu of course. Prezeroing
> may be done through some DMA gizmo independent of the cpu)
>
> Itanium has more extensive optimization capabilities and
> seems to be able to better cope with the loop logic for regular
> clear_page. Thus the improvement is even less on Itanium.
>
> Numbers obtained with the following patch that allows to get performance
> data from /proc/meminfo on zeroing performance (just divide Cycles by
> Pages for clear_page and clear_pages):

Here is a patch which allows different page zeroing
optimizations to be tested at runtime via sysctl.
It was run-tested in the 2.6.8 timeframe and has been rediffed to 2.6.11.
Feel free to adapt it to your patch and test.

Also attached is a tarball for microbenchmarking routines. There are two
result files. Duron:

normal_clear_page - took 8644 max, 8400 min cycles per page
repstosl_clear_page - took 8626 max, 8418 min cycles per page
movq_clear_page - took 8647 max, 8300 min cycles per page
movntq_clear_page - took 2777 max, 2720 min cycles per page

And amd64:
normal_clear_page - took 9427 max, 5781 min cycles per page
repstosl_clear_page - took 9305 max, 5680 min cycles per page
movq_clear_page - took 6167 max, 5576 min cycles per page
movntq_clear_page - took 5456 max, 2354 min cycles per page

NT stores are not about a 5% increase; the gain is 200%-300%, provided you are ok with
the fact that the zeroed page ends up evicted from cache. Luckily, this is exactly
what you want with prezeroing.
--
vda


Attachments:
(No filename) (2.75 kB)
x86_SSE_clear_page.2611.patch (18.44 kB)
page_asm.tar.bz2 (5.56 kB)
Download all attachments

2005-03-18 10:12:04

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

> Andi Kleen (iirc) says that non-temporal stores seem to be
> big win in microbenchmarks (and I second that), but they are
> a net loss when we are going to use zeroed page just after
> zeroing. He recommends avoid using non-temporal stores

The rule of thumb is to only use non temporal stores when your
data set is bigger than the L2/L3 caches of the CPU. This means >1MB.
The kernel normally never works on data sets that big.

For Christoph's new background cleaner daemon it may be worth it
when the queue is a LILO (last in, last out). This means it is likely there is a relatively
long time between the clearing operation and a workload using it.
But even then it is a very close call and would need clear benchmark
numbers in macrobenchmarks.

-Andi

2005-03-18 15:01:03

by Christoph Lameter

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

On Fri, 18 Mar 2005, Denis Vlasenko wrote:

> NT stores are not about 5% increase. 200%-300%. Provided you are ok with
> the fact that zeroed page ends up evicted from cache. Luckily, this is exactly
> what you want with prezeroing.

These are pretty significant results. Maybe it's best to use non-temporal
stores in general for clearing pages? I checked and Itanium has always
used non-temporal stores. So there will be no benefit for us from this
approach (we have 16k and 64k page sizes which may make the situation a
bit different). Try to update the i386 architectures to do the same?

Or for prezeroing, you could register a zeroing driver that would use the
non-temporal stores with V8 of the prezeroing patches. In any case the
clear_pages patch is not useful the way it was intended for us and I
have dropped this from the prezeroing patch.

2005-03-18 19:28:35

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

On Fri, Mar 18, 2005 at 07:00:06AM -0800, Christoph Lameter wrote:
> On Fri, 18 Mar 2005, Denis Vlasenko wrote:
>
> > NT stores are not about 5% increase. 200%-300%. Provided you are ok with
> > the fact that zeroed page ends up evicted from cache. Luckily, this is exactly
> > what you want with prezeroing.
>
> These are pretty significant results. Maybe its best to use non-temporal

The differences are actually less. I do not know what Denis benchmarked,
but in my tests the difference was never more than ~10%. Did he get one zero
too many?

It does not make any sense if you think of it - the memory bus
of the CPU cannot be that much faster than the cache.

And the drawback of eating the cache misses later is really very
significant.

> stores in general for clearing pages? I checked and Itanium has always
> used non-temporal stores. So there will be no benefit for us from this

That is weird. I would actually try to switch to temporal stores, maybe
it will improve some benchmarks.

> approach (we have 16k and 64k page sizes which may make the situation a
> bit different). Try to update the i386 architectures to do the same?

Definitely not.

You can experiment with using it for the cleaner daemon, but even
there I would use some heuristic to make sure you only use it
on pages that are at the end of a pretty long queue.

e.g. if you can guarantee that the page allocator will go through
500k-1MB before going to the NT page that is cache cold it may
be a good idea. But that might be pretty complicated and I am not
sure it will be worth it.

But for the clear running in the page fault handler context it is
definitely a bad idea.

-Andi

2005-03-18 20:20:56

by Christoph Lameter

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

On Fri, 18 Mar 2005, Andi Kleen wrote:

> It does not make any sense if you think of it - the memory bus
> of the CPU cannot be that much faster than the cache.

The memory bus would be able to reach a higher rate if properly optimized
for sequential writes to memory. A cache typically does random writes.

2005-03-21 15:31:56

by Denis Vlasenko

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

On Friday 18 March 2005 21:28, Andi Kleen wrote:
> On Fri, Mar 18, 2005 at 07:00:06AM -0800, Christoph Lameter wrote:
> > On Fri, 18 Mar 2005, Denis Vlasenko wrote:
> >
> > > NT stores are not about 5% increase. 200%-300%. Provided you are ok with
> > > the fact that zeroed page ends up evicted from cache. Luckily, this is exactly
> > > what you want with prezeroing.
> >
> > These are pretty significant results. Maybe its best to use non-temporal
>
> The differences are actually less. I do not know what Denis benchmarked,
> but in my tests the difference was never more than ~10%. He got a zero
> too much?

No. See attached.

# gcc -O2 0main.c
# ./a.out
Page clear/copy benchmark program.
buffer size: 1 Mb
Each test tried 64 times, max and min CPU cycles per page are reported.
Please disregard max values. They are due to system interference only.
clear_page() tests:
normal_clear_page - took 44214 max,12615 min cycles per page
normal_clear_page - took 18969 max,12649 min cycles per page
repstosl_clear_page - took 19897 max,12655 min cycles per page
movq_clear_page - took 39391 max,10782 min cycles per page
movntq_clear_page - took 21612 max, 4779 min cycles per page

copy_page() tests:
....

I'm basically saying that 'microbenchmark-visible'
performance of NT stores is 200-300% higher than 'normal' stores.

BTW: cache eviction is not an intrinsic property of non-temporal
stores. It's merely how they're implemented in current CPUs:
if NT stores hit cached line, invalidate it and
push stores to bus. Else just push stores to bus
without reading cacheline from RAM first.

It is possible that some future CPU won't evict cacheline
if NT stores happened to hit it: "if NT stores hit cached line,
MODIFY it and push stores to bus".
--
vda


Attachments:
(No filename) (1.79 kB)
page_asm.tar.bz2 (5.57 kB)
Download all attachments

2005-03-24 18:40:15

by David Mosberger

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

>>>>> On Fri, 18 Mar 2005 20:28:08 +0100, Andi Kleen <[email protected]> said:

>> stores in general for clearing pages? I checked and Itanium has
>> always used non-temporal stores. So there will be no benefit for
>> us from this

Andi> That is weird. I would actually try to switch to temporal
Andi> stores, maybe it will improve some benchmarks.

That's definitely the case. See my earlier post on this topic:

http://www.gelato.unsw.edu.au/linux-ia64/0409/11012.html

Unfortunately, nobody reported any results for larger machines and/or
more interesting workloads, so the patch is in limbo at this time.
Clearly, if the CPU that's clearing the page is likely to use that
same page soon after, it'd be useful to use temporal stores.

--david

2005-03-24 18:45:30

by Christoph Lameter

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

On Thu, 24 Mar 2005, David Mosberger wrote:

> That's definitely the case. See my earlier post on this topic:
>
> http://www.gelato.unsw.edu.au/linux-ia64/0409/11012.html
>
> Unfortunately, nobody reported any results for larger machines and/or
> more interesting workloads, so the patch is in limbo at this time.
> Clearly, if the CPU that's clearing the page is likely to use that
> same page soon after, it'd be useful to use temporal stores.


So it would be useful to have

clear_page -> Temporal. Only zaps one page

and

clear_pages -> Zaps arbitrary order of page non-temporal


Rework the clear_pages patch to do just that? Maybe rename clear_pages
to clear_pages_nt?

prep_zero_page would use a temporal clear for an order 0 page but a
nontemporal clear for higher order pages.

2005-03-24 19:10:27

by David Miller

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

On Thu, 24 Mar 2005 10:41:06 -0800 (PST)
Christoph Lameter <[email protected]> wrote:

> So it would be useful to have
>
> clear_page -> Temporal. Only zaps one page
>
> and
>
> clear_pages -> Zaps arbitrary order of page non-temporal
>
>
> Rework the clear_pages patch to do just that? Maybe rename clear_pages
> clear_pages_nt?
>
> prep_zero_page would use a temporal clear for an order 0 page but a
> nontemporal clear for higher order pages.

That sounds about right to me.

Hmmm, I'm inspired to experiment with this on sparc64 a bit.
:-)

2005-03-24 22:53:16

by Christoph Lameter

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

On Thu, 24 Mar 2005, David S. Miller wrote:

> > prep_zero_page would use a temporal clear for an order 0 page but a
> > nontemporal clear for higher order pages.
>
> That sounds about right to me.
>
> Hmmm, I'm inspired to experiment with this on sparc64 a bit.

Could you help me fix up this patch replacing the old clear_pages patch?

Introduces a new function clear_cold(void *pageaddress, int order) to clear
pages of an arbitrary size with non-temporal stores. Cold clearing is typically
faster than hot clearing. Hot clearing is beneficial when the data is to be used soon.
(The hot/cold distinction also works well with the new hot- and cold-aware prezeroing daemon.)

- Use cold clearing for huge pages.
- For ia64 also make clear_page use temporal stores.
- Patch needs fixes to work properly on i386, x86_64 and sparc64.
- There may be other allocations that can benefit from the increased
performance possible for cold zeroed pages if the pages are not to be
used right away. Add __GFP_COLD to the gfp_flags for those.

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.11/mm/hugetlb.c
===================================================================
--- linux-2.6.11.orig/mm/hugetlb.c 2005-03-01 23:38:12.000000000 -0800
+++ linux-2.6.11/mm/hugetlb.c 2005-03-24 14:12:53.000000000 -0800
@@ -78,7 +78,6 @@ void free_huge_page(struct page *page)
struct page *alloc_huge_page(void)
{
struct page *page;
- int i;

spin_lock(&hugetlb_lock);
page = dequeue_huge_page();
@@ -89,8 +88,7 @@ struct page *alloc_huge_page(void)
spin_unlock(&hugetlb_lock);
set_page_count(page, 1);
page[1].mapping = (void *)free_huge_page;
- for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
- clear_highpage(&page[i]);
+ prep_zero_page(page, HUGETLB_PAGE_ORDER, GFP_HIGHUSER | __GFP_COLD);
return page;
}

Index: linux-2.6.11/mm/page_alloc.c
===================================================================
--- linux-2.6.11.orig/mm/page_alloc.c 2005-03-24 13:15:40.000000000 -0800
+++ linux-2.6.11/mm/page_alloc.c 2005-03-24 14:15:15.000000000 -0800
@@ -633,11 +633,17 @@ void fastcall free_cold_page(struct page
free_hot_cold_page(page, 1);
}

-static inline void prep_zero_page(struct page *page, int order, int gfp_flags)
+void prep_zero_page(struct page *page, int order, int gfp_flags)
{
int i;

BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
+
+#ifdef CONFIG_CLEAR_COLD
+ if ((gfp_flags & __GFP_COLD) && !PageHighmem(page))
+ clear_cold(page_address(page), order)
+ else
+#endif
for(i = 0; i < (1 << order); i++)
clear_highpage(page + i);
}
Index: linux-2.6.11/include/linux/gfp.h
===================================================================
--- linux-2.6.11.orig/include/linux/gfp.h 2005-03-01 23:37:50.000000000 -0800
+++ linux-2.6.11/include/linux/gfp.h 2005-03-24 14:12:53.000000000 -0800
@@ -131,4 +131,5 @@ extern void FASTCALL(free_cold_page(stru

void page_alloc_init(void);

+void prep_zero_page(struct page *, unsigned int order, unsigned int gfp_flags);
#endif /* __LINUX_GFP_H */
Index: linux-2.6.11/arch/ia64/Kconfig
===================================================================
--- linux-2.6.11.orig/arch/ia64/Kconfig 2005-03-01 23:38:26.000000000 -0800
+++ linux-2.6.11/arch/ia64/Kconfig 2005-03-24 14:12:53.000000000 -0800
@@ -46,6 +46,10 @@ config GENERIC_IOMAP
bool
default y

+config CLEAR_COLD
+ bool
+ default y
+
choice
prompt "System type"
default IA64_GENERIC
Index: linux-2.6.11/include/asm-ia64/page.h
===================================================================
--- linux-2.6.11.orig/include/asm-ia64/page.h 2005-03-01 23:37:48.000000000 -0800
+++ linux-2.6.11/include/asm-ia64/page.h 2005-03-24 14:12:53.000000000 -0800
@@ -57,6 +57,8 @@
# define STRICT_MM_TYPECHECKS

extern void clear_page (void *page);
+/* Clear arbitrary order page using nontemporal writes */
+extern void clear_cold (void *page, unsigned int order);
extern void copy_page (void *to, void *from);

/*
Index: linux-2.6.11/arch/ia64/kernel/ia64_ksyms.c
===================================================================
--- linux-2.6.11.orig/arch/ia64/kernel/ia64_ksyms.c 2005-03-01 23:38:08.000000000 -0800
+++ linux-2.6.11/arch/ia64/kernel/ia64_ksyms.c 2005-03-24 14:12:53.000000000 -0800
@@ -39,6 +39,7 @@ EXPORT_SYMBOL(__up);

#include <asm/page.h>
EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(clear_cold);

#ifdef CONFIG_VIRTUAL_MEM_MAP
#include <linux/bootmem.h>
Index: linux-2.6.11/arch/ia64/lib/clear_page.S
===================================================================
--- linux-2.6.11.orig/arch/ia64/lib/clear_page.S 2005-03-01 23:37:47.000000000 -0800
+++ linux-2.6.11/arch/ia64/lib/clear_page.S 2005-03-24 14:12:53.000000000 -0800
@@ -7,6 +7,8 @@
* 1/06/01 davidm Tuned for Itanium.
* 2/12/02 kchen Tuned for both Itanium and McKinley
* 3/08/02 davidm Some more tweaking
+ * 24/3/04 clameter Make clear_page use temporal stores
+ add clear_cold using nontemporal stores
*/
#include <linux/config.h>

@@ -53,6 +55,58 @@ GLOBAL_ENTRY(clear_page)
;;
#ifdef CONFIG_ITANIUM
// Optimized for Itanium
+1: stf.spill [dst1] = f0, 64
+ stf.spill [dst2] = f0, 64
+ cmp.lt p8,p0=dst_fetch, dst_last
+ ;;
+#else
+ // Optimized for McKinley
+1: stf.spill [dst1] = f0, 64
+ stf.spill [dst2] = f0, 64
+ stf.spill [dst3] = f0, 64
+ stf.spill [dst4] = f0, 128
+ cmp.lt p8,p0=dst_fetch, dst_last
+ ;;
+ stf.spill [dst1] = f0, 64
+ stf.spill [dst2] = f0, 64
+#endif
+ stf.spill [dst3] = f0, 64
+(p8) stf.spill [dst_fetch] = f0, L3_LINE_SIZE
+ br.cloop.sptk.few 1b
+ ;;
+ mov ar.lc = saved_lc // restore lc
+ br.ret.sptk.many rp
+END(clear_page)
+
+
+GLOBAL_ENTRY(clear_cold)
+ .prologue
+ .regstk 2,0,0,0
+ mov r16 = PAGE_SIZE/L3_LINE_SIZE // main loop count
+ mov totsize = PAGE_SIZE
+ .save ar.lc, saved_lc
+ mov saved_lc = ar.lc
+ ;;
+ .body
+ adds dst1 = 16, in0
+ mov ar.lc = (PREFETCH_LINES - 1)
+ mov dst_fetch = in0
+ adds dst2 = 32, in0
+ shl r16 = r16, in1
+ shl totsize = totsize, in1
+ ;;
+.fetch: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
+ adds dst3 = 48, in0 // executing this multiple times is harmless
+ br.cloop.sptk.few .fetch
+ add r16 = -1,r16
+ add dst_last = totsize, dst_fetch
+ adds dst4 = 64, in0
+ ;;
+ mov ar.lc = r16 // one L3 line per iteration
+ adds dst_last = -PREFETCH_LINES*L3_LINE_SIZE, dst_last
+ ;;
+#ifdef CONFIG_ITANIUM
+ // Optimized for Itanium
1: stf.spill.nta [dst1] = f0, 64
stf.spill.nta [dst2] = f0, 64
cmp.lt p8,p0=dst_fetch, dst_last
@@ -74,4 +128,4 @@ GLOBAL_ENTRY(clear_page)
;;
mov ar.lc = saved_lc // restore lc
br.ret.sptk.many rp
-END(clear_page)
+END(clear_cold)
Index: linux-2.6.11/arch/i386/Kconfig
===================================================================
--- linux-2.6.11.orig/arch/i386/Kconfig 2005-03-24 13:15:36.000000000 -0800
+++ linux-2.6.11/arch/i386/Kconfig 2005-03-24 14:12:53.000000000 -0800
@@ -33,6 +33,10 @@ config GENERIC_IOMAP
bool
default y

+config CLEAR_COLD
+ bool
+ default y
+
source "init/Kconfig"

menu "Processor type and features"
Index: linux-2.6.11/include/asm-i386/page.h
===================================================================
--- linux-2.6.11.orig/include/asm-i386/page.h 2005-03-01 23:37:49.000000000 -0800
+++ linux-2.6.11/include/asm-i386/page.h 2005-03-24 14:12:53.000000000 -0800
@@ -19,6 +19,7 @@
#include <asm/mmx.h>

#define clear_page(page) mmx_clear_page((void *)(page))
+#define clear_cold(page, order) mmx_clear_cold((void *)(page), order)
#define copy_page(to,from) mmx_copy_page(to,from)

#else
@@ -29,6 +30,8 @@
*/

#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
+/* Clear arbitrary order page with nontemporal stores... is memset temporal?? */
+#define clear_cold(page, order) memset((void *)(page), 0, PAGE_SIZE << order)
#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE)

#endif
Index: linux-2.6.11/include/asm-i386/mmx.h
===================================================================
--- linux-2.6.11.orig/include/asm-i386/mmx.h 2005-03-01 23:38:09.000000000 -0800
+++ linux-2.6.11/include/asm-i386/mmx.h 2005-03-24 14:12:53.000000000 -0800
@@ -9,6 +9,7 @@

extern void *_mmx_memcpy(void *to, const void *from, size_t size);
extern void mmx_clear_page(void *page);
+extern void mmx_clear_cold(void *page, unsigned int order);
extern void mmx_copy_page(void *to, void *from);

#endif
Index: linux-2.6.11/arch/i386/lib/mmx.c
===================================================================
--- linux-2.6.11.orig/arch/i386/lib/mmx.c 2005-03-01 23:38:09.000000000 -0800
+++ linux-2.6.11/arch/i386/lib/mmx.c 2005-03-24 14:12:53.000000000 -0800
@@ -397,3 +397,14 @@ void mmx_copy_page(void *to, void *from)
else
fast_copy_page(to, from);
}
+
+/* FIXME: Make this a real cold zeroing function */
+void mmx_clear_cold(void *page, int order)
+{
+ int i;
+
+ for(i=0; i < (1 << order); i++) {
+ mmx_clear_page(page);
+ page += PAGE_SIZE;
+ }
+}
Index: linux-2.6.11/arch/x86_64/Kconfig
===================================================================
--- linux-2.6.11.orig/arch/x86_64/Kconfig 2005-03-24 13:15:37.000000000 -0800
+++ linux-2.6.11/arch/x86_64/Kconfig 2005-03-24 14:12:53.000000000 -0800
@@ -78,6 +78,10 @@ config GENERIC_IOMAP
bool
default y

+config CLEAR_COLD
+ bool
+ default y
+
source "init/Kconfig"


Index: linux-2.6.11/include/asm-x86_64/page.h
===================================================================
--- linux-2.6.11.orig/include/asm-x86_64/page.h 2005-03-01 23:37:47.000000000 -0800
+++ linux-2.6.11/include/asm-x86_64/page.h 2005-03-24 14:12:53.000000000 -0800
@@ -33,6 +33,8 @@
#ifndef __ASSEMBLY__

void clear_page(void *);
+/* Clear arbitrary order page using non-temporal writes */
+void clear_cold(void *, int order);
void copy_page(void *, void *);

#define clear_user_page(page, vaddr, pg) clear_page(page)
Index: linux-2.6.11/arch/x86_64/kernel/x8664_ksyms.c
===================================================================
--- linux-2.6.11.orig/arch/x86_64/kernel/x8664_ksyms.c 2005-03-24 13:15:37.000000000 -0800
+++ linux-2.6.11/arch/x86_64/kernel/x8664_ksyms.c 2005-03-24 14:12:53.000000000 -0800
@@ -108,6 +108,7 @@ EXPORT_SYMBOL(pci_mem_start);

EXPORT_SYMBOL(copy_page);
EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(clear_cold);

EXPORT_SYMBOL(cpu_pda);
#ifdef CONFIG_SMP
Index: linux-2.6.11/arch/x86_64/lib/clear_page.S
===================================================================
--- linux-2.6.11.orig/arch/x86_64/lib/clear_page.S 2005-03-01 23:38:08.000000000 -0800
+++ linux-2.6.11/arch/x86_64/lib/clear_page.S 2005-03-24 14:12:53.000000000 -0800
@@ -48,3 +48,57 @@ clear_page_c:
ret
clear_page_c_end:
.previous
+
+
+/*
+ * Zero a page cold.
+ * rdi page
+ * rsi order
+ */
+ .globl clear_cold
+ .p2align 4
+clear_cold:
+ movl $4096/64,%eax
+ movl %esi, %ecx
+ shll %cl, %eax
+ movl %eax, %ecx
+ xorl %eax,%eax
+ .p2align 4
+.Lcloop:
+ decl %ecx
+#define PUTC(x) movq %rax,x*8(%rdi)
+ movq %rax,(%rdi)
+ PUTC(1)
+ PUTC(2)
+ PUTC(3)
+ PUTC(4)
+ PUTC(5)
+ PUTC(6)
+ PUTC(7)
+ leaq 64(%rdi),%rdi
+ jnz .Lcloop
+ nop
+ ret
+clear_cold_end:
+
+ .section .altinstructions,"a"
+ .align 8
+ .quad clear_cold
+ .quad clear_cold_c
+ .byte X86_FEATURE_K8_C
+ .byte clear_cold_end-clear_cold
+ .byte clear_cold_c_end-clear_cold_c
+ .previous
+
+ .section .altinstr_replacement,"ax"
+clear_cold_c:
+ movl $4096/8,%eax
+ movl %esi, %ecx
+ shll %cl, %eax
+ movl %eax, %ecx
+ xorl %eax,%eax
+ rep
+ stosq
+ ret
+clear_cold_c_end:
+ .previous
Index: linux-2.6.11/arch/sparc64/lib/clear_page.S
===================================================================
--- linux-2.6.11.orig/arch/sparc64/lib/clear_page.S 2005-03-01 23:38:17.000000000 -0800
+++ linux-2.6.11/arch/sparc64/lib/clear_page.S 2005-03-24 14:12:53.000000000 -0800
@@ -103,3 +103,82 @@ clear_page_common:
out: retl
nop

+ .globl clear_cold
+clear_cold: /* %o0=dest, %o1=order */
+ sethi %hi(PAGE_SIZE/64), %o2
+ clr %o4
+ or %o2, %lo(PAGE_SIZE/64), %o2
+ ba,pt %xcc, clear_cold_common
+ sllx %o2, %o1, %o1
+
+ /* This thing is pretty important, it shows up
+ * on the profiles via do_anonymous_page().
+ */
+ .align 32
+ .globl clear_cold_page
+clear_cold_user_page: /* %o0=dest, %o1=vaddr */
+ lduw [%g6 + TI_PRE_COUNT], %o2
+ sethi %uhi(PAGE_OFFSET), %g2
+ sethi %hi(PAGE_SIZE), %o4
+
+ sllx %g2, 32, %g2
+ sethi %uhi(TTE_BITS_TOP), %g3
+
+ sllx %g3, 32, %g3
+ sub %o0, %g2, %g1 ! paddr
+
+ or %g3, TTE_BITS_BOTTOM, %g3
+ and %o1, %o4, %o0 ! vaddr D-cache alias bit
+
+ or %g1, %g3, %g1 ! TTE data
+ sethi %hi(TLBTEMP_BASE), %o3
+
+ add %o2, 1, %o4
+ add %o0, %o3, %o0 ! TTE vaddr
+
+ /* Disable preemption. */
+ mov TLB_TAG_ACCESS, %g3
+ stw %o4, [%g6 + TI_PRE_COUNT]
+
+ /* Load TLB entry. */
+ rdpr %pstate, %o4
+ wrpr %o4, PSTATE_IE, %pstate
+ stxa %o0, [%g3] ASI_DMMU
+ stxa %g1, [%g0] ASI_DTLB_DATA_IN
+ flush %g6
+ wrpr %o4, 0x0, %pstate
+
+ sethi %hi(PAGE_SIZE/64), %o1
+ mov 1, %o4
+ or %o1, %lo(PAGE_SIZE/64), %o1
+
+clear_cold_common:
+ VISEntryHalf
+ membar #StoreLoad | #StoreStore | #LoadStore
+ fzero %f0
+ mov %o0, %g1 ! remember vaddr for tlbflush
+ fzero %f2
+ faddd %f0, %f2, %f4
+ fmuld %f0, %f2, %f6
+ faddd %f0, %f2, %f8
+ fmuld %f0, %f2, %f10
+
+ faddd %f0, %f2, %f12
+ fmuld %f0, %f2, %f14
+2: stda %f0, [%o0 + %g0] ASI_BLK_P
+ subcc %o1, 1, %o1
+ bne,pt %icc, 2b
+ add %o0, 0x40, %o0
+ membar #Sync
+ VISExitHalf
+
+ brz,pn %o4, outcold
+ nop
+
+ stxa %g0, [%g1] ASI_DMMU_DEMAP
+ membar #Sync
+ stw %o2, [%g6 + TI_PRE_COUNT]
+
+outcold: retl
+ nop
+
Index: linux-2.6.11/include/asm-sparc64/page.h
===================================================================
--- linux-2.6.11.orig/include/asm-sparc64/page.h 2005-03-01 23:38:07.000000000 -0800
+++ linux-2.6.11/include/asm-sparc64/page.h 2005-03-24 14:12:53.000000000 -0800
@@ -16,6 +16,8 @@

extern void _clear_page(void *page);
#define clear_page(X) _clear_page((void *)(X))
+/* Non temporal clear an arbitrary order page */
+extern void clear_cold(void *page, unsigned int order);
struct page;
extern void clear_user_page(void *addr, unsigned long vaddr, struct page *page);
#define copy_page(X,Y) memcpy((void *)(X), (void *)(Y), PAGE_SIZE)
Index: linux-2.6.11/arch/sparc64/Kconfig
===================================================================
--- linux-2.6.11.orig/arch/sparc64/Kconfig 2005-03-01 23:38:25.000000000 -0800
+++ linux-2.6.11/arch/sparc64/Kconfig 2005-03-24 14:12:53.000000000 -0800
@@ -16,6 +16,10 @@ config TIME_INTERPOLATION
bool
default y

+config CLEAR_COLD
+ bool
+ default y
+
source "init/Kconfig"

config SYSVIPC_COMPAT

2005-03-24 23:20:53

by David Miller

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

On Thu, 24 Mar 2005 14:49:55 -0800 (PST)
Christoph Lameter <[email protected]> wrote:

> On Thu, 24 Mar 2005, David S. Miller wrote:
>
> > > prep_zero_page would use a temporal clear for an order 0 page but a
> > > nontemporal clear for higher order pages.
> >
> > That sounds about right to me.
> >
> > Hmmm, I'm inspired to experiment with this on sparc64 a bit.
>
> Could you help me fix up this patch replacing the old clear_pages patch?

Sure, I'll play with it.

Meanwhile, here are some numbers. I changed just the clear_page()
implementation on sparc64 so that it used prefetching and normal
temporal stores. The machine is a uniprocessor 1.5Ghz Ultra-IIIi,
64K write-through D-cache, 64K I-cache, 1MB L2 cache. I did 4
timed 'vmlinux' builds after a fresh boot:

BEFORE:
real 9m8.720s
user 8m28.345s
sys 0m32.734s

real 9m2.034s
user 8m28.763s
sys 0m32.512s

real 9m1.848s
user 8m28.970s
sys 0m32.204s

real 9m1.701s
user 8m28.715s
sys 0m32.394s

AFTER:
real 9m2.241s
user 8m16.633s
sys 0m36.451s

real 8m53.739s
user 8m17.165s
sys 0m36.052s

real 8m54.089s
user 8m17.266s
sys 0m36.219s

real 8m54.071s
user 8m17.473s
sys 0m36.073s

So, at the very least, my results agree with D. Mosberger's on IA64.

At the cost of ~4 seconds of system time, we gain ~11 seconds of
user time.

I'm pretty much convinced this is a win. I wonder if it matters to
do something similar for copy_page*() as well.

2005-03-25 02:36:43

by David Miller

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

On Thu, 24 Mar 2005 14:49:55 -0800 (PST)
Christoph Lameter <[email protected]> wrote:

> Could you help me fix up this patch replacing the old clear_pages patch?

Ok, first you need to mark the order and gfp arguments as unsigned
for mm/page_alloc.c:prep_zero_page() so that it matches the prototype
you added to include/linux/gfp.h else the compiler warns a lot.

Next, in the same function in mm/page_alloc.c, "PageHighmem()" is typo'd, it should be
"PageHighMem()".

The clear_cold() call on the next line needs a semicolon.

Erm... were any of your test builds done with the new CONFIG_CLEAR_COLD
option enabled? :-)

Next, replace your arch/sparc64/lib/clear_page.S diff with this one and
things would be working and we'll be using the proper temporal vs.
non-temporal stores on that platform.

===== arch/sparc64/lib/clear_page.S 1.1 vs edited =====
--- 1.1/arch/sparc64/lib/clear_page.S 2004-08-08 19:54:07 -07:00
+++ edited/arch/sparc64/lib/clear_page.S 2005-03-24 15:56:33 -08:00
@@ -72,26 +72,34 @@
mov 1, %o4

clear_page_common:
- VISEntryHalf
membar #StoreLoad | #StoreStore | #LoadStore
- fzero %f0
sethi %hi(PAGE_SIZE/64), %o1
mov %o0, %g1 ! remember vaddr for tlbflush
- fzero %f2
or %o1, %lo(PAGE_SIZE/64), %o1
- faddd %f0, %f2, %f4
- fmuld %f0, %f2, %f6
- faddd %f0, %f2, %f8
- fmuld %f0, %f2, %f10

- faddd %f0, %f2, %f12
- fmuld %f0, %f2, %f14
-1: stda %f0, [%o0 + %g0] ASI_BLK_P
+#define PREFETCH(x, y) prefetch x, y
+#define PREFETCH_CODE 2
+
+ PREFETCH([%o0 + 0x000], PREFETCH_CODE)
+ PREFETCH([%o0 + 0x040], PREFETCH_CODE)
+ PREFETCH([%o0 + 0x080], PREFETCH_CODE)
+ PREFETCH([%o0 + 0x0c0], PREFETCH_CODE)
+ PREFETCH([%o0 + 0x100], PREFETCH_CODE)
+ PREFETCH([%o0 + 0x140], PREFETCH_CODE)
+ PREFETCH([%o0 + 0x180], PREFETCH_CODE)
+1:
+ stx %g0, [%o0 + 0x00]
+ stx %g0, [%o0 + 0x08]
+ stx %g0, [%o0 + 0x10]
+ stx %g0, [%o0 + 0x18]
+ stx %g0, [%o0 + 0x20]
+ stx %g0, [%o0 + 0x28]
+ stx %g0, [%o0 + 0x30]
+ stx %g0, [%o0 + 0x38]
+ PREFETCH([%o0 + 0x1c0], PREFETCH_CODE)
subcc %o1, 1, %o1
bne,pt %icc, 1b
add %o0, 0x40, %o0
- membar #Sync
- VISExitHalf

brz,pn %o4, out
nop
@@ -101,5 +109,32 @@
stw %o2, [%g6 + TI_PRE_COUNT]

out: retl
+ nop
+
+ .globl clear_cold
+clear_cold: /* %o0=dest, %o1=order */
+ sethi %hi(PAGE_SIZE/64), %o2
+ clr %o4
+ or %o2, %lo(PAGE_SIZE/64), %o2
+ sllx %o2, %o1, %o1
+ VISEntryHalf
+ membar #StoreLoad | #StoreStore | #LoadStore
+ fzero %f0
+ fzero %f2
+ faddd %f0, %f2, %f4
+ fmuld %f0, %f2, %f6
+ faddd %f0, %f2, %f8
+ fmuld %f0, %f2, %f10
+
+ faddd %f0, %f2, %f12
+ fmuld %f0, %f2, %f14
+2: stda %f0, [%o0 + %g0] ASI_BLK_P
+ subcc %o1, 1, %o1
+ bne,pt %icc, 2b
+ add %o0, 0x40, %o0
+ membar #Sync
+ VISExitHalf
+
+ retl
nop


2005-03-25 02:50:23

by Christoph Lameter

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

On Thu, 24 Mar 2005, David S. Miller wrote:

> Erm... were any of your test builds done with the new CONFIG_CLEAR_COLD
> option enabled? :-)

These were all fixed but I failed to do a "quilt refresh" .... sigh... The
email issues are also fixed now .... sigh. What a day.

> Next, replace your arch/sparc64/lib/clear_page.S diff with this one and
> things would be working and we'll be using the proper temporal vs.
> non-temporal stores on that platform.

Thanks.

Here is the patch with your changes and a "quilt refresh" ;-)

---------------------------------------------------------------------
Introduces a new function clear_cold(void *pageaddress, int order) to clear
pages of an arbitrary size with non temporal stores. Cold clearing is typically
faster than hot clearing. Hot clearing is beneficial when the data is to be used soon.
(Will also work well with the new hot and cold aware prezeroing daemon)

Use cold clearing for huge pages.

For ia64 also make clear_page use temporal stores.

Patch needs fixes to work properly on i386 and x86_64.

Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.11/mm/hugetlb.c
===================================================================
--- linux-2.6.11.orig/mm/hugetlb.c 2005-03-01 23:38:12.000000000 -0800
+++ linux-2.6.11/mm/hugetlb.c 2005-03-24 14:12:53.000000000 -0800
@@ -78,7 +78,6 @@ void free_huge_page(struct page *page)
struct page *alloc_huge_page(void)
{
struct page *page;
- int i;

spin_lock(&hugetlb_lock);
page = dequeue_huge_page();
@@ -89,8 +88,7 @@ struct page *alloc_huge_page(void)
spin_unlock(&hugetlb_lock);
set_page_count(page, 1);
page[1].mapping = (void *)free_huge_page;
- for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
- clear_highpage(&page[i]);
+ prep_zero_page(page, HUGETLB_PAGE_ORDER, GFP_HIGHUSER | __GFP_COLD);
return page;
}

Index: linux-2.6.11/mm/page_alloc.c
===================================================================
--- linux-2.6.11.orig/mm/page_alloc.c 2005-03-24 13:15:40.000000000 -0800
+++ linux-2.6.11/mm/page_alloc.c 2005-03-24 18:39:22.000000000 -0800
@@ -633,11 +633,17 @@ void fastcall free_cold_page(struct page
free_hot_cold_page(page, 1);
}

-static inline void prep_zero_page(struct page *page, int order, int gfp_flags)
+void prep_zero_page(struct page *page, unsigned int order, int gfp_flags)
{
int i;

BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
+
+#ifdef CONFIG_CLEAR_COLD
+ if ((gfp_flags & __GFP_COLD) && !PageHighMem(page))
+ clear_cold(page_address(page), order);
+ else
+#endif
for(i = 0; i < (1 << order); i++)
clear_highpage(page + i);
}
Index: linux-2.6.11/include/linux/gfp.h
===================================================================
--- linux-2.6.11.orig/include/linux/gfp.h 2005-03-01 23:37:50.000000000 -0800
+++ linux-2.6.11/include/linux/gfp.h 2005-03-24 14:16:44.000000000 -0800
@@ -131,4 +131,5 @@ extern void FASTCALL(free_cold_page(stru

void page_alloc_init(void);

+void prep_zero_page(struct page *, unsigned int order, int gfp_flags);
#endif /* __LINUX_GFP_H */
Index: linux-2.6.11/arch/ia64/Kconfig
===================================================================
--- linux-2.6.11.orig/arch/ia64/Kconfig 2005-03-01 23:38:26.000000000 -0800
+++ linux-2.6.11/arch/ia64/Kconfig 2005-03-24 14:12:53.000000000 -0800
@@ -46,6 +46,10 @@ config GENERIC_IOMAP
bool
default y

+config CLEAR_COLD
+ bool
+ default y
+
choice
prompt "System type"
default IA64_GENERIC
Index: linux-2.6.11/include/asm-ia64/page.h
===================================================================
--- linux-2.6.11.orig/include/asm-ia64/page.h 2005-03-01 23:37:48.000000000 -0800
+++ linux-2.6.11/include/asm-ia64/page.h 2005-03-24 14:12:53.000000000 -0800
@@ -57,6 +57,8 @@
# define STRICT_MM_TYPECHECKS

extern void clear_page (void *page);
+/* Clear arbitrary order page using nontemporal writes */
+extern void clear_cold (void *page, unsigned int order);
extern void copy_page (void *to, void *from);

/*
Index: linux-2.6.11/arch/ia64/kernel/ia64_ksyms.c
===================================================================
--- linux-2.6.11.orig/arch/ia64/kernel/ia64_ksyms.c 2005-03-01 23:38:08.000000000 -0800
+++ linux-2.6.11/arch/ia64/kernel/ia64_ksyms.c 2005-03-24 14:12:53.000000000 -0800
@@ -39,6 +39,7 @@ EXPORT_SYMBOL(__up);

#include <asm/page.h>
EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(clear_cold);

#ifdef CONFIG_VIRTUAL_MEM_MAP
#include <linux/bootmem.h>
Index: linux-2.6.11/arch/ia64/lib/clear_page.S
===================================================================
--- linux-2.6.11.orig/arch/ia64/lib/clear_page.S 2005-03-01 23:37:47.000000000 -0800
+++ linux-2.6.11/arch/ia64/lib/clear_page.S 2005-03-24 14:24:29.000000000 -0800
@@ -7,6 +7,8 @@
* 1/06/01 davidm Tuned for Itanium.
* 2/12/02 kchen Tuned for both Itanium and McKinley
* 3/08/02 davidm Some more tweaking
+ * 24/3/04 clameter Make clear_page use temporal stores
+ add clear_cold using nontemporal stores
*/
#include <linux/config.h>

@@ -53,6 +55,59 @@ GLOBAL_ENTRY(clear_page)
;;
#ifdef CONFIG_ITANIUM
// Optimized for Itanium
+1: stf.spill [dst1] = f0, 64
+ stf.spill [dst2] = f0, 64
+ cmp.lt p8,p0=dst_fetch, dst_last
+ ;;
+#else
+ // Optimized for McKinley
+1: stf.spill [dst1] = f0, 64
+ stf.spill [dst2] = f0, 64
+ stf.spill [dst3] = f0, 64
+ stf.spill [dst4] = f0, 128
+ cmp.lt p8,p0=dst_fetch, dst_last
+ ;;
+ stf.spill [dst1] = f0, 64
+ stf.spill [dst2] = f0, 64
+#endif
+ stf.spill [dst3] = f0, 64
+(p8) stf.spill [dst_fetch] = f0, L3_LINE_SIZE
+ br.cloop.sptk.few 1b
+ ;;
+ mov ar.lc = saved_lc // restore lc
+ br.ret.sptk.many rp
+END(clear_page)
+
+#define totsize r14
+
+GLOBAL_ENTRY(clear_cold)
+ .prologue
+ .regstk 2,0,0,0
+ mov r16 = PAGE_SIZE/L3_LINE_SIZE // main loop count
+ mov totsize = PAGE_SIZE
+ .save ar.lc, saved_lc
+ mov saved_lc = ar.lc
+ ;;
+ .body
+ adds dst1 = 16, in0
+ mov ar.lc = (PREFETCH_LINES - 1)
+ mov dst_fetch = in0
+ adds dst2 = 32, in0
+ shl r16 = r16, in1
+ shl totsize = totsize, in1
+ ;;
+.fetc: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
+ adds dst3 = 48, in0 // executing this multiple times is harmless
+ br.cloop.sptk.few .fetc
+ add r16 = -1,r16
+ add dst_last = totsize, dst_fetch
+ adds dst4 = 64, in0
+ ;;
+ mov ar.lc = r16 // one L3 line per iteration
+ adds dst_last = -PREFETCH_LINES*L3_LINE_SIZE, dst_last
+ ;;
+#ifdef CONFIG_ITANIUM
+ // Optimized for Itanium
1: stf.spill.nta [dst1] = f0, 64
stf.spill.nta [dst2] = f0, 64
cmp.lt p8,p0=dst_fetch, dst_last
@@ -74,4 +129,4 @@ GLOBAL_ENTRY(clear_page)
;;
mov ar.lc = saved_lc // restore lc
br.ret.sptk.many rp
-END(clear_page)
+END(clear_cold)
Index: linux-2.6.11/arch/i386/Kconfig
===================================================================
--- linux-2.6.11.orig/arch/i386/Kconfig 2005-03-24 13:15:36.000000000 -0800
+++ linux-2.6.11/arch/i386/Kconfig 2005-03-24 14:12:53.000000000 -0800
@@ -33,6 +33,10 @@ config GENERIC_IOMAP
bool
default y

+config CLEAR_COLD
+ bool
+ default y
+
source "init/Kconfig"

menu "Processor type and features"
Index: linux-2.6.11/include/asm-i386/page.h
===================================================================
--- linux-2.6.11.orig/include/asm-i386/page.h 2005-03-01 23:37:49.000000000 -0800
+++ linux-2.6.11/include/asm-i386/page.h 2005-03-24 14:12:53.000000000 -0800
@@ -19,6 +19,7 @@
#include <asm/mmx.h>

#define clear_page(page) mmx_clear_page((void *)(page))
+#define clear_cold(page, order) mmx_clear_cold((void *)(page), order)
#define copy_page(to,from) mmx_copy_page(to,from)

#else
@@ -29,6 +30,8 @@
*/

#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE)
+/* Clear arbitrary order page with nontemporal stores... is memset temporal?? */
+#define clear_cold(page, order) memset((void *)(page), 0, PAGE_SIZE << order)
#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE)

#endif
Index: linux-2.6.11/include/asm-i386/mmx.h
===================================================================
--- linux-2.6.11.orig/include/asm-i386/mmx.h 2005-03-01 23:38:09.000000000 -0800
+++ linux-2.6.11/include/asm-i386/mmx.h 2005-03-24 14:12:53.000000000 -0800
@@ -9,6 +9,7 @@

extern void *_mmx_memcpy(void *to, const void *from, size_t size);
extern void mmx_clear_page(void *page);
+extern void mmx_clear_cold(void *page, unsigned int order);
extern void mmx_copy_page(void *to, void *from);

#endif
Index: linux-2.6.11/arch/i386/lib/mmx.c
===================================================================
--- linux-2.6.11.orig/arch/i386/lib/mmx.c 2005-03-01 23:38:09.000000000 -0800
+++ linux-2.6.11/arch/i386/lib/mmx.c 2005-03-24 14:12:53.000000000 -0800
@@ -397,3 +397,14 @@ void mmx_copy_page(void *to, void *from)
else
fast_copy_page(to, from);
}
+
+/* FIXME: Make this a real cold zeroing function */
+void mmx_clear_cold(void *page, int order)
+{
+ int i;
+
+ for(i=0; i < (1 << order); i++) {
+ mmx_clear_page(page);
+ page += PAGE_SIZE;
+ }
+}
Index: linux-2.6.11/arch/x86_64/Kconfig
===================================================================
--- linux-2.6.11.orig/arch/x86_64/Kconfig 2005-03-24 13:15:37.000000000 -0800
+++ linux-2.6.11/arch/x86_64/Kconfig 2005-03-24 14:12:53.000000000 -0800
@@ -78,6 +78,10 @@ config GENERIC_IOMAP
bool
default y

+config CLEAR_COLD
+ bool
+ default y
+
source "init/Kconfig"


Index: linux-2.6.11/include/asm-x86_64/page.h
===================================================================
--- linux-2.6.11.orig/include/asm-x86_64/page.h 2005-03-01 23:37:47.000000000 -0800
+++ linux-2.6.11/include/asm-x86_64/page.h 2005-03-24 14:12:53.000000000 -0800
@@ -33,6 +33,8 @@
#ifndef __ASSEMBLY__

void clear_page(void *);
+/* Clear arbitrary order page using non-temporal writes */
+void clear_cold(void *, int order);
void copy_page(void *, void *);

#define clear_user_page(page, vaddr, pg) clear_page(page)
Index: linux-2.6.11/arch/x86_64/kernel/x8664_ksyms.c
===================================================================
--- linux-2.6.11.orig/arch/x86_64/kernel/x8664_ksyms.c 2005-03-24 13:15:37.000000000 -0800
+++ linux-2.6.11/arch/x86_64/kernel/x8664_ksyms.c 2005-03-24 14:12:53.000000000 -0800
@@ -108,6 +108,7 @@ EXPORT_SYMBOL(pci_mem_start);

EXPORT_SYMBOL(copy_page);
EXPORT_SYMBOL(clear_page);
+EXPORT_SYMBOL(clear_cold);

EXPORT_SYMBOL(cpu_pda);
#ifdef CONFIG_SMP
Index: linux-2.6.11/arch/x86_64/lib/clear_page.S
===================================================================
--- linux-2.6.11.orig/arch/x86_64/lib/clear_page.S 2005-03-01 23:38:08.000000000 -0800
+++ linux-2.6.11/arch/x86_64/lib/clear_page.S 2005-03-24 14:12:53.000000000 -0800
@@ -48,3 +48,57 @@ clear_page_c:
ret
clear_page_c_end:
.previous
+
+
+/*
+ * Zero a page cold.
+ * rdi page
+ * rsi order
+ */
+ .globl clear_cold
+ .p2align 4
+clear_cold:
+ movl $4096/64,%eax
+ movl %esi, %ecx
+ shll %cl, %eax
+ movl %eax, %ecx
+ xorl %eax,%eax
+ .p2align 4
+.Lcloop:
+ decl %ecx
+#define PUTC(x) movq %rax,x*8(%rdi)
+ movq %rax,(%rdi)
+ PUTC(1)
+ PUTC(2)
+ PUTC(3)
+ PUTC(4)
+ PUTC(5)
+ PUTC(6)
+ PUTC(7)
+ leaq 64(%rdi),%rdi
+ jnz .Lcloop
+ nop
+ ret
+clear_cold_end:
+
+ .section .altinstructions,"a"
+ .align 8
+ .quad clear_cold
+ .quad clear_cold_c
+ .byte X86_FEATURE_K8_C
+ .byte clear_cold_end-clear_cold
+ .byte clear_cold_c_end-clear_cold_c
+ .previous
+
+ .section .altinstr_replacement,"ax"
+clear_cold_c:
+ movl $4096/8,%eax
+ movl %esi, %ecx
+ shll %cl, %eax
+ movl %eax, %ecx
+ xorl %eax,%eax
+ rep
+ stosq
+ ret
+clear_cold_c_end:
+ .previous
Index: linux-2.6.11/arch/sparc64/lib/clear_page.S
===================================================================
--- linux-2.6.11.orig/arch/sparc64/lib/clear_page.S 2005-03-01 23:38:17.000000000 -0800
+++ linux-2.6.11/arch/sparc64/lib/clear_page.S 2005-03-24 18:39:44.000000000 -0800
@@ -72,13 +72,127 @@ clear_user_page: /* %o0=dest, %o1=vaddr
mov 1, %o4

clear_page_common:
+ membar #StoreLoad | #StoreStore | #LoadStore
+ sethi %hi(PAGE_SIZE/64), %o1
+ mov %o0, %g1 ! remember vaddr for tlbflush
+ or %o1, %lo(PAGE_SIZE/64), %o1
+
+#define PREFETCH(x, y) prefetch x, y
+#define PREFETCH_CODE 2
+
+ PREFETCH([%o0 + 0x000], PREFETCH_CODE)
+ PREFETCH([%o0 + 0x040], PREFETCH_CODE)
+ PREFETCH([%o0 + 0x080], PREFETCH_CODE)
+ PREFETCH([%o0 + 0x0c0], PREFETCH_CODE)
+ PREFETCH([%o0 + 0x100], PREFETCH_CODE)
+ PREFETCH([%o0 + 0x140], PREFETCH_CODE)
+ PREFETCH([%o0 + 0x180], PREFETCH_CODE)
+1:
+ stx %g0, [%o0 + 0x00]
+ stx %g0, [%o0 + 0x08]
+ stx %g0, [%o0 + 0x10]
+ stx %g0, [%o0 + 0x18]
+ stx %g0, [%o0 + 0x20]
+ stx %g0, [%o0 + 0x28]
+ stx %g0, [%o0 + 0x30]
+ stx %g0, [%o0 + 0x38]
+ PREFETCH([%o0 + 0x1c0], PREFETCH_CODE)
+ subcc %o1, 1, %o1
+ bne,pt %icc, 1b
+ add %o0, 0x40, %o0
+
+ brz,pn %o4, out
+ nop
+
+ stxa %g0, [%g1] ASI_DMMU_DEMAP
+ membar #Sync
+ stw %o2, [%g6 + TI_PRE_COUNT]
+
+out: retl
+ nop
+
+ .globl clear_cold
+clear_cold: /* %o0=dest, %o1=order */
+ sethi %hi(PAGE_SIZE/64), %o2
+ clr %o4
+ or %o2, %lo(PAGE_SIZE/64), %o2
+ sllx %o2, %o1, %o1
VISEntryHalf
membar #StoreLoad | #StoreStore | #LoadStore
fzero %f0
+ fzero %f2
+ faddd %f0, %f2, %f4
+ fmuld %f0, %f2, %f6
+ faddd %f0, %f2, %f8
+ fmuld %f0, %f2, %f10
+
+ faddd %f0, %f2, %f12
+ fmuld %f0, %f2, %f14
+2: stda %f0, [%o0 + %g0] ASI_BLK_P
+ subcc %o1, 1, %o1
+ bne,pt %icc, 2b
+ add %o0, 0x40, %o0
+ membar #Sync
+ VISExitHalf
+
+ retl
+ nop
+
+ .globl clear_cold
+clear_cold: /* %o0=dest, %o1=order */
+ sethi %hi(PAGE_SIZE/64), %o2
+ clr %o4
+ or %o2, %lo(PAGE_SIZE/64), %o2
+ ba,pt %xcc, clear_cold_common
+ sllx %o2, %o1, %o1
+
+ /* This thing is pretty important, it shows up
+ * on the profiles via do_anonymous_page().
+ */
+ .align 32
+ .globl clear_cold_page
+clear_cold_user_page: /* %o0=dest, %o1=vaddr */
+ lduw [%g6 + TI_PRE_COUNT], %o2
+ sethi %uhi(PAGE_OFFSET), %g2
+ sethi %hi(PAGE_SIZE), %o4
+
+ sllx %g2, 32, %g2
+ sethi %uhi(TTE_BITS_TOP), %g3
+
+ sllx %g3, 32, %g3
+ sub %o0, %g2, %g1 ! paddr
+
+ or %g3, TTE_BITS_BOTTOM, %g3
+ and %o1, %o4, %o0 ! vaddr D-cache alias bit
+
+ or %g1, %g3, %g1 ! TTE data
+ sethi %hi(TLBTEMP_BASE), %o3
+
+ add %o2, 1, %o4
+ add %o0, %o3, %o0 ! TTE vaddr
+
+ /* Disable preemption. */
+ mov TLB_TAG_ACCESS, %g3
+ stw %o4, [%g6 + TI_PRE_COUNT]
+
+ /* Load TLB entry. */
+ rdpr %pstate, %o4
+ wrpr %o4, PSTATE_IE, %pstate
+ stxa %o0, [%g3] ASI_DMMU
+ stxa %g1, [%g0] ASI_DTLB_DATA_IN
+ flush %g6
+ wrpr %o4, 0x0, %pstate
+
sethi %hi(PAGE_SIZE/64), %o1
+ mov 1, %o4
+ or %o1, %lo(PAGE_SIZE/64), %o1
+
+clear_cold_common:
+ VISEntryHalf
+ membar #StoreLoad | #StoreStore | #LoadStore
+ fzero %f0
mov %o0, %g1 ! remember vaddr for tlbflush
fzero %f2
- or %o1, %lo(PAGE_SIZE/64), %o1
faddd %f0, %f2, %f4
fmuld %f0, %f2, %f6
faddd %f0, %f2, %f8
@@ -86,20 +200,20 @@ clear_page_common:

faddd %f0, %f2, %f12
fmuld %f0, %f2, %f14
-1: stda %f0, [%o0 + %g0] ASI_BLK_P
+2: stda %f0, [%o0 + %g0] ASI_BLK_P
subcc %o1, 1, %o1
- bne,pt %icc, 1b
+ bne,pt %icc, 2b
add %o0, 0x40, %o0
membar #Sync
VISExitHalf

- brz,pn %o4, out
+ brz,pn %o4, outcold
nop

stxa %g0, [%g1] ASI_DMMU_DEMAP
membar #Sync
stw %o2, [%g6 + TI_PRE_COUNT]

-out: retl
+outcold: retl
nop

Index: linux-2.6.11/include/asm-sparc64/page.h
===================================================================
--- linux-2.6.11.orig/include/asm-sparc64/page.h 2005-03-01 23:38:07.000000000 -0800
+++ linux-2.6.11/include/asm-sparc64/page.h 2005-03-24 14:12:53.000000000 -0800
@@ -16,6 +16,8 @@

extern void _clear_page(void *page);
#define clear_page(X) _clear_page((void *)(X))
+/* Non temporal clear an arbitrary order page */
+extern void clear_cold(void *page, unsigned int order);
struct page;
extern void clear_user_page(void *addr, unsigned long vaddr, struct page *page);
#define copy_page(X,Y) memcpy((void *)(X), (void *)(Y), PAGE_SIZE)
Index: linux-2.6.11/arch/sparc64/Kconfig
===================================================================
--- linux-2.6.11.orig/arch/sparc64/Kconfig 2005-03-01 23:38:25.000000000 -0800
+++ linux-2.6.11/arch/sparc64/Kconfig 2005-03-24 14:12:53.000000000 -0800
@@ -16,6 +16,10 @@ config TIME_INTERPOLATION
bool
default y

+config CLEAR_COLD
+ bool
+ default y
+
source "init/Kconfig"

config SYSVIPC_COMPAT

2005-03-27 17:12:36

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

> Clearly, if the CPU that's clearing the page is likely to use that
> same page soon after, it'd be useful to use temporal stores.

That is always the case in the current code (without Christoph's
pre-cleaning daemon). The page fault handler clears and user space
is guaranteed to need at least one cacheline from the fresh page
because it just did a page fault on it. With non temporal stores
you guarantee at least one hard cache miss directly after
the return to user space.

I suspect even with precleaning the average time from cleaning to use will be
quite short.

-Andi

2005-03-27 18:32:16

by David Miller

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

On 27 Mar 2005 19:12:20 +0200
Andi Kleen <[email protected]> wrote:

> With non temporal stores
> you guarantee at least one hard cache miss directly after
> the return to user space.

This is true if the cacheline were not present already at
the time of the non-temporal store.

I know what you're trying to say, I'm just clarifying.

The real question is if a large enough ratio of those
cachelines in the page get similarly accessed. I happen
to think the answer to that for any real example is yes.
Yet, I have no way to prove this.

It would be cool to do some hacks under Xen or user-mode
Linux to get some real statistics about this. Actually,
this could be done also with hacks to valgrind or other
similar tools. QEMU could also be used.

2005-03-29 01:59:52

by Christoph Lameter

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

On Sun, 27 Mar 2005, Andi Kleen wrote:

> > Clearly, if the CPU that's clearing the page is likely to use that
> > same page soon after, it'd be useful to use temporal stores.
>
> That is always the case in the current code (without Christoph's
> pre-cleaning daemon). The page fault handler clears and user space
> is guaranteed to need at least one cacheline from the fresh page
> because it just did a page fault on it. With non temporal stores
> you guarantee at least one hard cache miss directly after
> the return to user space.

It is not the case that *all* the cachelines of a page are going to be
used right after zeroing. For the page fault case it is only guaranteed that
*one* cacheline will be used. In the PTE/PMD/PUD page allocation cases it
is likely that only a single cacheline is used.

There are some cases in the code (apart from the fault handler)
where zeroed pages are allocated with no guarantee of use (f.e. the
allocations for buffers for shared memory or pipes).

> I suspect even with precleaning the average time from cleaning to use will be
> quite short.

If the time is short then hot cleaning is the right way to go and then
prezeroing is of no benefit. Prezeroing can only be of benefit if there is
sufficient time between the zeroing and the use of the data. It must be
sufficiently long to cause the cachelines to no longer be in
the caches. Then the loading of these cachelines may be avoided which
yields the performance benefit.

2005-04-06 00:21:31

by Christoph Lameter

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

On Thu, 24 Mar 2005, David Mosberger wrote:

> That's definitely the case. See my earlier post on this topic:
>
> http://www.gelato.unsw.edu.au/linux-ia64/0409/11012.html
>
> Unfortunately, nobody reported any results for larger machines and/or
> more interesting workloads, so the patch is in limbo at this time.
> Clearly, if the CPU that's clearing the page is likely to use that
> same page soon after, it'd be useful to use temporal stores.

Here are some numbers using lmbench of temporal writes vs. non temporal
writes on ia64 (8p machine but lmbench run only for one load). There seems
to be some benefit for fork/exec but overall this does not seem to be a
clear win. I suspect that the distinction between temporal vs. nontemporal
writes is more beneficial on machines with smaller pagesizes since
the likelihood that most cachelines of a page are used soon is increased
and therefore hot zeroing is more beneficial.


L M B E N C H 3 . 0 S U M M A R Y
------------------------------------
(Alpha software, do not distribute)

Basic system parameters
-------------------------------------------------------------------------------------------
Host OS Description Mhz tlb cache mem scal
pages line par load
bytes
--------- ------------------------- ----------------------- ---- ----- ----- ------ ----
margin Linux 2.6.12-rc1-bk3 ia64-linux-gnu 1300 128 1
margin Linux 2.6.12-rc1-bk3 ia64-linux-gnu 1300 128 1
margin Linux 2.6.12-rc1-bk3 ia64-linux-gnu 1300 128 1
margin Linux 2.6.12-rc1-bk3 ia64-linux-gnu 1300 128 1
margin Linux 2.6.12-rc1-bk3 ia64-linux-gnu 1300 128 1
margin Linux 2.6.12-rc1-bk3 ia64-linux-gnu 1300 128 1
margin Linux 2.6.12-rc1-bk3 ia64-linux-gnu 1300 128 1
margin Linux 2.6.12-rc1-bk3-dm ia64-linux-gnu 1300 128 1
margin Linux 2.6.12-rc1-bk3-dm ia64-linux-gnu 1300 128 1
margin Linux 2.6.12-rc1-bk3-dm ia64-linux-gnu 1300 128 1

Processor, Processes - times in microseconds - smaller is better
------------------------------------------------------------------------------------------
Host OS Mhz null null open slct sig sig fork exec sh
call I/O stat clos TCP inst hndl proc proc proc
--------- ------------------------- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ----
margin Linux 2.6.12-rc1-bk3 1300 0.04 0.26 4.90 6.11 15.7 0.39 2.43 528. 1926 4853
margin Linux 2.6.12-rc1-bk3 1300 0.04 0.27 4.86 6.10 15.7 0.39 2.45 522. 1910 4260
margin Linux 2.6.12-rc1-bk3 1300 0.04 0.26 4.85 6.10 15.8 0.39 2.40 526. 1916 4429
margin Linux 2.6.12-rc1-bk3 1300 0.04 0.26 4.84 6.11 15.7 0.39 2.40 531. 1838 4429
margin Linux 2.6.12-rc1-bk3 1300 0.04 0.26 4.85 6.11 15.8 0.39 2.47 553. 1931 5118
margin Linux 2.6.12-rc1-bk3 1300 0.04 0.26 5.09 6.37 15.7 0.39 2.40 537. 1934 5133
margin Linux 2.6.12-rc1-bk3 1300 0.04 0.26 5.09 6.35 15.8 0.39 2.40 555. 1939 5389
margin Linux 2.6.12-rc1-bk3-dm 1300 0.04 0.26 4.88 6.10 15.8 0.39 2.42 519. 1829 4787
margin Linux 2.6.12-rc1-bk3-dm 1300 0.04 0.26 4.87 6.09 15.8 0.39 2.40 516. 1830 5057
margin Linux 2.6.12-rc1-bk3-dm 1300 0.04 0.27 4.86 6.10 15.8 0.39 2.40 512. 1878 5166

Context switching - times in microseconds - smaller is better
-------------------------------------------------------------------------------------
Host OS 2p/0K 2p/16K 2p/64K 8p/16K 8p/64K 16p/16K 16p/64K
ctxsw ctxsw ctxsw ctxsw ctxsw ctxsw ctxsw
--------- ------------------------- ------ ------ ------ ------ ------ ------- -------
margin Linux 2.6.12-rc1-bk3 7.3300 2.7400 7.0400 4.4600 6.6200 3.94000 8.38000
margin Linux 2.6.12-rc1-bk3 7.6100 8.1000 7.3200 4.5900 7.1700 5.50000 7.84000
margin Linux 2.6.12-rc1-bk3 7.2400 8.0000 7.2100 4.3800 6.7500 4.77000 7.37000
margin Linux 2.6.12-rc1-bk3 7.4100 8.0400 7.0500 4.5100 7.2500 4.11000 7.03000
margin Linux 2.6.12-rc1-bk3 7.2600 8.2100 7.2400 4.6500 6.6500 4.08000 7.81000
margin Linux 2.6.12-rc1-bk3 7.4600 7.9000 7.3800 4.3800 6.6200 4.83000 7.27000
margin Linux 2.6.12-rc1-bk3 7.4400 8.2000 7.2000 5.8700 6.8000 4.86000 7.95000
margin Linux 2.6.12-rc1-bk3-dm 7.4400 8.3100 7.1300 5.6900 6.6500 5.49000 7.49000
margin Linux 2.6.12-rc1-bk3-dm 2.1300 8.0100 7.3800 4.6700 6.5500 4.22000 8.16000
margin Linux 2.6.12-rc1-bk3-dm 7.4900 8.1200 2.1500 4.3600 6.6900 5.54000 7.38000

*Local* Communication latencies in microseconds - smaller is better
---------------------------------------------------------------------------------
Host OS 2p/0K Pipe AF UDP RPC/ TCP RPC/ TCP
ctxsw UNIX UDP TCP conn
--------- ------------------------- ----- ----- ---- ----- ----- ----- ----- ----
margin Linux 2.6.12-rc1-bk3 7.330 16.9 24.8 29.6 36.0 31.4 49.5 52.
margin Linux 2.6.12-rc1-bk3 7.610 17.4 22.0 31.5 52.
margin Linux 2.6.12-rc1-bk3 7.240 17.5 21.6 31.3 53.
margin Linux 2.6.12-rc1-bk3 7.410 17.6 11.8 31.2 51.
margin Linux 2.6.12-rc1-bk3 7.260 17.1 20.6 28.2 37.6 51.0 99.7 92.
margin Linux 2.6.12-rc1-bk3 7.460 17.0 21.0 30.2 69.5 35.3 77.4 52.
margin Linux 2.6.12-rc1-bk3 7.440 39.7 19.8 29.1 65.3 34.3 44.8 53.
margin Linux 2.6.12-rc1-bk3-dm 7.440 17.4 20.5 29.4 37.0 34.3 86.7 77.
margin Linux 2.6.12-rc1-bk3-dm 2.130 17.8 20.6 28.7 37.2 31.8 44.9 77.
margin Linux 2.6.12-rc1-bk3-dm 7.490 17.5 11.3 29.0 37.4 77.1 46.1 53.

File & VM system latencies in microseconds - smaller is better
-------------------------------------------------------------------------------------------
Host OS 0K File 10K File Mmap Prot Page 100fd
Create Delete Create Delete Latency Fault Fault selct
--------- ------------------------- ------ ------ ------ ------ ------- ----- ------- -----
margin Linux 2.6.12-rc1-bk3 340.0 0.162 1.26430 10.6
margin Linux 2.6.12-rc1-bk3 339.0 0.176 1.26310 10.5
margin Linux 2.6.12-rc1-bk3 342.0 0.180 1.25700 10.5
margin Linux 2.6.12-rc1-bk3 341.0 0.207 1.25640 10.5
margin Linux 2.6.12-rc1-bk3 339.0 0.166 1.26310 10.6
margin Linux 2.6.12-rc1-bk3 343.0 0.159 1.26350 10.6
margin Linux 2.6.12-rc1-bk3 339.0 0.174 1.25660 10.6
margin Linux 2.6.12-rc1-bk3-dm 340.0 0.185 1.26090 10.6
margin Linux 2.6.12-rc1-bk3-dm 340.0 0.128 1.26310 10.5
margin Linux 2.6.12-rc1-bk3-dm 343.0 0.159 1.25960 10.5

*Local* Communication bandwidths in MB/s - bigger is better
-----------------------------------------------------------------------------------------
Host OS Pipe AF TCP File Mmap Bcopy Bcopy Mem Mem
UNIX reread reread (libc) (hand) read write
--------- ------------------------- ---- ---- ---- ------ ------ ------ ------ ---- -----
margin Linux 2.6.12-rc1-bk3 1172 1826 562. 1732.6 573.5 535.7 284.9 521. 514.7
margin Linux 2.6.12-rc1-bk3 1169 1883 868. 1733.5 573.8 535.2 283.9 521. 514.6
margin Linux 2.6.12-rc1-bk3 1149 1897 654. 1725.5 573.6 535.1 285.2 521. 514.7
margin Linux 2.6.12-rc1-bk3 1167 1883 921. 1726.1 573.8 534.9 283.1 521. 514.7
margin Linux 2.6.12-rc1-bk3 1167 1146 413. 1726.8 573.6 535.4 283.6 522. 515.1
margin Linux 2.6.12-rc1-bk3 1156 1875 905. 1721.7 573.9 535.4 283.8 521. 515.0
margin Linux 2.6.12-rc1-bk3 1103 1741 493. 1727.7 573.6 534.8 283.3 521. 514.8
margin Linux 2.6.12-rc1-bk3-dm 1160 1361 886. 1718.7 573.6 535.0 284.7 521. 514.8
margin Linux 2.6.12-rc1-bk3-dm 1166 1759 665. 1733.0 565.0 535.2 284.6 521. 514.8
margin Linux 2.6.12-rc1-bk3-dm 1140 1879 606. 1706.6 573.6 535.1 283.5 521. 514.6

patch:

Index: linux-2.6.11/arch/ia64/lib/clear_page.S
===================================================================
--- linux-2.6.11.orig/arch/ia64/lib/clear_page.S 2005-03-01 23:37:47.000000000 -0800
+++ linux-2.6.11/arch/ia64/lib/clear_page.S 2005-03-31 14:25:17.000000000 -0800
@@ -43,7 +43,7 @@ GLOBAL_ENTRY(clear_page)
adds dst1 = 16, in0
adds dst2 = 32, in0
;;
-.fetch: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
+.fetch: stf.spill [dst_fetch] = f0, L3_LINE_SIZE
adds dst3 = 48, in0 // executing this multiple times is harmless
br.cloop.sptk.few .fetch
;;
@@ -53,23 +53,23 @@ GLOBAL_ENTRY(clear_page)
;;
#ifdef CONFIG_ITANIUM
// Optimized for Itanium
-1: stf.spill.nta [dst1] = f0, 64
- stf.spill.nta [dst2] = f0, 64
+1: stf.spill [dst1] = f0, 64
+ stf.spill [dst2] = f0, 64
cmp.lt p8,p0=dst_fetch, dst_last
;;
#else
// Optimized for McKinley
-1: stf.spill.nta [dst1] = f0, 64
- stf.spill.nta [dst2] = f0, 64
- stf.spill.nta [dst3] = f0, 64
- stf.spill.nta [dst4] = f0, 128
+1: stf.spill [dst1] = f0, 64
+ stf.spill [dst2] = f0, 64
+ stf.spill [dst3] = f0, 64
+ stf.spill [dst4] = f0, 128
cmp.lt p8,p0=dst_fetch, dst_last
;;
- stf.spill.nta [dst1] = f0, 64
- stf.spill.nta [dst2] = f0, 64
+ stf.spill [dst1] = f0, 64
+ stf.spill [dst2] = f0, 64
#endif
- stf.spill.nta [dst3] = f0, 64
-(p8) stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
+ stf.spill [dst3] = f0, 64
+(p8) stf.spill [dst_fetch] = f0, L3_LINE_SIZE
br.cloop.sptk.few 1b
;;
mov ar.lc = saved_lc // restore lc

2005-04-06 00:24:13

by David Mosberger

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

>>>>> On Tue, 5 Apr 2005 17:15:53 -0700 (PDT), Christoph Lameter <[email protected]> said:

Christoph> On Thu, 24 Mar 2005, David Mosberger wrote:
>> That's definitely the case. See my earlier post on this topic:

>> http://www.gelato.unsw.edu.au/linux-ia64/0409/11012.html

>> Unfortunately, nobody reported any results for larger machines
>> and/or more interesting workloads, so the patch is in limbo at
>> this time. Clearly, if the CPU that's clearing the page is
>> likely to use that same page soon after, it'd be useful to use
>> temporal stores.

Christoph> Here are some numbers using lmbench of temporal writes
Christoph> vs. non temporal writes on ia64 (8p machine but lmbench
Christoph> run only for one load). There seems to be some benefit
Christoph> for fork/exec but overall this does not seem to be a
Christoph> clear win. I suspect that the distinction between
Christoph> temporal vs. nontemporal writes is more beneficial on
Christoph> machines with smaller pagesizes since the likelihood that
Christoph> most cachelines of a page are used soon is increased and
Christoph> therefore hot zeroing is more beneficial.

What LMbench test other than fork/exec would you have expected to be
affected by this? LMbench is not a good benchmark for this (remember:
it's a _micro_ benchmark).

--david

2005-04-06 00:36:54

by Christoph Lameter

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

On Tue, 5 Apr 2005, David Mosberger wrote:

> What LMbench test other than fork/exec would you have expected to be
> affected by this? LMbench is not a good benchmark for this (remember:
> it's a _micro_ benchmark).

LMbench does a variety of things and I expected to see at least
something on the page fault test and hopefully also some variations for
other tests.

Which benchmark would you recommend for this?

2005-04-06 04:49:12

by David Mosberger

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

>>>>> On Tue, 5 Apr 2005 17:33:59 -0700 (PDT), Christoph Lameter <[email protected]> said:

Christoph> Which benchmark would you recommend for this?

I don't know about "recommend", but I think SPECweb, SPECjbb,
the-UNIX-multi-user-benchmark-whose-name-I-keep-forgetting, and in
general anything that involves process-activity and/or large working
sets might be interesting (in other words: anything but
microbenchmarks, I'm afraid).

--david

2005-04-06 05:15:29

by Gerrit Huizenga

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order


On Tue, 05 Apr 2005 21:48:22 PDT, David Mosberger wrote:
> >>>>> On Tue, 5 Apr 2005 17:33:59 -0700 (PDT), Christoph Lameter <[email protected]> said:
>
> Christoph> Which benchmark would you recommend for this?
>
> I don't know about "recommend", but I think SPECweb, SPECjbb,
> the-UNIX-multi-user-benchmark-whose-name-I-keep-forgetting, and in
> general anything that involves process-activity and/or large working
> sets might be interesting (in other words: anything but
> microbenchmarks, I'm afraid).

SpecSDET, Aim7 or ReAim from OSDL are probably what you are thinking
of.

gerrit

2005-04-06 16:03:48

by Grant Grundler

[permalink] [raw]
Subject: Re: [PATCH] add a clear_pages function to clear pages of higher order

On Tue, Apr 05, 2005 at 10:15:18PM -0700, Gerrit Huizenga wrote:
> SpecSDET, Aim7 or ReAim from OSDL are probably what you are thinking of.

SDET isn't publicly available.
I hope by now osdl-reaim is called "osdl-aim7":
http://lkml.org/lkml/2003/8/1/172

grant