2002-10-24 17:09:39

by Manfred Spraul

[permalink] [raw]
Subject: [CFT] faster athlon/duron memory copy implementation

/*

(C) 2000 Arjan van de Ven and others licensed under the terms of the GPL


$Revision: 1.6 $
*/

/* CVS identification string; main() prints it as part of the program banner. */
static char cvsid[] = "$Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $";
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * fast_clear_page - zero one 4096-byte page with MMX movq stores.
 *
 * The 2.4 kernel one, adapted for userspace.
 *
 * The x87/MMX state is dumped into a 108-byte stack buffer with fsave
 * before %mm0 is touched and reloaded from it with frstor at the end.
 * %mm0 is zeroed by one asm statement and consumed by the asm statements
 * in the loop; no statement lists an MMX clobber, so the routine depends
 * on the statements staying in exactly this order.
 *
 * NOTE(review): fpu_save[0] is passed to fsave as an input ("m") operand
 * although the instruction writes the buffer - confirm whether an output
 * constraint covering the whole buffer is wanted.
 *
 * page: start of the page; 32 * 128 bytes are written from this address.
 */

static void fast_clear_page(void *page)
{
int i;
char fpu_save[108];

/* Save the FPU/MMX state so the caller's registers can be restored below. */
__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );

/* %mm0 = 0; the stores in the loop rely on it still being zero. */
__asm__ __volatile__ (
" pxor %%mm0, %%mm0\n" : :
);

/* 32 iterations of sixteen 8-byte stores = 4096 bytes. */
for(i=0;i<4096/128;i++)
{
__asm__ __volatile__ (
" movq %%mm0, (%0)\n"
" movq %%mm0, 8(%0)\n"
" movq %%mm0, 16(%0)\n"
" movq %%mm0, 24(%0)\n"
" movq %%mm0, 32(%0)\n"
" movq %%mm0, 40(%0)\n"
" movq %%mm0, 48(%0)\n"
" movq %%mm0, 56(%0)\n"
" movq %%mm0, 64(%0)\n"
" movq %%mm0, 72(%0)\n"
" movq %%mm0, 80(%0)\n"
" movq %%mm0, 88(%0)\n"
" movq %%mm0, 96(%0)\n"
" movq %%mm0, 104(%0)\n"
" movq %%mm0, 112(%0)\n"
" movq %%mm0, 120(%0)\n"
: : "r" (page) : "memory");
/* Byte-wise step on a void pointer (GNU C extension). */
page+=128;
}
/* femms is the 3DNow! variant of emms: leave MMX mode before frstor. */
__asm__ __volatile__ (
" femms\n" : :
);
/* Reload the state saved by the fsave at the top. */
__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );

}

/*
 * faster_clear_page - zero one 4096-byte page with movntq stores.
 *
 * Modified version for Athlon-family processors.  Compared with
 * fast_clear_page it stores with movntq (non-temporal store) instead of
 * movq, handles 64 bytes per iteration instead of 128, and issues an
 * sfence after the loop.
 *
 * As in the other MMX routines, the FPU/MMX state is saved with fsave and
 * reloaded with frstor, and %mm0 is zeroed in one asm statement and used
 * by later ones without any clobber list, so statement order matters.
 *
 * page: start of the page; 64 * 64 bytes are written from this address.
 */
static void faster_clear_page(void *page)
{
int i;
char fpu_save[108];

/* Save the FPU/MMX state before %mm0 is overwritten. */
__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
/* %mm0 = 0 for the stores below. */
__asm__ __volatile__ (
" pxor %%mm0, %%mm0\n" : :
);

/* 64 iterations of eight 8-byte stores = 4096 bytes. */
for(i=0;i<4096/64;i++)
{
__asm__ __volatile__ (
" movntq %%mm0, (%0)\n"
" movntq %%mm0, 8(%0)\n"
" movntq %%mm0, 16(%0)\n"
" movntq %%mm0, 24(%0)\n"
" movntq %%mm0, 32(%0)\n"
" movntq %%mm0, 40(%0)\n"
" movntq %%mm0, 48(%0)\n"
" movntq %%mm0, 56(%0)\n"
: : "r" (page) : "memory");
/* Byte-wise step on a void pointer (GNU C extension). */
page+=64;
}
/* Fence the non-temporal stores, then leave MMX mode (3DNow! femms). */
__asm__ __volatile__ (
" sfence \n "
" femms\n" : :
);
/* Reload the state saved by the fsave at the top. */
__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );

}

/*
 * even_faster_clear_page - experimental clear_page variant (playground).
 *
 * At this revision the body issues the same instruction sequence as
 * faster_clear_page: fsave, pxor %mm0, 64 iterations of eight movntq
 * stores, sfence + femms, frstor.  It exists so that new ideas can be
 * tried without disturbing faster_clear_page.
 *
 * page: start of the page; 64 * 64 bytes are written from this address.
 */
static void even_faster_clear_page(void *page)
{
int i;
char fpu_save[108];
/* Save the FPU/MMX state before %mm0 is overwritten. */
__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );

/* %mm0 = 0; later asm statements depend on it (no clobbers are declared). */
__asm__ __volatile__ (
" pxor %%mm0, %%mm0\n" : :
);

/* 64 iterations of eight 8-byte stores = 4096 bytes. */
for(i=0;i<4096/64;i++)
{
__asm__ __volatile__ (
" movntq %%mm0, (%0)\n"
" movntq %%mm0, 8(%0)\n"
" movntq %%mm0, 16(%0)\n"
" movntq %%mm0, 24(%0)\n"
" movntq %%mm0, 32(%0)\n"
" movntq %%mm0, 40(%0)\n"
" movntq %%mm0, 48(%0)\n"
" movntq %%mm0, 56(%0)\n"
: : "r" (page) : "memory");
/* Byte-wise step on a void pointer (GNU C extension). */
page+=64;
}
/* Fence the non-temporal stores, then leave MMX mode (3DNow! femms). */
__asm__ __volatile__ (
" sfence \n "
" femms\n" : :
);
/* Reload the state saved by the fsave at the top. */
__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );

}

/*
 * slow_zero_page - zero one 4096-byte page with "rep stosl".
 *
 * The "fallback" one as used by the kernel: 1024 dword stores of zero.
 *
 * rep stosl modifies the count and destination registers, so both are
 * declared as dummy outputs tied to their inputs.  The dummies have the
 * same width as the operands they are tied to (a pointer for the
 * destination, a long for the count) so the matching constraints stay
 * consistent on targets where int is narrower than a pointer; on i386
 * this is identical to using int.
 *
 * page: start of the page to clear.
 */
static void slow_zero_page(void * page)
{
	long count_left;	/* receives the final count register */
	void *dest_end;		/* receives the final destination register */

	__asm__ __volatile__(
		"cld\n\t"
		"rep ; stosl"
		: "=&c" (count_left), "=&D" (dest_end)
		: "a" (0), "1" (page), "0" (1024L)
		: "memory");
}

/*
 * slow_copy_page - copy one 4096-byte page with "rep movsl".
 *
 * 1024 dword moves from 'from' to 'to'.  rep movsl modifies the count,
 * destination and source registers, so all three are declared as dummy
 * outputs tied to their inputs.  The dummies are long, the same type the
 * pointer inputs are cast to, so the tied operands have equal width on
 * every target; on i386 this is identical to using int.
 *
 * to:   destination page.
 * from: source page.
 */
static void slow_copy_page(void *to, void *from)
{
	long count_left, dest_end, src_end;

	__asm__ __volatile__(
		"cld\n\t"
		"rep ; movsl"
		: "=&c" (count_left), "=&D" (dest_end), "=&S" (src_end)
		: "0" (1024L), "1" ((long) to), "2" ((long) from)
		: "memory");
}


/*
 * fast_copy_page - copy one 4096-byte page with MMX movq loads/stores.
 *
 * 2.4 kernel mmx copy_page function.
 *
 * The FPU/MMX state is saved with fsave, the first 320 bytes of the
 * source are prefetched, and then 64 bytes are copied per iteration
 * through %mm0-%mm3 while prefetching 320 bytes ahead.  Afterwards femms
 * leaves MMX mode and frstor reloads the saved state.
 *
 * The epilogue previously executed fsave a second time, so the caller's
 * FPU state was never restored; it now uses frstor like the other MMX
 * routines in this file.
 *
 * The asm statements hand data to each other through the MMX registers
 * without declaring clobbers, so their order must not change.
 *
 * to:   destination page.
 * from: source page.
 */
static void fast_copy_page(void *to, void *from)
{
	int i;
	char fpu_save[108];

	/* Save the FPU/MMX state before the MMX registers are used. */
	__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );

	/* Warm-up prefetch of the first five 64-byte lines of the source. */
	__asm__ __volatile__ (
		"   prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		: : "r" (from) );

	/* 64 iterations x 64 bytes = 4096 bytes. */
	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
			"   prefetch 320(%0)\n"
			"   movq (%0), %%mm0\n"
			"   movq 8(%0), %%mm1\n"
			"   movq 16(%0), %%mm2\n"
			"   movq 24(%0), %%mm3\n"
			"   movq %%mm0, (%1)\n"
			"   movq %%mm1, 8(%1)\n"
			"   movq %%mm2, 16(%1)\n"
			"   movq %%mm3, 24(%1)\n"
			"   movq 32(%0), %%mm0\n"
			"   movq 40(%0), %%mm1\n"
			"   movq 48(%0), %%mm2\n"
			"   movq 56(%0), %%mm3\n"
			"   movq %%mm0, 32(%1)\n"
			"   movq %%mm1, 40(%1)\n"
			"   movq %%mm2, 48(%1)\n"
			"   movq %%mm3, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");
		/* Byte-wise steps on void pointers (GNU C extension). */
		from += 64;
		to += 64;
	}

	/* Leave MMX mode (3DNow! femms) ... */
	__asm__ __volatile__ (
		" femms\n" : :
	);
	/* ... and restore the state saved at the top (was a second fsave). */
	__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
}


/*
 * faster_copy_page - copy one 4096-byte page; Athlon improved version.
 *
 * Differences from fast_copy_page visible here: prefetchnta is used
 * instead of prefetch, all eight MMX registers are loaded before any
 * store, the stores are movntq (non-temporal), and an sfence follows the
 * loop.  The initial prefetches are issued before the fsave.
 *
 * The asm statements pass data through %mm0-%mm7 and the saved FPU image
 * without declaring clobbers, so their order must not change.
 *
 * to:   destination page.
 * from: source page.
 */
static void faster_copy_page(void *to, void *from)
{
int i;
char fpu_save[108];

/* Warm-up prefetch of the first four 64-byte lines of the source. */
__asm__ __volatile__ (
"1: prefetchnta (%0)\n"
" prefetchnta 64(%0)\n"
" prefetchnta 128(%0)\n"
" prefetchnta 192(%0)\n"
: : "r" (from) );

/* Save the FPU/MMX state before the MMX registers are used. */
__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );

/* 64 iterations x 64 bytes = 4096 bytes. */
for(i=0; i<4096/64; i++)
{
/* Load 64 bytes into %mm0-%mm7, then store them; prefetch 320 bytes ahead. */
__asm__ __volatile__ (
"1: prefetchnta 320(%0)\n"
"2: movq (%0), %%mm0\n"
" movq 8(%0), %%mm1\n"
" movq 16(%0), %%mm2\n"
" movq 24(%0), %%mm3\n"
" movq 32(%0), %%mm4\n"
" movq 40(%0), %%mm5\n"
" movq 48(%0), %%mm6\n"
" movq 56(%0), %%mm7\n"
" movntq %%mm0, (%1)\n"
" movntq %%mm1, 8(%1)\n"
" movntq %%mm2, 16(%1)\n"
" movntq %%mm3, 24(%1)\n"
" movntq %%mm4, 32(%1)\n"
" movntq %%mm5, 40(%1)\n"
" movntq %%mm6, 48(%1)\n"
" movntq %%mm7, 56(%1)\n"
: : "r" (from), "r" (to) : "memory");
/* Byte-wise steps on void pointers (GNU C extension). */
from+=64;
to+=64;
}
/* Leave MMX mode (3DNow! femms) and fence the non-temporal stores. */
__asm__ __volatile__ (
" femms \n "
" sfence\n" : :
);
/* Reload the state saved by the fsave above. */
__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );

}

/*
 * even_faster_copy_page - experimental copy_page variant (playground).
 *
 * Same structure as faster_copy_page, with two visible differences: each
 * movq load is immediately followed by its movntq store instead of doing
 * all loads first, and the in-loop prefetchnta looks 256 bytes ahead
 * instead of 320.
 *
 * The asm statements pass data through the MMX registers and the saved
 * FPU image without declaring clobbers, so their order must not change.
 *
 * to:   destination page.
 * from: source page.
 */
static void even_faster_copy_page(void *to, void *from)
{
int i;
char fpu_save[108];

/* Warm-up prefetch of the first four 64-byte lines of the source. */
__asm__ __volatile__ (
"1: prefetchnta (%0)\n"
" prefetchnta 64(%0)\n"
" prefetchnta 128(%0)\n"
" prefetchnta 192(%0)\n"
: : "r" (from) );

/* Save the FPU/MMX state before the MMX registers are used. */
__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );

/* 64 iterations x 64 bytes = 4096 bytes. */
for(i=0; i<4096/64; i++)
{
/* Interleaved load/store pairs, one quadword at a time. */
__asm__ __volatile__ (
" prefetchnta 256(%0)\n"
" movq (%0), %%mm0\n"
" movntq %%mm0, (%1)\n"
" movq 8(%0), %%mm1\n"
" movntq %%mm1, 8(%1)\n"
" movq 16(%0), %%mm2\n"
" movntq %%mm2, 16(%1)\n"
" movq 24(%0), %%mm3\n"
" movntq %%mm3, 24(%1)\n"
" movq 32(%0), %%mm4\n"
" movntq %%mm4, 32(%1)\n"
" movq 40(%0), %%mm5\n"
" movntq %%mm5, 40(%1)\n"
" movq 48(%0), %%mm6\n"
" movntq %%mm6, 48(%1)\n"
" movq 56(%0), %%mm7\n"
" movntq %%mm7, 56(%1)\n"
: : "r" (from), "r" (to) : "memory");
/* Byte-wise steps on void pointers (GNU C extension). */
from+=64;
to+=64;
}
/* Leave MMX mode (3DNow! femms) and fence the non-temporal stores. */
__asm__ __volatile__ (
" femms \n "
" sfence\n" : :
);
/* Reload the state saved by the fsave above. */
__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );

}


/*
 * __constant_memcpy - copy n bytes from 'from' to 'to' and return 'to'.
 *
 * This looks horribly ugly, but the compiler can optimize it totally,
 * as the count is constant.
 *
 * A handful of small sizes are copied with plain loads and stores; every
 * other size falls out of the first switch and is copied with
 * "rep movsl" for the n/4 whole dwords plus movsw/movsb for the n%4 tail.
 *
 * The fixed-size cases move 4-byte units, so they use unsigned int rather
 * than unsigned long: long is 8 bytes on LP64 targets, where the old code
 * copied past the end of both buffers.  The asm dummy outputs are long so
 * that they have the same width as the (long) inputs they are tied to.
 * On i386 both changes generate the same code as before.
 *
 * The regions must not overlap.
 */
static inline void * __constant_memcpy(void * to, const void * from, size_t n)
{
	switch (n) {
	case 0:
		return to;
	case 1:
		*(unsigned char *)to = *(const unsigned char *)from;
		return to;
	case 2:
		*(unsigned short *)to = *(const unsigned short *)from;
		return to;
	case 3:
		*(unsigned short *)to = *(const unsigned short *)from;
		*(2+(unsigned char *)to) = *(2+(const unsigned char *)from);
		return to;
	case 4:
		*(unsigned int *)to = *(const unsigned int *)from;
		return to;
	case 6:	/* for Ethernet addresses */
		*(unsigned int *)to = *(const unsigned int *)from;
		*(2+(unsigned short *)to) = *(2+(const unsigned short *)from);
		return to;
	case 8:
		*(unsigned int *)to = *(const unsigned int *)from;
		*(1+(unsigned int *)to) = *(1+(const unsigned int *)from);
		return to;
	case 12:
		*(unsigned int *)to = *(const unsigned int *)from;
		*(1+(unsigned int *)to) = *(1+(const unsigned int *)from);
		*(2+(unsigned int *)to) = *(2+(const unsigned int *)from);
		return to;
	case 16:
		*(unsigned int *)to = *(const unsigned int *)from;
		*(1+(unsigned int *)to) = *(1+(const unsigned int *)from);
		*(2+(unsigned int *)to) = *(2+(const unsigned int *)from);
		*(3+(unsigned int *)to) = *(3+(const unsigned int *)from);
		return to;
	case 20:
		*(unsigned int *)to = *(const unsigned int *)from;
		*(1+(unsigned int *)to) = *(1+(const unsigned int *)from);
		*(2+(unsigned int *)to) = *(2+(const unsigned int *)from);
		*(3+(unsigned int *)to) = *(3+(const unsigned int *)from);
		*(4+(unsigned int *)to) = *(4+(const unsigned int *)from);
		return to;
	}
	/* No default on purpose: all other sizes use the string moves below. */

/* x is the tail sequence appended after the dword copy. */
#define COMMON(x) \
	__asm__ __volatile__( \
		"rep ; movsl" \
		x \
		: "=&c" (d0), "=&D" (d1), "=&S" (d2) \
		: "0" ((long) (n/4)),"1" ((long) to),"2" ((long) from) \
		: "memory")
	{
		/* Dummy outputs for the registers rep movsl modifies. */
		long d0, d1, d2;

		switch (n % 4) {
		case 0: COMMON(""); return to;
		case 1: COMMON("\n\tmovsb"); return to;
		case 2: COMMON("\n\tmovsw"); return to;
		default: COMMON("\n\tmovsw\n\tmovsb"); return to;
		}
	}

#undef COMMON
}


/*
 * normal_copy_page - copy one 4096-byte page with __constant_memcpy.
 * The returned destination pointer is not needed and is discarded.
 */
static void normal_copy_page(void *to, void *from)
{
	(void) __constant_memcpy(to, from, 4096);
}


/*
 * __constant_c_and_count_memset - fill count bytes at s and return s.
 *
 * This looks horribly ugly, but the compiler can optimize it totally,
 * as we by now know that both pattern and count is constant..
 *
 * 'pattern' holds the fill byte replicated into every byte of a 32-bit
 * word (for example 0x5a5a5a5a); only its low 32 bits are used.
 * Counts 0-4 are handled with plain stores; every other count falls out
 * of the first switch and is filled with "rep stosl" for the count/4
 * whole dwords plus stosw/stosb for the count%4 tail.
 *
 * The 4-byte case stores an unsigned int rather than an unsigned long:
 * long is 8 bytes on LP64 targets, where the old code wrote past the end
 * of the buffer.  The asm dummy outputs are long so that they have the
 * same width as the inputs they are tied to.  On i386 both changes
 * generate the same code as before.
 */
static inline void * __constant_c_and_count_memset(void * s, unsigned long pattern, size_t count)
{
	switch (count) {
	case 0:
		return s;
	case 1:
		*(unsigned char *)s = pattern;
		return s;
	case 2:
		*(unsigned short *)s = pattern;
		return s;
	case 3:
		*(unsigned short *)s = pattern;
		*(2+(unsigned char *)s) = pattern;
		return s;
	case 4:
		*(unsigned int *)s = pattern;
		return s;
	}
	/* No default on purpose: all other counts use the string stores below. */

/* x is the tail sequence appended after the dword fill. */
#define COMMON(x) \
	__asm__ __volatile__( \
		"rep ; stosl" \
		x \
		: "=&c" (d0), "=&D" (d1) \
		: "a" (pattern),"0" ((long) (count/4)),"1" ((long) s) \
		: "memory")
	{
		/* Dummy outputs for the registers rep stosl modifies. */
		long d0, d1;

		switch (count % 4) {
		case 0: COMMON(""); return s;
		case 1: COMMON("\n\tstosb"); return s;
		case 2: COMMON("\n\tstosw"); return s;
		default: COMMON("\n\tstosw\n\tstosb"); return s;
		}
	}

#undef COMMON
}

/*
 * normal_clear_page - zero one 4096-byte page with
 * __constant_c_and_count_memset.  The returned pointer is discarded.
 */
static void normal_clear_page(void *to)
{
	(void) __constant_c_and_count_memset(to, 0, 4096);
}

/*
 * no_prefetch_copy_page - copy one 4096-byte page without prefetch
 * instructions; test version to see if we can go even faster.
 *
 * Instead of prefetching, a first loop reads one dword from every
 * 64-byte line of the source, walking backwards from the end of the page
 * (offsets 4032, 3968, ... 0); the values read are discarded.  The copy
 * loop is then the same interleaved movq/movntq sequence as
 * even_faster_copy_page without its prefetchnta.  The epilogue uses emms
 * (not femms) and has sfence, emms and frstor in a single asm statement.
 *
 * NOTE(review): the first loop uses the int 'i' as an index register
 * next to a pointer base register; presumably this only assembles where
 * int and pointers have the same width (i386) - confirm before building
 * for another target.
 *
 * to:   destination page.
 * from: source page.
 */
static void no_prefetch_copy_page(void *to, void *from) {
int i, d1;
char fpu_save[108];

/* Touch the source page back to front, four 64-byte lines per iteration. */
for (i=4096-256;i>=0;i-=256)
__asm__ __volatile(
"movl 192(%1,%2),%0\n"
"movl 128(%1,%2),%0\n"
"movl 64(%1,%2),%0\n"
"movl 0(%1,%2),%0\n"
: "=&r" (d1)
: "r" (from), "r" (i));

/* Save the FPU/MMX state before the MMX registers are used. */
__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );

/* 64 iterations x 64 bytes = 4096 bytes. */
for(i=0; i<4096/64; i++) {
__asm__ __volatile__ (
" movq (%0), %%mm0\n"
" movntq %%mm0, (%1)\n"
" movq 8(%0), %%mm1\n"
" movntq %%mm1, 8(%1)\n"
" movq 16(%0), %%mm2\n"
" movntq %%mm2, 16(%1)\n"
" movq 24(%0), %%mm3\n"
" movntq %%mm3, 24(%1)\n"
" movq 32(%0), %%mm4\n"
" movntq %%mm4, 32(%1)\n"
" movq 40(%0), %%mm5\n"
" movntq %%mm5, 40(%1)\n"
" movq 48(%0), %%mm6\n"
" movntq %%mm6, 48(%1)\n"
" movq 56(%0), %%mm7\n"
" movntq %%mm7, 56(%1)\n"
: : "r" (from), "r" (to) : "memory");
/* Byte-wise steps on void pointers (GNU C extension). */
from+=64;
to+=64;
}
/* Fence the non-temporal stores, leave MMX mode, reload the saved state. */
__asm__ __volatile__ (
" sfence \n "
" emms\n"
" frstor %0;\n" ::"m"(fpu_save[0]) );
}


/*
 * rdtsc(low, high) - execute the rdtsc instruction and store EAX in 'low'
 * and EDX in 'high'.  The callers below combine the two values into one
 * 64-bit cycle count as (high << 32) + low.
 */
#define rdtsc(low,high) \
__asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))

/* Type of the clear-page routines under test: takes the page to clear. */
typedef void (clear_func)(void *);
/* Type of the copy-page routines under test; the harness passes (to, from). */
typedef void (copy_func)(void *,void *);

/*
 * test_one_clearpage - time one clear_page implementation.
 *
 * Calls func on 4*1024 consecutive 4096-byte pages starting at Buffer
 * (16 MB in total), reads the time-stamp counter before and after, and
 * prints the average number of cycles per page.  If the second reading is
 * lower than the first the run is reported as invalid.
 *
 * The counter halves are combined as unsigned 64-bit values and the
 * result is printed with %llu to match its unsigned long long type.
 *
 * func:   routine under test.
 * name:   label printed in the result line.
 * Buffer: start of a writable region of at least 16 MB.
 */
void test_one_clearpage(clear_func *func, char *name, char *Buffer)
{
	char *temp;
	int i;
	unsigned int blow,bhigh,alow,ahigh;
	unsigned long long before,after;

	rdtsc(blow,bhigh);
	temp = Buffer;
	for (i=0;i<4*1024;i++) {
		func(temp);
		temp += 4096;
	}
	rdtsc(alow,ahigh);

	before = blow + (((unsigned long long)bhigh)<<32);
	after = alow + (((unsigned long long)ahigh)<<32);
	if (before>after) {
		printf("test invalid; timer overflow \n");
		return;
	}
	printf("clear_page function '%s'\t took %4llu cycles per page\n",name,(after-before)/(4*1024) );
}

/*
 * test_one_copypage - time one copy_page implementation.
 *
 * Sleeps for one second, then calls func for 2*1024 consecutive
 * 4096-byte pages: the destination walks the first 8 MB of Buffer and the
 * source is the page 8 MB above it.  The time-stamp counter is read
 * before and after and the average number of cycles per page is printed.
 * If the second reading is lower than the first the run is reported as
 * invalid.
 *
 * The counter halves are combined as unsigned 64-bit values and the
 * result is printed with %llu to match its unsigned long long type.
 *
 * func:   routine under test, called as func(to, from).
 * name:   label printed in the result line.
 * Buffer: start of a region of at least 16 MB.
 */
void test_one_copypage(copy_func *func, char *name, char *Buffer)
{
	char *temp;
	int i;
	unsigned int blow,bhigh,alow,ahigh;
	unsigned long long before,after;

	sleep(1);
	rdtsc(blow,bhigh);
	temp = Buffer;
	for (i=0;i<2*1024;i++) {
		func(temp,temp+8*1024*1024);
		temp += 4096;
	}
	rdtsc(alow,ahigh);

	before = blow + (((unsigned long long)bhigh)<<32);
	after = alow + (((unsigned long long)ahigh)<<32);
	if (before>after) {
		printf("test invalid; timer overflow \n");
		return;
	}
	printf("copy_page function '%s'\t took %4llu cycles per page\n",name,(after-before)/(2*1024) );
}


void test_clearpage(char *Buffer)
{
printf("clear_page() tests \n");

test_one_clearpage(fast_clear_page,"warm up run",Buffer);
test_one_clearpage(normal_clear_page,"2.4 non MMX",Buffer);
test_one_clearpage(slow_zero_page,"2.4 MMX fallback",Buffer);
test_one_clearpage(fast_clear_page,"2.4 MMX version",Buffer);
test_one_clearpage(faster_clear_page,"faster_clear_page",Buffer);
test_one_clearpage(even_faster_clear_page,"even_faster_clear",Buffer);
}

void test_copypage(char *Buffer)
{
printf("copy_page() tests \n");

test_one_copypage(fast_copy_page, "warm up run",Buffer);
test_one_copypage(normal_copy_page,"2.4 non MMX",Buffer);
test_one_copypage(slow_copy_page, "2.4 MMX fallback",Buffer);
test_one_copypage(fast_copy_page, "2.4 MMX version",Buffer);
test_one_copypage(faster_copy_page,"faster_copy",Buffer);
test_one_copypage(even_faster_copy_page,"even_faster",Buffer);
test_one_copypage(no_prefetch_copy_page,"no_prefetch",Buffer);
}

/*
 * main - allocate a 16 MB buffer, fill it with 0xfe, print the banner and
 * run the copy_page tests.
 *
 * Only test_copypage() is run; test_clearpage() is defined in this file
 * but is not called from here.
 *
 * Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
 */
int main(void)
{
	const size_t buffer_bytes = 1024*1024*16;
	char *Buffer;

	Buffer = malloc(buffer_bytes);
	if (Buffer == NULL) {
		/* Without the buffer the memset below would dereference NULL. */
		fprintf(stderr, "cannot allocate %lu byte test buffer\n",
			(unsigned long) buffer_bytes);
		return EXIT_FAILURE;
	}
	memset(Buffer,0xfe,buffer_bytes);

	printf("Athlon test program %s \n",cvsid);

	printf("\n");
	test_copypage(Buffer);

	free(Buffer);

	return 0;
}


Attachments:
athlon.c (12.90 kB)

2002-10-24 17:35:19

by Andreas Steinmetz

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

Athlon TB 900/VIA KT133

titanic:/tmp # cat /proc/cpuinfo
processor : 0
vendor_id : AuthenticAMD
cpu family : 6
model : 4
model name : AMD Athlon(tm) Processor
stepping : 2
cpu MHz : 902.075
cache size : 256 KB
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge
mca cmov pat pse36 mmx fxsr syscall mmxext 3dnowext 3dnow
bogomips : 1795.68

titanic:/tmp # gcc -O3 -s -o athlon athlon.c
titanic:/tmp # ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13359 cycles per page
copy_page function '2.4 non MMX' took 20749 cycles per page
copy_page function '2.4 MMX fallback' took 20737 cycles per page
copy_page function '2.4 MMX version' took 13545 cycles per page
copy_page function 'faster_copy' took 8132 cycles per page
copy_page function 'even_faster' took 8123 cycles per page
copy_page function 'no_prefetch' took 7648 cycles per page
titanic:/tmp # ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13398 cycles per page
copy_page function '2.4 non MMX' took 20774 cycles per page
copy_page function '2.4 MMX fallback' took 20749 cycles per page
copy_page function '2.4 MMX version' took 13349 cycles per page
copy_page function 'faster_copy' took 8130 cycles per page
copy_page function 'even_faster' took 8168 cycles per page
copy_page function 'no_prefetch' took 7631 cycles per page
titanic:/tmp # ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13470 cycles per page
copy_page function '2.4 non MMX' took 20780 cycles per page
copy_page function '2.4 MMX fallback' took 20784 cycles per page
copy_page function '2.4 MMX version' took 13384 cycles per page
copy_page function 'faster_copy' took 8172 cycles per page
copy_page function 'even_faster' took 8137 cycles per page
copy_page function 'no_prefetch' took 7633 cycles per page
titanic:/tmp # ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13377 cycles per page
copy_page function '2.4 non MMX' took 20764 cycles per page
copy_page function '2.4 MMX fallback' took 20831 cycles per page
copy_page function '2.4 MMX version' took 13336 cycles per page
copy_page function 'faster_copy' took 8140 cycles per page
copy_page function 'even_faster' took 8131 cycles per page
copy_page function 'no_prefetch' took 7670 cycles per page

--
Andreas Steinmetz
D.O.M. Datenverarbeitung GmbH

2002-10-24 17:31:25

by Robert Love

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

On Thu, 2002-10-24 at 13:15, Manfred Spraul wrote:

> Attached is a test app that compares several memory copy implementations.
> Could you run it and report the results to me, together with cpu,
> chipset and memory type?

Hi Manfred. Below is the average of three runs.

Dual Athlon 1600, AMD 760M chipset, 2GB of ECC DDR266.

Looks like AMD is right :)

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 18622 cycles per page
copy_page function '2.4 non MMX' took 21086 cycles per page
copy_page function '2.4 MMX fallback' took 21096 cycles per page
copy_page function '2.4 MMX version' took 18498 cycles per page
copy_page function 'faster_copy' took 10311 cycles per page
copy_page function 'even_faster' took 10464 cycles per page
copy_page function 'no_prefetch' took 8589 cycles per page

Robert Love

2002-10-24 17:42:42

by Matthias Welk

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

On Thursday 24 October 2002 19:15, Manfred Spraul wrote:
> AMD recommends to perform memory copies with backward read operations
> instead of prefetch.
>
> http://208.15.46.63/events/gdc2002.htm
>
> Attached is a test app that compares several memory copy implementations.
> Could you run it and report the results to me, together with cpu,
> chipset and memory type?
>
> Please run 2 or 3 times.
>
> --
> Manfred

Running on an Athlon XP2000+, ASUS A7V333, 768MB DDR2100:

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 18132 cycles per page
copy_page function '2.4 non MMX' took 25200 cycles per page
copy_page function '2.4 MMX fallback' took 19369 cycles per page
copy_page function '2.4 MMX version' took 18078 cycles per page
copy_page function 'faster_copy' took 11343 cycles per page
copy_page function 'even_faster' took 11203 cycles per page
copy_page function 'no_prefetch' took 7814 cycles per page
1019 [maw] (buruk) /tmp/athlon # athlon_test
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 18081 cycles per page
copy_page function '2.4 non MMX' took 19487 cycles per page
copy_page function '2.4 MMX fallback' took 19403 cycles per page
copy_page function '2.4 MMX version' took 18086 cycles per page
copy_page function 'faster_copy' took 11372 cycles per page
copy_page function 'even_faster' took 11183 cycles per page
copy_page function 'no_prefetch' took 7815 cycles per page
1020 [maw] (buruk) /tmp/athlon # athlon_test
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 18081 cycles per page
copy_page function '2.4 non MMX' took 19487 cycles per page
copy_page function '2.4 MMX fallback' took 19453 cycles per page
copy_page function '2.4 MMX version' took 18063 cycles per page
copy_page function 'faster_copy' took 11335 cycles per page
copy_page function 'even_faster' took 11154 cycles per page
copy_page function 'no_prefetch' took 8332 cycles per page

Greeting, Matthias.
--
---------------------------------------------------------------
From: Matthias Welk office: +49-30-3463-7272
FhG-FOKUS mobile: +49-179- 1144752
Kaiserin-Augusta-Allee 31 fax : +49-30-3463-8672
10589 Berlin email : [email protected]
---------------------------------------------------------------


2002-10-24 17:46:51

by Roger Luethi

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

Athlon 1400, ALi chipset, 1 GB SDRAM

Deviation in 3 runs < 1%.

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 29353 cycles per page
copy_page function '2.4 non MMX' took 34621 cycles per page
copy_page function '2.4 MMX fallback' took 34606 cycles per page
copy_page function '2.4 MMX version' took 29239 cycles per page
copy_page function 'faster_copy' took 17236 cycles per page
copy_page function 'even_faster' took 17453 cycles per page
copy_page function 'no_prefetch' took 12628 cycles per page

2002-10-24 17:59:37

by Zach Brown

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

> On Thu, 2002-10-24 at 13:15, Manfred Spraul wrote:
>
> > Attached is a test app that compares several memory copy implementations.
> > Could you run it and report the results to me, together with cpu,
> > chipset and memory type?

CPU0: AMD Athlon(tm) MP 1800+ stepping 02

in a tyan tiger mpx (amd762 north bridge), two 512m non-buffered
pc2100 ddr sticks, in a mostly-idle dual workstation:

copy_page() tests
copy_page function 'warm up run' took 16543 cycles per page
copy_page function '2.4 non MMX' took 18241 cycles per page
copy_page function '2.4 MMX fallback' took 18144 cycles per page
copy_page function '2.4 MMX version' took 16551 cycles per page
copy_page function 'faster_copy' took 10099 cycles per page
copy_page function 'even_faster' took 10218 cycles per page
copy_page function 'no_prefetch' took 9618 cycles per page

copy_page() tests
copy_page function 'warm up run' took 16618 cycles per page
copy_page function '2.4 non MMX' took 18274 cycles per page
copy_page function '2.4 MMX fallback' took 18126 cycles per page
copy_page function '2.4 MMX version' took 16649 cycles per page
copy_page function 'faster_copy' took 10100 cycles per page
copy_page function 'even_faster' took 10219 cycles per page
copy_page function 'no_prefetch' took 9571 cycles per page

copy_page() tests
copy_page function 'warm up run' took 16571 cycles per page
copy_page function '2.4 non MMX' took 18265 cycles per page
copy_page function '2.4 MMX fallback' took 18076 cycles per page
copy_page function '2.4 MMX version' took 16558 cycles per page
copy_page function 'faster_copy' took 10112 cycles per page
copy_page function 'even_faster' took 10207 cycles per page
copy_page function 'no_prefetch' took 9582 cycles per page

- z

2002-10-24 18:11:34

by Eric Lammerts

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation


On Thu, 24 Oct 2002, Manfred Spraul wrote:
> Attached is a test app that compares several memory copy implementations.
> Could you run it and report the results to me, together with cpu,
> chipset and memory type?

vendor_id : AuthenticAMD
cpu family : 6
model : 3
model name : AMD Duron(tm) Processor
stepping : 1
cpu MHz : 841.223 <--- 8 * 105MHz FSB
cache size : 64 KB

00:00.0 Host bridge: VIA Technologies, Inc. VT8363/8365 [KT133/KM133] (rev 03)
00:01.0 PCI bridge: VIA Technologies, Inc. VT8363/8365 [KT133/KM133 AGP]
00:07.0 ISA bridge: VIA Technologies, Inc. VT82C686 [Apollo Super South] (rev 22)

Memory: 256 + 128Mb PC133 SDRAM


Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 18450 cycles per page
copy_page function '2.4 non MMX' took 22432 cycles per page
copy_page function '2.4 MMX fallback' took 22448 cycles per page
copy_page function '2.4 MMX version' took 17096 cycles per page
copy_page function 'faster_copy' took 11092 cycles per page
copy_page function 'even_faster' took 10770 cycles per page
copy_page function 'no_prefetch' took 10323 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 17674 cycles per page
copy_page function '2.4 non MMX' took 21895 cycles per page
copy_page function '2.4 MMX fallback' took 21774 cycles per page
copy_page function '2.4 MMX version' took 17683 cycles per page
copy_page function 'faster_copy' took 10954 cycles per page
copy_page function 'even_faster' took 10697 cycles per page
copy_page function 'no_prefetch' took 10309 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 17985 cycles per page
copy_page function '2.4 non MMX' took 22498 cycles per page
copy_page function '2.4 MMX fallback' took 21063 cycles per page
copy_page function '2.4 MMX version' took 17415 cycles per page
copy_page function 'faster_copy' took 12003 cycles per page
copy_page function 'even_faster' took 11297 cycles per page
copy_page function 'no_prefetch' took 10440 cycles per page

Eric


2002-10-24 18:15:49

by Daniel Egger

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

Am Don, 2002-10-24 um 19.15 schrieb Manfred Spraul:

> Attached is a test app that compares several memory copy implementations.
> Could you run it and report the results to me, together with cpu,
> chipset and memory type?

SiS 735, Duron 1200, 512 MB PC133 (running at 100Mhz).

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 19860 cycles per page
copy_page function '2.4 non MMX' took 21205 cycles per page
copy_page function '2.4 MMX fallback' took 21262 cycles per page
copy_page function '2.4 MMX version' took 19893 cycles per page
copy_page function 'faster_copy' took 12746 cycles per page
copy_page function 'even_faster' took 13112 cycles per page
copy_page function 'no_prefetch' took 10217 cycles per page

Since I was interested in seeing how the Via Ezra system here performs, I
also ran it there, but the last three tests each ended in a segfault. I
can explain two of them, but the no_prefetch one is a mystery right now.
Anyway:

PLE133, Via Ezra 667 Mhz, 128 MB PC100 (probably at 66Mhz)
egger@tanja:~$ ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 23213 cycles per page
copy_page function '2.4 non MMX' took 34971 cycles per page
copy_page function '2.4 MMX fallback' took 34958 cycles per page
copy_page function '2.4 MMX version' took 22774 cycles per page

--
Servus,
Daniel


Attachments:
signature.asc (189.00 B)
Dies ist ein digital signierter Nachrichtenteil

2002-10-24 18:20:37

by David Rees

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

On Thu, Oct 24, 2002 at 07:15:43PM +0200, Manfred Spraul wrote:
> AMD recommends to perform memory copies with backward read operations
> instead of prefetch.
>
> http://208.15.46.63/events/gdc2002.htm
>
> Attached is a test app that compares several memory copy implementations.
> Could you run it and report the results to me, together with cpu,
> chipset and memory type?
>
> Please run 2 or 3 times.

I ran it on two machines, 10 times each, discarded the highest and lowest
results, and averaged the rest.

Machine 1, Duron 600, KT133 chipset, 512MB PC100 memory

'warm up run' 11494.25
'2.4 non MMX' 16536.625
'2.4 MMX fallback' 16559.375
'2.4 MMX version' 11463.75
'faster_copy' 6757
'even_faster' 6620.375
'no_prefetch' 5996.5

> cat /proc/cpuinfo
processor : 0
vendor_id : AuthenticAMD
cpu family : 6
model : 3
model name : AMD Duron(tm) Processor
stepping : 0
cpu MHz : 605.410
cache size : 64 KB
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 mmx fxsr syscall mmxext 3dnowext 3dnow
bogomips : 1205.86
> cat /proc/pci
PCI devices found:
Bus 0, device 0, function 0:
Host bridge: VIA Technologies, Inc. VT8363/8365 [KT133/KM133] (rev 2).
Master Capable. Latency=8.
Prefetchable 32 bit memory at 0xe4000000 [0xe7ffffff].
Bus 0, device 1, function 0:
PCI bridge: VIA Technologies, Inc. VT8363/8365 [KT133/KM133 AGP] (rev 0).
Master Capable. No bursts. Min Gnt=8.
Bus 0, device 4, function 0:
ISA bridge: VIA Technologies, Inc. VT82C686 [Apollo Super South] (rev 34).


Machine 2, Original Athlon 700, Via Apollo Pro133 chipset, 512MB PC100 memory
(note: this machine wasn't totally idle during testing)

'warm up run' 15621.875
'2.4 non MMX' 22805
'2.4 MMX fallback' 19881.75
'2.4 MMX version' 15237.5
'faster_copy' 8985.5
'even_faster' 9134.25
'no_prefetch' 7960.5

> cat /proc/cpuinfo
processor : 0
vendor_id : AuthenticAMD
cpu family : 6
model : 2
model name : AMD Athlon(tm) Processor
stepping : 1
cpu MHz : 700.057
cache size : 512 KB
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 mmx fxsr syscall mmxext 3dnowext 3dnow
bogomips : 1395.91
cat /proc/pci
PCI devices found:
Bus 0, device 0, function 0:
Host bridge: VIA Technologies, Inc. VT82C693A/694x [Apollo PRO133x] (rev 2).
Prefetchable 32 bit memory at 0xe4000000 [0xe7ffffff].
Bus 0, device 1, function 0:
PCI bridge: VIA Technologies, Inc. VT82C598/694x [Apollo MVP3/Pro133x AGP] (rev 0).
Bus 0, device 4, function 0:
ISA bridge: VIA Technologies, Inc. VT82C686 [Apollo Super South] (rev 34).

2002-10-24 18:16:47

by Shawn Starr

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

I'll run this when I get home. I have an Athlon MP 2000+ (with only one CPU installed right now).

Shawn.

--
Shawn Starr
UNIX Systems Administrator, Operations
Datawire Communication Networks Inc.
10 Carlson Court, Suite 300
Toronto, ON, M9W 6L2
T: 416-213-2001 ext 179 F: 416-213-2008
[email protected]
"The power to Transact" - http://www.datawire.net

2002-10-24 18:29:47

by Josh McKinney

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

On approximately Thu, Oct 24, 2002 at 07:15:43PM +0200, Manfred Spraul wrote:

> Could you run it and report the results to me, together with cpu,
> chipset and memory type?
>

processor : 0
vendor_id : AuthenticAMD
cpu family : 6
model : 6
model name : AMD Athlon(tm) XP 1800+
stepping : 2
cpu MHz : 1529.541
cache size : 256 KB
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 sep mtrr pge mca cmov pat pse36 mmx fxsr sse syscall mmxext 3dnowext 3dnow
bogomips : 3053.97

RAM is PC2100 DDR, Mobo/Chipset Soyo K7V Dragon+ VIA KT266A

$ gcc-3.2 -o athlon-memcpy -O3 -march=athlon-xp athlon.c

$ ./athlon-memcpy
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 16159 cycles per page
copy_page function '2.4 non MMX' took 16867 cycles per page
copy_page function '2.4 MMX fallback' took 16486 cycles per page
copy_page function '2.4 MMX version' took 16116 cycles per page
copy_page function 'faster_copy' took 9679 cycles per page
copy_page function 'even_faster' took 9708 cycles per page
copy_page function 'no_prefetch' took 6879 cycles per page

$ ./athlon-memcpy
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 18627 cycles per page
copy_page function '2.4 non MMX' took 21079 cycles per page
copy_page function '2.4 MMX fallback' took 21081 cycles per page
copy_page function '2.4 MMX version' took 18658 cycles per page
copy_page function 'faster_copy' took 11334 cycles per page
copy_page function 'even_faster' took 11606 cycles per page
copy_page function 'no_prefetch' took 6925 cycles per page

$ ./athlon-memcpy
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 18631 cycles per page
copy_page function '2.4 non MMX' took 21015 cycles per page
copy_page function '2.4 MMX fallback' took 21085 cycles per page
copy_page function '2.4 MMX version' took 18619 cycles per page
copy_page function 'faster_copy' took 11388 cycles per page
copy_page function 'even_faster' took 11478 cycles per page
copy_page function 'no_prefetch' took 6961 cycles per page

$ ./athlon-memcpy
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 17246 cycles per page
copy_page function '2.4 non MMX' took 18617 cycles per page
copy_page function '2.4 MMX fallback' took 18319 cycles per page
copy_page function '2.4 MMX version' took 17235 cycles per page
copy_page function 'faster_copy' took 10356 cycles per page
copy_page function 'even_faster' took 10462 cycles per page
copy_page function 'no_prefetch' took 6889 cycles per page


2002-10-24 18:28:15

by Dave Jones

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

On Thu, Oct 24, 2002 at 07:15:43PM +0200, Manfred Spraul wrote:
> AMD recommends to perform memory copies with backward read operations
> instead of prefetch.
>
> http://208.15.46.63/events/gdc2002.htm
>
> Attached is a test app that compares several memory copy implementations.
> Could you run it and report the results to me, together with cpu,
> chipset and memory type?

processor : 0
vendor_id : AuthenticAMD
cpu family : 6
model : 2
model name : AMD Athlon(tm) Processor
stepping : 1
cpu MHz : 800.034
cache size : 512 KB

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 12570 cycles per page
copy_page function '2.4 non MMX' took 18763 cycles per page
copy_page function '2.4 MMX fallback' took 18764 cycles per page
copy_page function '2.4 MMX version' took 12564 cycles per page
copy_page function 'faster_copy' took 8001 cycles per page
copy_page function 'even_faster' took 7362 cycles per page
copy_page function 'no_prefetch' took 7536 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 12583 cycles per page
copy_page function '2.4 non MMX' took 18768 cycles per page
copy_page function '2.4 MMX fallback' took 21556 cycles per page
copy_page function '2.4 MMX version' took 12636 cycles per page
copy_page function 'faster_copy' took 7375 cycles per page
copy_page function 'even_faster' took 7368 cycles per page
copy_page function 'no_prefetch' took 7552 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 12562 cycles per page
copy_page function '2.4 non MMX' took 18755 cycles per page
copy_page function '2.4 MMX fallback' took 21687 cycles per page
copy_page function '2.4 MMX version' took 12604 cycles per page
copy_page function 'faster_copy' took 7358 cycles per page
copy_page function 'even_faster' took 7356 cycles per page
copy_page function 'no_prefetch' took 7566 cycles per page


00:00.0 Host bridge: VIA Technologies, Inc. VT82C693A/694x [Apollo PRO133x] (rev 02)
00:01.0 PCI bridge: VIA Technologies, Inc. VT82C598/694x [Apollo MVP3/Pro133x AGP]
00:04.0 ISA bridge: VIA Technologies, Inc. VT82C686 [Apollo Super South] (rev 22)
00:04.1 IDE interface: VIA Technologies, Inc. Bus Master IDE (rev 10)
00:04.2 USB Controller: VIA Technologies, Inc. USB (rev 10)
00:04.3 USB Controller: VIA Technologies, Inc. USB (rev 10)
00:04.4 Host bridge: VIA Technologies, Inc. VT82C686 [Apollo Super ACPI] (rev 30)

Memory type is unbranded PC133 SDRAM

Dave

--
| Dave Jones. http://www.codemonkey.org.uk

2002-10-24 18:37:20

by Simon Fowler

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

On Thu, Oct 24, 2002 at 07:15:43PM +0200, Manfred Spraul wrote:
> AMD recommends to perform memory copies with backward read operations
> instead of prefetch.
>
> http://208.15.46.63/events/gdc2002.htm
>
> Attached is a test app that compares several memory copy implementations.
> Could you run it and report the results to me, together with cpu,
> chipset and memory type?
>
> Please run 2 or 3 times.
>
simon@caccini:~/hacking$ cat /proc/cpuinfo
processor : 0
vendor_id : AuthenticAMD
cpu family : 6
model : 1
model name : AMD-K7(tm) Processor
stepping : 2
cpu MHz : 553.880
cache size : 512 KB
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 sep mtrr pge mca cmov pat mmx syscall mmxext 3dnowext 3dnow
bogomips : 1104.28

simon@caccini:~/hacking$ ./athlon; ./athlon; ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 12855 cycles per page
copy_page function '2.4 non MMX' took 17267 cycles per page
copy_page function '2.4 MMX fallback' took 14930 cycles per page
copy_page function '2.4 MMX version' took 10642 cycles per page
copy_page function 'faster_copy' took 10591 cycles per page
copy_page function 'even_faster' took 13035 cycles per page
copy_page function 'no_prefetch' took 11657 cycles per page
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 12871 cycles per page
copy_page function '2.4 non MMX' took 18482 cycles per page
copy_page function '2.4 MMX fallback' took 15013 cycles per page
copy_page function '2.4 MMX version' took 10679 cycles per page
copy_page function 'faster_copy' took 12268 cycles per page
copy_page function 'even_faster' took 10789 cycles per page
copy_page function 'no_prefetch' took 11691 cycles per page
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13110 cycles per page
copy_page function '2.4 non MMX' took 14958 cycles per page
copy_page function '2.4 MMX fallback' took 14952 cycles per page
copy_page function '2.4 MMX version' took 12864 cycles per page
copy_page function 'faster_copy' took 10581 cycles per page
copy_page function 'even_faster' took 10629 cycles per page
copy_page function 'no_prefetch' took 11607 cycles per page

Simon

--
PGP public key Id 0x144A991C, or http://himi.org/stuff/himi.asc
(crappy) Homepage: http://himi.org
doe #237 (see http://www.lemuria.org/DeCSS)
My DeCSS mirror: ftp://himi.org/pub/mirrors/css/


Attachments:
(No filename) (2.86 kB)
(No filename) (232.00 B)
Download all attachments

2002-10-24 18:48:30

by Dave Jones

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

On Fri, Oct 25, 2002 at 04:43:28AM +1000, Simon Fowler wrote:
> <deletia>
>
> copy_page() tests
> copy_page function 'warm up run' took 12855 cycles per page
> copy_page function '2.4 non MMX' took 17267 cycles per page
> copy_page function '2.4 MMX fallback' took 14930 cycles per page
> copy_page function '2.4 MMX version' took 10642 cycles per page
> copy_page function 'faster_copy' took 10591 cycles per page
> copy_page function 'even_faster' took 13035 cycles per page
> copy_page function 'no_prefetch' took 11657 cycles per page
> Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
>
> copy_page() tests
> copy_page function 'warm up run' took 12871 cycles per page
> copy_page function '2.4 non MMX' took 18482 cycles per page
> copy_page function '2.4 MMX fallback' took 15013 cycles per page
> copy_page function '2.4 MMX version' took 10679 cycles per page
> copy_page function 'faster_copy' took 12268 cycles per page
> copy_page function 'even_faster' took 10789 cycles per page
> copy_page function 'no_prefetch' took 11691 cycles per page
> Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
>
> copy_page() tests
> copy_page function 'warm up run' took 13110 cycles per page
> copy_page function '2.4 non MMX' took 14958 cycles per page
> copy_page function '2.4 MMX fallback' took 14952 cycles per page
> copy_page function '2.4 MMX version' took 12864 cycles per page
> copy_page function 'faster_copy' took 10581 cycles per page
> copy_page function 'even_faster' took 10629 cycles per page
> copy_page function 'no_prefetch' took 11607 cycles per page

Wow. The 612 really sucked badly here. I think this is the only
time I've seen 'even_faster' lose. A few people (myself included)
have in the past talked about making the memory copy routines
do a boot time benchmark somewhat like the RAID code does to deduce
the best. Seeing results like this makes me really believe this is
the way forward.

With something like that in place, we could then have separate
implementations for each processor revision if need be, without
pessimising for earlier revisions.

Dave

--
| Dave Jones. http://www.codemonkey.org.uk

2002-10-24 18:44:16

by Simon Fowler

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

On Fri, Oct 25, 2002 at 04:43:28AM +1000, Simon Fowler wrote:
> On Thu, Oct 24, 2002 at 07:15:43PM +0200, Manfred Spraul wrote:
> > AMD recommends to perform memory copies with backward read operations
> > instead of prefetch.
> >
> > http://208.15.46.63/events/gdc2002.htm
> >
> > Attached is a test app that compares several memory copy implementations.
> > Could you run it and report the results to me, together with cpu,
> > chipset and memory type?
> >
> > Please run 2 or 3 times.
> >
> simon@caccini:~/hacking$ cat /proc/cpuinfo
> processor : 0
> vendor_id : AuthenticAMD
> cpu family : 6
> model : 1
> model name : AMD-K7(tm) Processor
> stepping : 2
> cpu MHz : 553.880
> cache size : 512 KB
> fdiv_bug : no
> hlt_bug : no
> f00f_bug : no
> coma_bug : no
> fpu : yes
> fpu_exception : yes
> cpuid level : 1
> wp : yes
> flags : fpu vme de pse tsc msr pae mce cx8 sep mtrr pge mca cmov pat mmx syscall mmxext 3dnowext 3dnow
> bogomips : 1104.28
>
00:00.0 Host bridge: Advanced Micro Devices [AMD] AMD-751 [Irongate] System Controller (rev 25)
00:01.0 PCI bridge: Advanced Micro Devices [AMD] AMD-751 [Irongate] AGP Bridge (rev 01)

Generic PC100 and PC133 DIMM - both 256MB.

Simon

--
PGP public key Id 0x144A991C, or http://himi.org/stuff/himi.asc
(crappy) Homepage: http://himi.org
doe #237 (see http://www.lemuria.org/DeCSS)
My DeCSS mirror: ftp://himi.org/pub/mirrors/css/


Attachments:
(No filename) (1.49 kB)
(No filename) (232.00 B)
Download all attachments

2002-10-24 18:42:29

by Ernst Herzberg

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

On Donnerstag, 24. Oktober 2002 19:15, Manfred Spraul wrote:

> Attached is a test app that compares several memory copy implementations.
> Could you run it and report the results to me, together with cpu,
> chipset and memory type?
>
> Please run 2 or 3 times.

CPU: AMD Athlon(tm) XP 1800+ stepping 02
00:00.0 Host bridge: VIA Technologies, Inc.: Unknown device 3116
00:01.0 PCI bridge: VIA Technologies, Inc. VT8633 [Apollo Pro266 AGP]
512 MB DDR266 (Mem FSB changed to 100MHz due to memory problems)

copy_page() tests
copy_page function 'warm up run' took 20103 cycles per page
copy_page function '2.4 non MMX' took 22612 cycles per page
copy_page function '2.4 MMX fallback' took 22585 cycles per page
copy_page function '2.4 MMX version' took 20088 cycles per page
copy_page function 'faster_copy' took 12198 cycles per page
copy_page function 'even_faster' took 12266 cycles per page
copy_page function 'no_prefetch' took 9244 cycles per page
earny@dev:~/x> ./a.out
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 20051 cycles per page
copy_page function '2.4 non MMX' took 22580 cycles per page
copy_page function '2.4 MMX fallback' took 22610 cycles per page
copy_page function '2.4 MMX version' took 20124 cycles per page
copy_page function 'faster_copy' took 12276 cycles per page
copy_page function 'even_faster' took 12262 cycles per page
copy_page function 'no_prefetch' took 9213 cycles per page
earny@dev:~/x> ./a.out
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 20150 cycles per page
copy_page function '2.4 non MMX' took 22646 cycles per page
copy_page function '2.4 MMX fallback' took 22638 cycles per page
copy_page function '2.4 MMX version' took 20073 cycles per page
copy_page function 'faster_copy' took 12191 cycles per page
copy_page function 'even_faster' took 12261 cycles per page
copy_page function 'no_prefetch' took 9218 cycles per page

------------------

Wow!

<Earny>

2002-10-24 18:56:01

by Erich Boleyn

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation


Matthias Welk <[email protected]> wrote:

> Running on an Athlon XP2000+, ASUS A7V333, 768MB DDR2100:

...[snip]...

> 1019 [maw] (buruk) /tmp/athlon # athlon_test
> Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
>
> copy_page() tests
> copy_page function 'warm up run' took 18081 cycles per page
> copy_page function '2.4 non MMX' took 19487 cycles per page
> copy_page function '2.4 MMX fallback' took 19403 cycles per page
> copy_page function '2.4 MMX version' took 18086 cycles per page
> copy_page function 'faster_copy' took 11372 cycles per page
> copy_page function 'even_faster' took 11183 cycles per page
> copy_page function 'no_prefetch' took 7815 cycles per page
> 1020 [maw] (buruk) /tmp/athlon # athlon_test


Whoa! Hmm.

If I'm reading this right, with a processor speed of 1.666 GHz,
you're getting:

(4096 bytes / 7815 clocks) * 1.666 GHz = 873 MB/sec

The perfect peak performance of your setup, if the cache implements
standard write-allocate behavior (the target cache line is read before it
is written because the write logic doesn't know you're going to overwrite
the whole line in cases like this), should be:

MIN( Memory speed, FSB speed ) / 3 = 700 MB/sec


So what gives? Did I misinterpret the output of your program?
Is the test flawed?

--
Erich Stefan Boleyn <[email protected]> http://www.uruk.org/
"Reality is truly stranger than fiction; Probably why fiction is so popular"

2002-10-24 19:05:51

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

On Thu, Oct 24, 2002 at 12:01:54PM -0700, [email protected] wrote:
> The perfect peak performance of your setup, if the cache implements
> standard write-allocate behavior (the target cache line is read before it
> is written because the write logic doesn't know you're going to overwrite
> the whole line in cases like this), should be:

the point is to avoid the (in this case bad) write allocate...

2002-10-24 19:05:33

by Marcus Libäck

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

On Thu, 2002-10-24 at 19:15, Manfred Spraul wrote:
> AMD recommends to perform memory copies with backward read operations
> instead of prefetch.
>
> http://208.15.46.63/events/gdc2002.htm
>
> Attached is a test app that compares several memory copy implementations.
> Could you run it and report the results to me, together with cpu,
> chipset and memory type?
>
> Please run 2 or 3 times.
>
> --
> Manfred
> ----
>

[phuse@buffy:~/files]$ cat /proc/cpuinfo
processor : 0
vendor_id : AuthenticAMD
cpu family : 6
model : 6
model name : AMD Athlon(tm) XP 1900+
stepping : 2
cpu MHz : 1601.986
cache size : 256 KB
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de tsc msr pae mce cx8 sep mtrr pge mca cmov
pat pse36 mmx fxsr sse syscall mmxext 3dnowext 3dnow
bogomips : 3198.15

[phuse@buffy:~/files]$ gcc athlon.c -o athlon-mem ; ./athlon-mem ;
./athlon-mem ; ./athlon-mem
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 21174 cycles per page
copy_page function '2.4 non MMX' took 23369 cycles per page
copy_page function '2.4 MMX fallback' took 23134 cycles per page
copy_page function '2.4 MMX version' took 20586 cycles per page
copy_page function 'faster_copy' took 12297 cycles per page
copy_page function 'even_faster' took 11697 cycles per page
copy_page function 'no_prefetch' took 8664 cycles per page
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 20611 cycles per page
copy_page function '2.4 non MMX' took 23508 cycles per page
copy_page function '2.4 MMX fallback' took 23396 cycles per page
copy_page function '2.4 MMX version' took 20350 cycles per page
copy_page function 'faster_copy' took 12199 cycles per page
copy_page function 'even_faster' took 11443 cycles per page
copy_page function 'no_prefetch' took 8739 cycles per page
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 20362 cycles per page
copy_page function '2.4 non MMX' took 24304 cycles per page
copy_page function '2.4 MMX fallback' took 23258 cycles per page
copy_page function '2.4 MMX version' took 20307 cycles per page
copy_page function 'faster_copy' took 11379 cycles per page
copy_page function 'even_faster' took 11388 cycles per page
copy_page function 'no_prefetch' took 8800 cycles per page


--
Regards / Med vänlig hälsning:
Marcus Libäck <[email protected]>

2002-10-24 19:09:07

by Florin Iucha

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

SIS735 (ECS7S5A mobo)
Duron 1200 MHz
512 MB PC100

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 24171 cycles per page
copy_page function '2.4 non MMX' took 25359 cycles per page
copy_page function '2.4 MMX fallback' took 25224 cycles per page
copy_page function '2.4 MMX version' took 24149 cycles per page
copy_page function 'faster_copy' took 15660 cycles per page
copy_page function 'even_faster' took 15540 cycles per page
copy_page function 'no_prefetch' took 13853 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 24325 cycles per page
copy_page function '2.4 non MMX' took 25414 cycles per page
copy_page function '2.4 MMX fallback' took 25317 cycles per page
copy_page function '2.4 MMX version' took 24345 cycles per page
copy_page function 'faster_copy' took 15718 cycles per page
copy_page function 'even_faster' took 15553 cycles per page
copy_page function 'no_prefetch' took 13855 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 24225 cycles per page
copy_page function '2.4 non MMX' took 25430 cycles per page
copy_page function '2.4 MMX fallback' took 25398 cycles per page
copy_page function '2.4 MMX version' took 24233 cycles per page
copy_page function 'faster_copy' took 15737 cycles per page
copy_page function 'even_faster' took 15584 cycles per page
copy_page function 'no_prefetch' took 13855 cycles per page

--

"If it's not broken, let's fix it till it is."

41A9 2BDE 8E11 F1C5 87A6 03EE 34B3 E075 3B90 DFE4


Attachments:
(No filename) (1.81 kB)
(No filename) (189.00 B)
Download all attachments

2002-10-24 19:13:59

by Brian Gerst

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

Manfred Spraul wrote:
> AMD recommends to perform memory copies with backward read operations
> instead of prefetch.
>
> http://208.15.46.63/events/gdc2002.htm
>
> Attached is a test app that compares several memory copy implementations.
> Could you run it and report the results to me, together with cpu,
> chipset and memory type?
>
> Please run 2 or 3 times.
>

Athlon XP 1600+ (1400 MHz)
512 MB PC-133 memory
Host bridge: VIA Technologies, Inc. VT8363/8365 [KT133/KM133] (rev 03)

processor : 0
vendor_id : AuthenticAMD
cpu family : 6
model : 6
model name : AMD Athlon(tm) Processor
stepping : 2
cpu MHz : 1410.668
cache size : 256 KB

copy_page() tests
copy_page function 'warm up run' took 21428 cycles per page
copy_page function '2.4 non MMX' took 22404 cycles per page
copy_page function '2.4 MMX fallback' took 22426 cycles per page
copy_page function '2.4 MMX version' took 21472 cycles per page
copy_page function 'faster_copy' took 13618 cycles per page
copy_page function 'even_faster' took 13284 cycles per page
copy_page function 'no_prefetch' took 11943 cycles per page

copy_page() tests
copy_page function 'warm up run' took 21640 cycles per page
copy_page function '2.4 non MMX' took 22865 cycles per page
copy_page function '2.4 MMX fallback' took 22843 cycles per page
copy_page function '2.4 MMX version' took 21597 cycles per page
copy_page function 'faster_copy' took 13751 cycles per page
copy_page function 'even_faster' took 13407 cycles per page
copy_page function 'no_prefetch' took 11952 cycles per page

copy_page() tests
copy_page function 'warm up run' took 21681 cycles per page
copy_page function '2.4 non MMX' took 22900 cycles per page
copy_page function '2.4 MMX fallback' took 22999 cycles per page
copy_page function '2.4 MMX version' took 21679 cycles per page
copy_page function 'faster_copy' took 13782 cycles per page
copy_page function 'even_faster' took 13481 cycles per page
copy_page function 'no_prefetch' took 11969 cycles per page



2002-10-24 19:22:28

by Manfred Spraul

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

/*

(C) 2000 Arjan van de Ven and others licensed under the terms of the GPL


$Revision: 1.6 $
*/

static char cvsid[] = "$Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $";
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * The 2.4 kernel one, adapted for userspace.
 *
 * Zeroes one 4096-byte page with ordinary MMX stores (movq). %mm0 is
 * cleared once; each of the 4096/128 loop iterations then writes it to
 * sixteen consecutive 8-byte slots (128 bytes per iteration).
 *
 * The x87/MMX state is saved with fsave before %mm0 is touched and is
 * reloaded with frstor at the end, after femms has left MMX state.
 *
 * page: start of the 4096-byte region to clear.
 *
 * Note: femms is a 3DNow! instruction.
 */

static void fast_clear_page(void *page)
{
int i;
char fpu_save[108]; /* memory image written by fsave and read back by frstor */

__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );

/* %mm0 = 0; no later statement changes it, so every store below writes zero */
__asm__ __volatile__ (
" pxor %%mm0, %%mm0\n" : :
);

for(i=0;i<4096/128;i++)
{
__asm__ __volatile__ (
" movq %%mm0, (%0)\n"
" movq %%mm0, 8(%0)\n"
" movq %%mm0, 16(%0)\n"
" movq %%mm0, 24(%0)\n"
" movq %%mm0, 32(%0)\n"
" movq %%mm0, 40(%0)\n"
" movq %%mm0, 48(%0)\n"
" movq %%mm0, 56(%0)\n"
" movq %%mm0, 64(%0)\n"
" movq %%mm0, 72(%0)\n"
" movq %%mm0, 80(%0)\n"
" movq %%mm0, 88(%0)\n"
" movq %%mm0, 96(%0)\n"
" movq %%mm0, 104(%0)\n"
" movq %%mm0, 112(%0)\n"
" movq %%mm0, 120(%0)\n"
: : "r" (page) : "memory");
page+=128; /* arithmetic on void * (GNU C extension): advances by 128 bytes */
}
__asm__ __volatile__ (
" femms\n" : :
);
__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );

}

/*
 * Modified version for Athlon-family processors.
 *
 * Zeroes one 4096-byte page with non-temporal MMX stores (movntq),
 * 64 bytes per loop iteration, and issues sfence once all stores have
 * been queued. The x87/MMX state is saved with fsave on entry and
 * reloaded with frstor on exit.
 *
 * page: start of the 4096-byte region to clear.
 */
static void faster_clear_page(void *page)
{
int i;
char fpu_save[108]; /* memory image written by fsave and read back by frstor */

__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
/* %mm0 = 0; it is the only value stored by the loop below */
__asm__ __volatile__ (
" pxor %%mm0, %%mm0\n" : :
);

for(i=0;i<4096/64;i++)
{
__asm__ __volatile__ (
" movntq %%mm0, (%0)\n"
" movntq %%mm0, 8(%0)\n"
" movntq %%mm0, 16(%0)\n"
" movntq %%mm0, 24(%0)\n"
" movntq %%mm0, 32(%0)\n"
" movntq %%mm0, 40(%0)\n"
" movntq %%mm0, 48(%0)\n"
" movntq %%mm0, 56(%0)\n"
: : "r" (page) : "memory");
page+=64; /* arithmetic on void * (GNU C extension): advances by 64 bytes */
}
/* sfence orders the non-temporal stores; femms then leaves MMX state */
__asm__ __volatile__ (
" sfence \n "
" femms\n" : :
);
__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );

}

/* test version to go even faster... this might be the same as faster_
 * but serves as my playground.
 *
 * As written here the instruction sequence is the same as in
 * faster_clear_page(): fsave, clear %mm0, 4096/64 iterations of eight
 * movntq stores (64 bytes each), sfence, femms, frstor.
 *
 * page: start of the 4096-byte region to clear.
 */
static void even_faster_clear_page(void *page)
{
int i;
char fpu_save[108]; /* memory image written by fsave and read back by frstor */
__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );

/* %mm0 = 0; it is the only value stored by the loop below */
__asm__ __volatile__ (
" pxor %%mm0, %%mm0\n" : :
);

for(i=0;i<4096/64;i++)
{
__asm__ __volatile__ (
" movntq %%mm0, (%0)\n"
" movntq %%mm0, 8(%0)\n"
" movntq %%mm0, 16(%0)\n"
" movntq %%mm0, 24(%0)\n"
" movntq %%mm0, 32(%0)\n"
" movntq %%mm0, 40(%0)\n"
" movntq %%mm0, 48(%0)\n"
" movntq %%mm0, 56(%0)\n"
: : "r" (page) : "memory");
page+=64; /* arithmetic on void * (GNU C extension): advances by 64 bytes */
}
/* sfence orders the non-temporal stores; femms then leaves MMX state */
__asm__ __volatile__ (
" sfence \n "
" femms\n" : :
);
__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );

}

/* The "fallback" one as used by the kernel */
static void slow_zero_page(void * page)
{
int d0, d1;
__asm__ __volatile__( \
"cld\n\t" \
"rep ; stosl" \
: "=&c" (d0), "=&D" (d1)
:"a" (0),"1" (page),"0" (1024)
:"memory");
}

/*
 * Copies one page with "rep ; movsl": the count register is loaded with
 * 1024, the destination register with `to` and the source register with
 * `from`, so 1024 32-bit words (4096 bytes) are copied in ascending
 * order (cld).
 *
 * to:   destination page.
 * from: source page.
 */
static void slow_copy_page(void *to, void *from)
{
/* dummy outputs bound to the count ("c"), destination ("D") and source
 * ("S") registers, so the compiler knows the instruction modifies them */
int d0, d1, d2;
__asm__ __volatile__( \
"cld\n\t" \
"rep ; movsl" \
: "=&c" (d0), "=&D" (d1), "=&S" (d2) \
: "0" (1024),"1" ((long) to),"2" ((long) from) \
: "memory");
}


/*
 * 2.4 kernel MMX copy_page function, adapted for userspace.
 *
 * Copies one 4096-byte page from `from` to `to` with ordinary MMX loads
 * and stores (movq), 64 bytes per loop iteration. The first 320 bytes of
 * the source are prefetched up front, and every iteration prefetches the
 * line 320 bytes ahead of the current read position.
 *
 * The caller's x87/MMX state is saved with fsave on entry and put back
 * with frstor on exit. (The previous code issued a second fsave at the
 * end instead of frstor, which never restored the saved state and left
 * the FPU reinitialised for the caller.)
 *
 * to:   destination page.
 * from: source page.
 *
 * Note: prefetch and femms are 3DNow! instructions.
 */
static void fast_copy_page(void *to, void *from)
{
	int i;
	char fpu_save[108];	/* memory image written by fsave, read by frstor */

	/*
	 * fsave writes the whole buffer although only its first byte is named
	 * as an operand; the "memory" clobber tells the compiler about that.
	 */
	__asm__ __volatile__ ( " fsave %0; fwait\n" : : "m" (fpu_save[0]) : "memory" );

	__asm__ __volatile__ (
		" prefetch (%0)\n"
		" prefetch 64(%0)\n"
		" prefetch 128(%0)\n"
		" prefetch 192(%0)\n"
		" prefetch 256(%0)\n"
		: : "r" (from) );

	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
			" prefetch 320(%0)\n"
			" movq (%0), %%mm0\n"
			" movq 8(%0), %%mm1\n"
			" movq 16(%0), %%mm2\n"
			" movq 24(%0), %%mm3\n"
			" movq %%mm0, (%1)\n"
			" movq %%mm1, 8(%1)\n"
			" movq %%mm2, 16(%1)\n"
			" movq %%mm3, 24(%1)\n"
			" movq 32(%0), %%mm0\n"
			" movq 40(%0), %%mm1\n"
			" movq 48(%0), %%mm2\n"
			" movq 56(%0), %%mm3\n"
			" movq %%mm0, 32(%1)\n"
			" movq %%mm1, 40(%1)\n"
			" movq %%mm2, 48(%1)\n"
			" movq %%mm3, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");
		/* arithmetic on void * (GNU C extension): advances by 64 bytes */
		from += 64;
		to += 64;
	}

	/* leave MMX state before the x87 state is reloaded */
	__asm__ __volatile__ ( " femms\n" : : );

	/* restore what fsave stored on entry */
	__asm__ __volatile__ ( " frstor %0;\n" : : "m" (fpu_save[0]) : "memory" );
}


/* Athlon improved version */
static void faster_copy_page(void *to, void *from)
{
int i;
char fpu_save[108];

__asm__ __volatile__ (
"1: prefetchnta (%0)\n"
" prefetchnta 64(%0)\n"
" prefetchnta 128(%0)\n"
" prefetchnta 192(%0)\n"
: : "r" (from) );

__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );

for(i=0; i<4096/64; i++)
{
__asm__ __volatile__ (
"1: prefetchnta 320(%0)\n"
"2: movq (%0), %%mm0\n"
" movq 8(%0), %%mm1\n"
" movq 16(%0), %%mm2\n"
" movq 24(%0), %%mm3\n"
" movq 32(%0), %%mm4\n"
" movq 40(%0), %%mm5\n"
" movq 48(%0), %%mm6\n"
" movq 56(%0), %%mm7\n"
" movntq %%mm0, (%1)\n"
" movntq %%mm1, 8(%1)\n"
" movntq %%mm2, 16(%1)\n"
" movntq %%mm3, 24(%1)\n"
" movntq %%mm4, 32(%1)\n"
" movntq %%mm5, 40(%1)\n"
" movntq %%mm6, 48(%1)\n"
" movntq %%mm7, 56(%1)\n"
: : "r" (from), "r" (to) : "memory");
from+=64;
to+=64;
}
__asm__ __volatile__ (
" femms \n "
" sfence\n" : :
);
__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );

}

/* test version to go even faster... this might be the same as faster_
* but serves as my playground.
*/
static void even_faster_copy_page(void *to, void *from)
{
int i;
char fpu_save[108];

__asm__ __volatile__ (
"1: prefetchnta (%0)\n"
" prefetchnta 64(%0)\n"
" prefetchnta 128(%0)\n"
" prefetchnta 192(%0)\n"
: : "r" (from) );

__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );

for(i=0; i<4096/64; i++)
{
__asm__ __volatile__ (
" prefetchnta 256(%0)\n"
" movq (%0), %%mm0\n"
" movntq %%mm0, (%1)\n"
" movq 8(%0), %%mm1\n"
" movntq %%mm1, 8(%1)\n"
" movq 16(%0), %%mm2\n"
" movntq %%mm2, 16(%1)\n"
" movq 24(%0), %%mm3\n"
" movntq %%mm3, 24(%1)\n"
" movq 32(%0), %%mm4\n"
" movntq %%mm4, 32(%1)\n"
" movq 40(%0), %%mm5\n"
" movntq %%mm5, 40(%1)\n"
" movq 48(%0), %%mm6\n"
" movntq %%mm6, 48(%1)\n"
" movq 56(%0), %%mm7\n"
" movntq %%mm7, 56(%1)\n"
: : "r" (from), "r" (to) : "memory");
from+=64;
to+=64;
}
__asm__ __volatile__ (
" femms \n "
" sfence\n" : :
);
__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );

}


/*
 * This looks horribly ugly, but the compiler can optimize it totally,
 * as the count is constant.
 *
 * Copies n bytes from `from` to `to` and returns `to`.
 *
 * The sizes 0-4, 6, 8, 12, 16 and 20 are handled with direct integer
 * loads and stores. Every other size falls through to "rep ; movsl" for
 * n/4 32-bit words, followed by movsw and/or movsb for the n % 4
 * remaining bytes.
 *
 * NOTE(review): the fixed-size cases step in units of unsigned long and
 * only add up to exactly n bytes when unsigned long is 4 bytes wide;
 * this looks like 32-bit x86 code, confirm the build target.
 */
static inline void * __constant_memcpy(void * to, const void * from, size_t n)
{
switch (n) {
case 0:
return to;
case 1:
*(unsigned char *)to = *(const unsigned char *)from;
return to;
case 2:
*(unsigned short *)to = *(const unsigned short *)from;
return to;
case 3:
*(unsigned short *)to = *(const unsigned short *)from;
*(2+(unsigned char *)to) = *(2+(const unsigned char *)from);
return to;
case 4:
*(unsigned long *)to = *(const unsigned long *)from;
return to;
case 6: /* for Ethernet addresses */
*(unsigned long *)to = *(const unsigned long *)from;
*(2+(unsigned short *)to) = *(2+(const unsigned short *)from);
return to;
case 8:
*(unsigned long *)to = *(const unsigned long *)from;
*(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
return to;
case 12:
*(unsigned long *)to = *(const unsigned long *)from;
*(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
*(2+(unsigned long *)to) = *(2+(const unsigned long *)from);
return to;
case 16:
*(unsigned long *)to = *(const unsigned long *)from;
*(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
*(2+(unsigned long *)to) = *(2+(const unsigned long *)from);
*(3+(unsigned long *)to) = *(3+(const unsigned long *)from);
return to;
case 20:
*(unsigned long *)to = *(const unsigned long *)from;
*(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
*(2+(unsigned long *)to) = *(2+(const unsigned long *)from);
*(3+(unsigned long *)to) = *(3+(const unsigned long *)from);
*(4+(unsigned long *)to) = *(4+(const unsigned long *)from);
return to;
}
/*
 * COMMON(x): "rep ; movsl" with the count register preloaded with n/4,
 * followed by the extra instruction text x for the tail bytes. d0-d2 are
 * dummy outputs bound to the count, destination and source registers.
 * The macro body already ends in ';', so each use below expands to an
 * additional empty statement.
 */
#define COMMON(x) \
__asm__ __volatile__( \
"rep ; movsl" \
x \
: "=&c" (d0), "=&D" (d1), "=&S" (d2) \
: "0" (n/4),"1" ((long) to),"2" ((long) from) \
: "memory");
{
int d0, d1, d2;
/* pick the tail instructions from the number of bytes left after the words */
switch (n % 4) {
case 0: COMMON(""); return to;
case 1: COMMON("\n\tmovsb"); return to;
case 2: COMMON("\n\tmovsw"); return to;
default: COMMON("\n\tmovsw\n\tmovsb"); return to;
}
}

#undef COMMON
}


/*
 * Page-sized front end for __constant_memcpy(): copies 4096 bytes from
 * `from` to `to`. The pointer returned by the helper is discarded.
 */
static void normal_copy_page(void *to, void *from)
{
	enum { PAGE_BYTES = 4096 };

	(void) __constant_memcpy(to, from, PAGE_BYTES);
}


/*
 * This looks horribly ugly, but the compiler can optimize it totally,
 * as we by now know that both pattern and count is constant..
 *
 * Fills `count` bytes at `s` from `pattern` and returns `s`.
 *
 * Counts 0-4 are handled with direct integer stores of `pattern`
 * (converted to the stored type). Every other count falls through to
 * "rep ; stosl" for count/4 32-bit words, followed by stosw and/or stosb
 * for the count % 4 remaining bytes.
 *
 * `pattern` is stored as given; this function does not replicate a byte
 * value across the word.
 *
 * NOTE(review): case 4 stores one unsigned long, which is exactly 4
 * bytes only when unsigned long is 4 bytes wide; confirm the build
 * target (looks like 32-bit x86 code).
 */
static inline void * __constant_c_and_count_memset(void * s, unsigned long pattern, size_t count)
{
switch (count) {
case 0:
return s;
case 1:
*(unsigned char *)s = pattern;
return s;
case 2:
*(unsigned short *)s = pattern;
return s;
case 3:
*(unsigned short *)s = pattern;
*(2+(unsigned char *)s) = pattern;
return s;
case 4:
*(unsigned long *)s = pattern;
return s;
}
/*
 * COMMON(x): "rep ; stosl" with the accumulator holding `pattern` and the
 * count register preloaded with count/4, followed by the extra instruction
 * text x for the tail bytes. d0 and d1 are dummy outputs bound to the
 * count and destination registers.
 */
#define COMMON(x) \
__asm__ __volatile__( \
"rep ; stosl" \
x \
: "=&c" (d0), "=&D" (d1) \
: "a" (pattern),"0" (count/4),"1" ((long) s) \
: "memory")
{
int d0, d1;
/* pick the tail instructions from the number of bytes left after the words */
switch (count % 4) {
case 0: COMMON(""); return s;
case 1: COMMON("\n\tstosb"); return s;
case 2: COMMON("\n\tstosw"); return s;
default: COMMON("\n\tstosw\n\tstosb"); return s;
}
}

#undef COMMON
}

/*
 * Page-sized front end for __constant_c_and_count_memset(): fills the
 * 4096 bytes at `to` with the pattern 0. The pointer returned by the
 * helper is discarded.
 */
static void normal_clear_page(void *to)
{
	enum { PAGE_BYTES = 4096 };

	(void) __constant_c_and_count_memset(to, 0, PAGE_BYTES);
}

/*
 * Test version to see if we can go even faster.
 *
 * Copies one 4096-byte page from `from` to `to` without any prefetch
 * instruction, in two passes:
 *
 *  1. A read pass that loads one 32-bit word at every 64-byte offset of
 *     the source page, walking from offset 4096-64 down to offset 0. The
 *     loaded values land in a scratch register and are never used.
 *  2. The copy proper: 64 bytes per iteration, each 8-byte movq load
 *     followed immediately by its non-temporal movntq store.
 *
 * The x87/MMX state is saved with fsave between the two passes and
 * reloaded with frstor at the end, after sfence and emms.
 *
 * to:   destination page.
 * from: source page.
 */
static void no_prefetch_copy_page(void *to, void *from) {
int i, d1; /* d1: scratch output that receives (and discards) the loaded words */
char fpu_save[108]; /* memory image written by fsave and read back by frstor */

/* i is a byte offset into the source; four loads cover offsets i+192 .. i+0 */
for (i=4096-256;i>=0;i-=256)
__asm__ __volatile(
"movl 192(%1,%2),%0\n"
"movl 128(%1,%2),%0\n"
"movl 64(%1,%2),%0\n"
"movl 0(%1,%2),%0\n"
: "=&r" (d1)
: "r" (from), "r" (i));

__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );

for(i=0; i<4096/64; i++) {
__asm__ __volatile__ (
" movq (%0), %%mm0\n"
" movntq %%mm0, (%1)\n"
" movq 8(%0), %%mm1\n"
" movntq %%mm1, 8(%1)\n"
" movq 16(%0), %%mm2\n"
" movntq %%mm2, 16(%1)\n"
" movq 24(%0), %%mm3\n"
" movntq %%mm3, 24(%1)\n"
" movq 32(%0), %%mm4\n"
" movntq %%mm4, 32(%1)\n"
" movq 40(%0), %%mm5\n"
" movntq %%mm5, 40(%1)\n"
" movq 48(%0), %%mm6\n"
" movntq %%mm6, 48(%1)\n"
" movq 56(%0), %%mm7\n"
" movntq %%mm7, 56(%1)\n"
: : "r" (from), "r" (to) : "memory");
/* arithmetic on void * (GNU C extension): both advance by 64 bytes */
from+=64;
to+=64;
}
/* order the non-temporal stores, leave MMX state (emms, not the 3DNow!
 * femms used by the other routines), then reload the saved state */
__asm__ __volatile__ (
" sfence \n "
" emms\n"
" frstor %0;\n" ::"m"(fpu_save[0]) );
}


/*
 * Executes the rdtsc instruction and stores the value it leaves in the
 * "a" register into `low` and the value in the "d" register into `high`
 * (the low and high 32-bit halves of the time-stamp counter). Both
 * arguments must be lvalues.
 */
#define rdtsc(low,high) \
__asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))

/* Function type of a page-clear routine under test: takes the page to clear. */
typedef void (clear_func)(void *);
/* Function type of a page-copy routine under test: called as func(to, from). */
typedef void (copy_func)(void *,void *);

/*
 * Times one page-clear routine and prints its average cost.
 *
 * Calls `func` on 4*1024 consecutive 4096-byte pages starting at Buffer
 * (so Buffer must span at least 16 MB), reads the time-stamp counter
 * before and after, and prints the elapsed cycles divided by the number
 * of pages. Prints a "test invalid" line instead if the end reading is
 * lower than the start reading.
 *
 * func:   routine under test.
 * name:   label printed in the result line.
 * Buffer: start of the memory to clear.
 */
void test_one_clearpage(clear_func *func, char *name, char *Buffer)
{
	unsigned int start_lo, start_hi, end_lo, end_hi;
	unsigned long long before, after;
	char *page = Buffer;
	int i;

	rdtsc(start_lo, start_hi);
	for (i = 0; i < 4*1024; i++) {
		func(page);
		page += 4096;
	}
	rdtsc(end_lo, end_hi);

	/*
	 * Combine the halves in unsigned arithmetic: shifting a signed
	 * long long would overflow once the high word has bit 31 set.
	 */
	before = ((unsigned long long)start_hi << 32) | start_lo;
	after = ((unsigned long long)end_hi << 32) | end_lo;
	if (before > after) {
		printf("test invalid; timer overflow \n");
		return;
	}

	/* the quotient is unsigned long long, so it is printed with %llu */
	printf("clear_page function '%s'\t took %4llu cycles per page\n",
	       name, (after - before) / (4*1024));
}

/*
 * Times one page-copy routine and prints its average cost.
 *
 * After a one-second sleep, calls `func` for 2*1024 consecutive
 * 4096-byte pages: the destination walks forward from Buffer and the
 * source is always the page 8 MB above the destination (so Buffer must
 * span at least 16 MB). The time-stamp counter is read before and after,
 * and the elapsed cycles divided by the number of pages are printed.
 * Prints a "test invalid" line instead if the end reading is lower than
 * the start reading.
 *
 * func:   routine under test, called as func(to, from).
 * name:   label printed in the result line.
 * Buffer: start of the 16 MB working area.
 */
void test_one_copypage(copy_func *func, char *name, char *Buffer)
{
	unsigned int start_lo, start_hi, end_lo, end_hi;
	unsigned long long before, after;
	char *dst = Buffer;
	int i;

	sleep(1);
	rdtsc(start_lo, start_hi);
	for (i = 0; i < 2*1024; i++) {
		func(dst, dst + 8*1024*1024);
		dst += 4096;
	}
	rdtsc(end_lo, end_hi);

	/*
	 * Combine the halves in unsigned arithmetic: shifting a signed
	 * long long would overflow once the high word has bit 31 set.
	 */
	before = ((unsigned long long)start_hi << 32) | start_lo;
	after = ((unsigned long long)end_hi << 32) | end_lo;
	if (before > after) {
		printf("test invalid; timer overflow \n");
		return;
	}

	/* the quotient is unsigned long long, so it is printed with %llu */
	printf("copy_page function '%s'\t took %4llu cycles per page\n",
	       name, (after - before) / (2*1024));
}


void test_clearpage(char *Buffer)
{
printf("clear_page() tests \n");

test_one_clearpage(fast_clear_page,"warm up run",Buffer);
test_one_clearpage(normal_clear_page,"2.4 non MMX",Buffer);
test_one_clearpage(slow_zero_page,"2.4 MMX fallback",Buffer);
test_one_clearpage(fast_clear_page,"2.4 MMX version",Buffer);
test_one_clearpage(faster_clear_page,"faster_clear_page",Buffer);
test_one_clearpage(even_faster_clear_page,"even_faster_clear",Buffer);
}

void test_copypage(char *Buffer)
{
printf("copy_page() tests \n");

test_one_copypage(fast_copy_page, "warm up run",Buffer);
test_one_copypage(normal_copy_page,"2.4 non MMX",Buffer);
test_one_copypage(slow_copy_page, "2.4 MMX fallback",Buffer);
test_one_copypage(fast_copy_page, "2.4 MMX version",Buffer);
test_one_copypage(no_prefetch_copy_page,"no_prefetch",Buffer);
}

/*
 * Program entry point: allocate a 16 MB work buffer, fill it with 0xfe,
 * print the program banner and run the copy_page benchmarks.
 *
 * Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
 */
int main(void)
{
	enum { BUFFER_BYTES = 1024*1024*16 };
	char *Buffer;

	Buffer = malloc(BUFFER_BYTES);
	if (Buffer == NULL) {
		/* Without this check the memset below would write through NULL. */
		fprintf(stderr,"cannot allocate %d byte test buffer\n",(int)BUFFER_BYTES);
		return EXIT_FAILURE;
	}
	/* presumably written up front so every page is touched before timing -- TODO confirm */
	memset(Buffer,0xfe,BUFFER_BYTES);

	printf("Athlon test program %s \n",cvsid);

	printf("\n");
	test_copypage(Buffer);

	free(Buffer);

	return 0;
}


Attachments:
via.c (12.78 kB)

2002-10-24 19:31:20

by Dave Jones

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

On Thu, Oct 24, 2002 at 09:28:47PM +0200, Manfred Spraul wrote:
> Daniel Egger wrote:
>
> >Being interested in seeing how the Via Ezra system here performs I also
> >ran it there but experienced three segfaults in the last three tests;
> >two of which I can explain, but no_prefetch is a stranger right now.
> >Anyway:
> >
> It seems the via cpu doesn't support prefetchnta. Could you try the
> attached version?

More likely it's barfing on the movntq.
The VIA Ezra CPUs only have 3dnow.
Ezra-T has 3dnowext iirc.

Dave

--
| Dave Jones. http://www.codemonkey.org.uk

2002-10-24 19:25:53

by Matthias Schniedermeyer

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

On Thu, Oct 24, 2002 at 07:15:43PM +0200, Manfred Spraul wrote:
> AMD recommends to perform memory copies with backward read operations
> instead of prefetch.
>
> http://208.15.46.63/events/gdc2002.htm
>
> Attached is a test app that compares several memory copy implementations.
> Could you run it and report the results to me, together with cpu,
> chipset and memory type?
>
> Please run 2 or 3 times.

Ripping out the tests that didn't function on a P-III this is the result:

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
copy_page() tests
copy_page function '2.4 non MMX' took 15221 cycles per page
copy_page function '2.4 MMX fallback' took 15090 cycles per page
copy_page function 'no_prefetch' took 12531 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
copy_page() tests
copy_page function '2.4 non MMX' took 15053 cycles per page
copy_page function '2.4 MMX fallback' took 17020 cycles per page
copy_page function 'no_prefetch' took 11344 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
copy_page() tests
copy_page function '2.4 non MMX' took 14600 cycles per page
copy_page function '2.4 MMX fallback' took 16427 cycles per page
copy_page function 'no_prefetch' took 11822 cycles per page

System:
Dual-PIII
Serverworks HE-SL Chipset
3GB RAM. (2x512MB, 2x1GB (interleaved (AFAIK)))

cat /proc/cpuinfo
processor : 0
vendor_id : GenuineIntel
cpu family : 6
model : 8
model name : Pentium III (Coppermine)
stepping : 3
cpu MHz : 930.214
cache size : 256 KB
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 2
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 mmx fxsr sse
bogomips : 1854.66






Bis denn

--
Real Programmers consider "what you see is what you get" to be just as
bad a concept in Text Editors as it is in women. No, the Real Programmer
wants a "you asked for it, you got it" text editor -- complicated,
cryptic, powerful, unforgiving, dangerous.

2002-10-24 19:26:53

by Pascal Schmidt

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

On Thu, 24 Oct 2002 19:20:13 +0200, you wrote in linux.kernel:
> Attached is a test app that compares several memory copy implementations.
> Could you run it and report the results to me, together with cpu,
> chipset and memory type?

CPU: AMD Duron Spitfire 1 GHz
Chipset: ALi 1647 (Magik 1)
Memory: 2x 256 MB DDR-SDRAM at 200 MHz

> Please run 2 or 3 times.

CFLAGS="-march=athlon" make athlon

[pharao90@neptune (ttyp2) ~]$ ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 20604 cycles per page
copy_page function '2.4 non MMX' took 26798 cycles per page
copy_page function '2.4 MMX fallback' took 27143 cycles per page
copy_page function '2.4 MMX version' took 20835 cycles per page
copy_page function 'faster_copy' took 12324 cycles per page
copy_page function 'even_faster' took 12615 cycles per page
copy_page function 'no_prefetch' took 10842 cycles per page
[pharao90@neptune (ttyp2) ~]$ ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 25284 cycles per page
copy_page function '2.4 non MMX' took 31419 cycles per page
copy_page function '2.4 MMX fallback' took 30724 cycles per page
copy_page function '2.4 MMX version' took 25246 cycles per page
copy_page function 'faster_copy' took 15462 cycles per page
copy_page function 'even_faster' took 16504 cycles per page
copy_page function 'no_prefetch' took 10179 cycles per page
[pharao90@neptune (ttyp2) ~]$ ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 22943 cycles per page
copy_page function '2.4 non MMX' took 30414 cycles per page
copy_page function '2.4 MMX fallback' took 30703 cycles per page
copy_page function '2.4 MMX version' took 23345 cycles per page
copy_page function 'faster_copy' took 14878 cycles per page
copy_page function 'even_faster' took 14902 cycles per page
copy_page function 'no_prefetch' took 10872 cycles per page

Note that even_faster is always slower than faster_copy. ;)

--
Ciao,
Pascal

2002-10-24 19:32:18

by Manfred Spraul

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

[email protected] wrote:

>>copy_page() tests
>>copy_page function 'warm up run' took 18081 cycles per page
>>copy_page function '2.4 non MMX' took 19487 cycles per page
>>copy_page function '2.4 MMX fallback' took 19403 cycles per page
>>copy_page function '2.4 MMX version' took 18086 cycles per page
>>copy_page function 'faster_copy' took 11372 cycles per page
>>copy_page function 'even_faster' took 11183 cycles per page
>>copy_page function 'no_prefetch' took 7815 cycles per page
>>1020 [maw] (buruk) /tmp/athlon # athlon_test
>>
>>
>
>
>Whoa! Hmm.
>
>If I'm reading this right, with a processor speed of 1.666 GHz,
>you're getting:
>
> (4096 bytes / 7815 clocks) * 1.666 GHz = 873 MB/sec
>
>The perfect peak performance of your setup, if the cache implements
>standard write-allocate behavior (the target cache line is read before it
>is written because the write logic doesn't know you're going to overwrite
>the whole line in cases like this), should be:
>
>
There is no write allocate.

There are 2 optimizations for bulk memory copy:
- avoid the write allocate. Possible with the mmx or sse non-temporal
cache hints
* already in the kernel. Difference between MMX and faster_copy
- avoid dram page misses, and stream from the memory chips with maximum
efficiency.
* new optimization. "prefetch" is a hint for the cpu that the
program might need the memory
If I understand the AMD document correctly, then this is not
what's needed for bulk
memory copy: we know that we'll need that cacheline. Thus a real
read, to force the cpu to
fetch the cacheline, even if all read buffers are occupied.

--
Manfred


2002-10-24 19:33:14

by Olaf Dietsche

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

Manfred Spraul <[email protected]> writes:

> Attached is a test app that compares several memory copy implementations.
> Could you run it and report the results to me, together with cpu,
> chipset and memory type?

Duron 1200, Elitegroup K7VTA3, 512 MB (DDR PC266 CL2.5)

$ cat /proc/cpuinfo
processor : 0
vendor_id : AuthenticAMD
cpu family : 6
model : 7
model name : AMD Duron(tm) processor
stepping : 1
cpu MHz : 1200.023
cache size : 64 KB
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 mmx fxsr sse syscall mp mmxext 3dnowext 3dnow
bogomips : 2367.48

$ lspci
00:00.0 Host bridge: VIA Technologies, Inc. VT8367 [KT266]
00:01.0 PCI bridge: VIA Technologies, Inc. VT8367 [KT266 AGP]
00:11.0 ISA bridge: VIA Technologies, Inc. VT8233 PCI to ISA Bridge
00:11.1 IDE interface: VIA Technologies, Inc. Bus Master IDE (rev 06)

$ gcc -v
Reading specs from /usr/lib/gcc-lib/i386-linux/2.95.4/specs
gcc version 2.95.4 20011002 (Debian prerelease)

$ gcc athlon.c -o athlon
$ ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 15795 cycles per page
copy_page function '2.4 non MMX' took 16958 cycles per page
copy_page function '2.4 MMX fallback' took 16862 cycles per page
copy_page function '2.4 MMX version' took 15792 cycles per page
copy_page function 'faster_copy' took 9576 cycles per page
copy_page function 'even_faster' took 9385 cycles per page
copy_page function 'no_prefetch' took 7637 cycles per page
$ ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 15802 cycles per page
copy_page function '2.4 non MMX' took 16957 cycles per page
copy_page function '2.4 MMX fallback' took 16897 cycles per page
copy_page function '2.4 MMX version' took 15854 cycles per page
copy_page function 'faster_copy' took 9564 cycles per page
copy_page function 'even_faster' took 9407 cycles per page
copy_page function 'no_prefetch' took 7649 cycles per page
$ ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 15807 cycles per page
copy_page function '2.4 non MMX' took 16995 cycles per page
copy_page function '2.4 MMX fallback' took 17056 cycles per page
copy_page function '2.4 MMX version' took 16431 cycles per page
copy_page function 'faster_copy' took 9832 cycles per page
copy_page function 'even_faster' took 9389 cycles per page
copy_page function 'no_prefetch' took 7657 cycles per page

2002-10-24 19:37:31

by Ken Witherow

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

[ken@death ken]$ cat /proc/cpuinfo
processor : 0
vendor_id : AuthenticAMD
cpu family : 6
model : 6
model name : AMD Athlon(tm) MP 1800+
stepping : 2
cpu MHz : 1533.408
cache size : 256 KB
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
cmov pat pse36 mmx fxsr sse syscall mmxext 3dnowext 3dnow
bogomips : 3060.53

processor : 1
vendor_id : AuthenticAMD
cpu family : 6
model : 6
model name : AMD Athlon(tm) Processor
stepping : 2
cpu MHz : 1533.408
cache size : 256 KB
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
cmov pat pse36 mmx fxsr sse syscall mmxext 3dnowext 3dnow
bogomips : 3060.53


Running on a Tyan S2460 (760MP chipset)

[ken@death ken]$ ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 16742 cycles per page
copy_page function '2.4 non MMX' took 18632 cycles per page
copy_page function '2.4 MMX fallback' took 18948 cycles per page
copy_page function '2.4 MMX version' took 16772 cycles per page
copy_page function 'faster_copy' took 10157 cycles per page
copy_page function 'even_faster' took 10406 cycles per page
copy_page function 'no_prefetch' took 8865 cycles per page
[ken@death ken]$ ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 16804 cycles per page
copy_page function '2.4 non MMX' took 18712 cycles per page
copy_page function '2.4 MMX fallback' took 18630 cycles per page
copy_page function '2.4 MMX version' took 16810 cycles per page
copy_page function 'faster_copy' took 10211 cycles per page
copy_page function 'even_faster' took 10462 cycles per page
copy_page function 'no_prefetch' took 8858 cycles per page
[ken@death ken]$ ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 16877 cycles per page
copy_page function '2.4 non MMX' took 18692 cycles per page
copy_page function '2.4 MMX fallback' took 18557 cycles per page
copy_page function '2.4 MMX version' took 16763 cycles per page
copy_page function 'faster_copy' took 10206 cycles per page
copy_page function 'even_faster' took 10325 cycles per page
copy_page function 'no_prefetch' took 8892 cycles per page


--
Ken Witherow <phantoml AT rochester.rr.com>
ICQ: 21840670 AIM: phantomlordken
http://www.krwtech.com/ken


2002-10-24 19:54:11

by Panagiotis Papadakos

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation


ASUS K7V 512 MB PC-133 Athlon Slot-A 600 MHz

bash-2.05# cat /proc/cpuinfo
processor : 0
vendor_id : AuthenticAMD
cpu family : 6
model : 2
model name : AMD Athlon(tm) Processor
stepping : 1
cpu MHz : 618.008
cache size : 512 KB
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
cmov pat pse36 mmx fxsr syscall mmxext 3dnowext 3dnow
bogomips : 1232.07

bash-2.05# ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 10096 cycles per page
copy_page function '2.4 non MMX' took 14856 cycles per page
copy_page function '2.4 MMX fallback' took 14168 cycles per page
copy_page function '2.4 MMX version' took 10754 cycles per page
copy_page function 'faster_copy' took 5752 cycles per page
copy_page function 'even_faster' took 5694 cycles per page
copy_page function 'no_prefetch' took 6560 cycles per page

bash-2.05# ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 10851 cycles per page
copy_page function '2.4 non MMX' took 14726 cycles per page
copy_page function '2.4 MMX fallback' took 14390 cycles per page
copy_page function '2.4 MMX version' took 11390 cycles per page
copy_page function 'faster_copy' took 5490 cycles per page
copy_page function 'even_faster' took 5655 cycles per page
copy_page function 'no_prefetch' took 5906 cycles per page


bash-2.05# ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 9819 cycles per page
copy_page function '2.4 non MMX' took 14897 cycles per page
copy_page function '2.4 MMX fallback' took 15609 cycles per page
copy_page function '2.4 MMX version' took 10374 cycles per page
copy_page function 'faster_copy' took 5759 cycles per page
copy_page function 'even_faster' took 5609 cycles per page
copy_page function 'no_prefetch' took 6059 cycles per page


2002-10-24 20:07:46

by Robert Love

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

On Thu, 2002-10-24 at 16:09, Ed Sweetman wrote:

> I seem to be seeing compiler optimizations come into play with the
> numbers and not any mention of them that i've seen has been talked
> about. That could be causing any discrepancies with predicted values. So
> not only would we have to look at algorithms, but also the compilers and
> what optimizations we plan on using them with. Some do better on
> certain compilers+flags than others. It's a mixmatch that seems to only
> get complicated the more realistic you make it.

The majority of the program is inline assembly so I do not think
compiler is playing a huge role here.

Regardless, the numbers are all pretty uniform in saying the new no
prefetch method is superior so it's a moot point.

Robert Love

2002-10-24 20:03:14

by Ed Sweetman

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation


I seem to be seeing compiler optimizations come into play with the
numbers and not any mention of them that i've seen has been talked
about. That could be causing any discrepancies with predicted values. So
not only would we have to look at algorithms, but also the compilers and
what optimizations we plan on using them with. Some do better on
certain compilers+flags than others. It's a mixmatch that seems to only
get complicated the more realistic you make it.



athlon tbird 1133.402Mhz, 133Mhz fsb, 512MB + 128MB pc133sdram No hd

Here are 3 averages of 3 runs each

gcc (GCC) 3.2.1 20021020 (Debian prerelease)

flags : -O3 -march=athlon-tbird -mcpu=athlon-tbird -falign-loops=4

copy_page function 'warm up run' took 17577 cycles per page
copy_page function '2.4 non MMX' took 23659 cycles per page
copy_page function '2.4 MMX fallback' took 23894 cycles per page
copy_page function '2.4 MMX version' took 17549 cycles per page
copy_page function 'faster_copy' took 10452 cycles per page
copy_page function 'even_faster' took 10159 cycles per page
copy_page function 'no_prefetch' took 9508 cycles per page

flags : -O0 -march=i686 -mcpu=i686

copy_page function 'warm up run' took 18377 cycles per page
copy_page function '2.4 non MMX' took 23688 cycles per page
copy_page function '2.4 MMX fallback' took 23671 cycles per page
copy_page function '2.4 MMX version' took 18407 cycles per page
copy_page function 'faster_copy' took 10091 cycles per page
copy_page function 'even_faster' took 10283 cycles per page
copy_page function 'no_prefetch' took 9907 cycles per page


gcc 2.95.4

flags : -O0 -march=i686 -mcpu=i686

copy_page function 'warm up run' took 18343 cycles per page
copy_page function '2.4 non MMX' took 23655 cycles per page
copy_page function '2.4 MMX fallback' took 23646 cycles per page
copy_page function '2.4 MMX version' took 18324 cycles per page
copy_page function 'faster_copy' took 10146 cycles per page
copy_page function 'even_faster' took 10438 cycles per page
copy_page function 'no_prefetch' took 9913 cycles per page


----------------------------------------------------------------------


avg difference due to compiler
copy_page function 'warm up run' took +-533 cycles per page
copy_page function '2.4 non MMX' took +-22 cycles per page
copy_page function '2.4 MMX fallback' took +-496 cycles per page
copy_page function '2.4 MMX version' took +-572 cycles per page
copy_page function 'faster_copy' took +-241 cycles per page
copy_page function 'even_faster' took +-186 cycles per page

Other options may give even greater differences but it's difficult to
try them all so I thought this should give an example of proof. The
options did not do better than all of the runs of another option but
instead did better and worse depending on the test.

copy_page function 'warm up run' test1
copy_page function '2.4 non MMX' test3
copy_page function '2.4 MMX fallback' test3
copy_page function '2.4 MMX version' test1
copy_page function 'faster_copy' test2
copy_page function 'even_faster' test1
copy_page function 'no_prefetch' test1


bandwidth test1 = 488.3MB/sec (not ddr like other setups)
bandwidth test2 = 468.6MB/sec
bandwidth test3 = 468.3MB/sec



2002-10-24 20:18:48

by Dave Jones

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

On Thu, Oct 24, 2002 at 04:09:41PM -0400, Ed Sweetman wrote:
>
> I seem to be seeing compiler optimizations come into play with the
> numbers and not any mention of them that i've seen has been talked
> about. That could be causing any discrepancies with predicted values. So
> not only would we have to look at algorithms, but also the compilers and
> what optimizations we plan on using them with. Some do better on
> certain compilers+flags than others. It's a mixmatch that seems to only
> get complicated the more realistic you make it.

The functions being benchmarked are written in assembly.
gcc will not change these in any way, making compiler flags
or revision irrelevant.

Dave

--
| Dave Jones. http://www.codemonkey.org.uk

2002-10-24 20:25:02

by Ed Sweetman

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

Robert Love wrote:
> On Thu, 2002-10-24 at 16:09, Ed Sweetman wrote:
>
>
>>I seem to be seeing compiler optimizations come into play with the
>>numbers and not any mention of them that i've seen has been talked
>>about. That could be causing any discrepancies with predicted values. So
>>not only would we have to look at algorithms, but also the compilers and
>>what optimizations we plan on using them with. Some do better on
>>certain compilers+flags than others. It's a mixmatch that seems to only
>>get complicated the more realistic you make it.
>
>
> The majority of the program is inline assembly so I do not think
> compiler is playing a huge role here.
>
> Regardless, the numbers are all pretty uniform in saying the new no
> prefetch method is superior so it's a moot point.
>
> Robert Love

With gcc 3.x i get

495MB/s with -O3 -march=athlon-tbird -mcpu=athlon-tbird -falign-loops=4
-falign-functions=4

488MB/s with -O3 -march=athlon-tbird -mcpu=athlon-tbird -falign-loops=4

467MB/s with -O0 -march=i686 -mcpu=i686

which is almost a 30MB/s difference or 6% simply from compiler options
of the same compiler. It may not mean much in 1 second. But few things
where we care about performance are only run for one second.

I'd expect something below 3% and realistically closer to 1%. Any ideas
as to why it is making a difference? Does the execution path to the
function in C really take up performance to drop 30MB/s of memory
bandwidth because from the looks of it this program is very small and
things should be really quick to the asm functions.

2002-10-24 20:21:22

by Mike Civil

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

ABIT KT7A 896M PC133

processor : 0
vendor_id : AuthenticAMD
cpu family : 6
model : 4
model name : AMD Athlon(tm) processor
stepping : 4
cpu MHz : 1333.416
cache size : 256 KB
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 mmx fxsr syscall mmxext 3dnowext 3dnow
bogomips : 2660.76


Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 23577 cycles per page
copy_page function '2.4 non MMX' took 30797 cycles per page
copy_page function '2.4 MMX fallback' took 30748 cycles per page
copy_page function '2.4 MMX version' took 23793 cycles per page
copy_page function 'faster_copy' took 13461 cycles per page
copy_page function 'even_faster' took 12599 cycles per page
copy_page function 'no_prefetch' took 11218 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 23122 cycles per page
copy_page function '2.4 non MMX' took 30279 cycles per page
copy_page function '2.4 MMX fallback' took 30452 cycles per page
copy_page function '2.4 MMX version' took 23152 cycles per page
copy_page function 'faster_copy' took 13367 cycles per page
copy_page function 'even_faster' took 12482 cycles per page
copy_page function 'no_prefetch' took 11146 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 22860 cycles per page
copy_page function '2.4 non MMX' took 30052 cycles per page
copy_page function '2.4 MMX fallback' took 30070 cycles per page
copy_page function '2.4 MMX version' took 22815 cycles per page
copy_page function 'faster_copy' took 13245 cycles per page
copy_page function 'even_faster' took 12393 cycles per page
copy_page function 'no_prefetch' took 11200 cycles per page

Mike

2002-10-24 20:45:26

by Dieter Nützel

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

Robert Love wrote:
> The majority of the program is inline assembly so I do not think
> compiler is playing a huge role here.

I think they are...

> Regardless, the numbers are all pretty uniform in saying the new no
> prefetch method is superior so it's a moot point.

But all "your" numbers are slow.
Look at mine with the "right" (TM) flags ;-)

processor : 0
vendor_id : AuthenticAMD
cpu family : 6
model : 6
model name : AMD Athlon(tm) MP 1900+
stepping : 2
cpu MHz : 1600.377
cache size : 256 KB
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
cmov pat pse36 mmx fxsr sse syscall mp mmxext 3dnowext 3dnow
bogomips : 3145.72

processor : 1
vendor_id : AuthenticAMD
cpu family : 6
model : 6
model name : AMD Athlon(tm) MP
stepping : 2
cpu MHz : 1600.377
cache size : 256 KB
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
cmov pat pse36 mmx fxsr sse syscall mp mmxext 3dnowext 3dnow
bogomips : 3194.88


SuSE Linux 7.3

glibc-2.2.4
Addons: db db2 linuxthreads noversion
Build CFLAGS: -O -mcpu=k6 -mpreferred-stack-boundary=2 -malign-functions=4
-fschedule-insns2 -fexpensive-optimizations -g
Build CC: gcc
Compiler version: 2.95.3 20010315 (SuSE)

Linux 2.5.43-mm2
Kernel compiler FLAGS
HOSTCC = gcc
HOSTCFLAGS = -Wall -Wstrict-prototypes -O -fomit-frame-pointer -mcpu=k6
-mpreferred-stack-boundary=2 -malign-functions=4 -fschedule-insns2
-fexpensive-optimizations

YES, I only use "-mcpu=k6" and "-O" for ages (since 26. August 1999 ;-) on my
Athlons.

nuetzel/Entwicklung> ./athlon ; ./athlon ; ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
clear_page() tests
clear_page function 'warm up run' took 17409 cycles per page
clear_page function '2.4 non MMX' took 12340 cycles per page
clear_page function '2.4 MMX fallback' took 12429 cycles per page
clear_page function '2.4 MMX version' took 9794 cycles per page
clear_page function 'faster_clear_page' took 4639 cycles per page
clear_page function 'even_faster_clear' took 4914 cycles per page

copy_page() tests
copy_page function 'warm up run' took 16506 cycles per page
copy_page function '2.4 non MMX' took 18412 cycles per page
copy_page function '2.4 MMX fallback' took 18468 cycles per page
copy_page function '2.4 MMX version' took 16550 cycles per page
copy_page function 'faster_copy' took 10239 cycles per page
copy_page function 'even_faster' took 10816 cycles per page


Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
clear_page() tests
clear_page function 'warm up run' took 17148 cycles per page
clear_page function '2.4 non MMX' took 12426 cycles per page
clear_page function '2.4 MMX fallback' took 12330 cycles per page
clear_page function '2.4 MMX version' took 9776 cycles per page
clear_page function 'faster_clear_page' took 4619 cycles per page
clear_page function 'even_faster_clear' took 4938 cycles per page

copy_page() tests
copy_page function 'warm up run' took 16640 cycles per page
copy_page function '2.4 non MMX' took 18434 cycles per page
copy_page function '2.4 MMX fallback' took 18454 cycles per page
copy_page function '2.4 MMX version' took 16533 cycles per page
copy_page function 'faster_copy' took 10418 cycles per page
copy_page function 'even_faster' took 10707 cycles per page


Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
clear_page() tests
clear_page function 'warm up run' took 17475 cycles per page
clear_page function '2.4 non MMX' took 12435 cycles per page
clear_page function '2.4 MMX fallback' took 12379 cycles per page
clear_page function '2.4 MMX version' took 9902 cycles per page
clear_page function 'faster_clear_page' took 4665 cycles per page
clear_page function 'even_faster_clear' took 4947 cycles per page

copy_page() tests
copy_page function 'warm up run' took 16606 cycles per page
copy_page function '2.4 non MMX' took 18439 cycles per page
copy_page function '2.4 MMX fallback' took 18676 cycles per page
copy_page function '2.4 MMX version' took 16560 cycles per page
copy_page function 'faster_copy' took 10239 cycles per page
copy_page function 'even_faster' took 10728 cycles per page

nuetzel/Entwicklung> ./athlon2 ; ./athlon2 ; ./athlon2
1600.061 MHz
clear_page by 'normal_clear_page' took 12463 cycles (501.5 MB/s)
clear_page by 'slow_zero_page' took 12461 cycles (501.6 MB/s)
clear_page by 'fast_clear_page' took 9555 cycles (654.1 MB/s)
clear_page by 'faster_clear_page' took 4436 cycles (1408.7 MB/s)

copy_page by 'normal_copy_page' took 8992 cycles (695.0 MB/s)
copy_page by 'slow_copy_page' took 9010 cycles (693.7 MB/s)
copy_page by 'fast_copy_page' took 8134 cycles (768.3 MB/s)
copy_page by 'faster_copy' took 5546 cycles (1126.8 MB/s)
copy_page by 'even_faster' took 5616 cycles (1112.9 MB/s)


1600.057 MHz
clear_page by 'normal_clear_page' took 12555 cycles (497.8 MB/s)
clear_page by 'slow_zero_page' took 12740 cycles (490.6 MB/s)
clear_page by 'fast_clear_page' took 9783 cycles (638.8 MB/s)
clear_page by 'faster_clear_page' took 4459 cycles (1401.4 MB/s)

copy_page by 'normal_copy_page' took 9123 cycles (685.0 MB/s)
copy_page by 'slow_copy_page' took 9080 cycles (688.3 MB/s)
copy_page by 'fast_copy_page' took 8232 cycles (759.3 MB/s)
copy_page by 'faster_copy' took 5535 cycles (1129.1 MB/s)
copy_page by 'even_faster' took 5565 cycles (1123.1 MB/s)


1600.060 MHz
clear_page by 'normal_clear_page' took 12625 cycles (495.1 MB/s)
clear_page by 'slow_zero_page' took 12541 cycles (498.3 MB/s)
clear_page by 'fast_clear_page' took 9648 cycles (647.8 MB/s)
clear_page by 'faster_clear_page' took 4463 cycles (1400.2 MB/s)

copy_page by 'normal_copy_page' took 9178 cycles (680.9 MB/s)
copy_page by 'slow_copy_page' took 9011 cycles (693.6 MB/s)
copy_page by 'fast_copy_page' took 8138 cycles (768.0 MB/s)
copy_page by 'faster_copy' took 5508 cycles (1134.7 MB/s)
copy_page by 'even_faster' took 5552 cycles (1125.6 MB/s)

Regards,
Dieter
--
Dieter Nützel
Graduate Student, Computer Science

University of Hamburg
Department of Computer Science
@home: Dieter.Nuetzel at hamburg.de (replace at with @)


Attachments:
(No filename) (6.74 kB)
athlon2.c.bz2 (2.84 kB)
Download all attachments

2002-10-24 20:41:25

by Dave Jones

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

On Thu, Oct 24, 2002 at 04:31:31PM -0400, Ed Sweetman wrote:
> which is almost a 30MB/s difference or 6% simply from compiler options
> of the same compiler. It may not mean much in 1 second. But few things
> where we care about performance are only run for one second.

Looking at the assembly output of both optimised and unoptimised, we
see quite startling differences in the way the loops are done..
The unoptimised case..

movl $0, -12(%ebp)
.L75:
cmpl $63, -12(%ebp)
jle .L78
jmp .L76

...
movntq/movq inline asm bits
...
leal 12(%ebp), %eax
addl $64, (%eax)
addl $64, 8(%ebp)
leal -12(%ebp), %eax
incl (%eax)
jmp .L75

Note it uses -12(%ebp) to keep track of how much it's copied.
The optimised version is much more sensible..

movl $63, %ebx
.p2align 2
.L98:
...
movntq/movq inline asm bits
...
addl $64, %ecx
addl $64, %edx
decl %ebx
jns .L98

Keeping track of the count in a register, no indirect memory references,
leaving the only memory references to be the actual memory copies, which
let it achieve the full bandwidth of the memory bus.

Quite surprising. I doubt going over the top with CFLAGS buys you much.
The above optimisation comes in with just -O2.

Dave

--
| Dave Jones. http://www.codemonkey.org.uk

2002-10-24 20:38:06

by Willy Tarreau

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

On Thu, Oct 24, 2002 at 07:15:43PM +0200, Manfred Spraul wrote:
> AMD recommends to perform memory copies with backward read operations
> instead of prefetch.
>
> http://208.15.46.63/events/gdc2002.htm
>
> Attached is a test app that compares several memory copy implementations.
> Could you run it and report the results to me, together with cpu,
> chipset and memory type?
>
> Please run 2 or 3 times.

Dual Athlon XP 1800+ on ASUS A7M266-D (760MPX), 512 MB of PC2100 in two identical banks.
I observed a noticeable slowdown several minutes later (after typing this mail),
see below.

willy@pcw:c$ ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 16402 cycles per page
copy_page function '2.4 non MMX' took 17886 cycles per page
copy_page function '2.4 MMX fallback' took 17956 cycles per page
copy_page function '2.4 MMX version' took 16382 cycles per page
copy_page function 'faster_copy' took 9807 cycles per page
copy_page function 'even_faster' took 10205 cycles per page
copy_page function 'no_prefetch' took 8457 cycles per page
willy@pcw:c$ ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 16552 cycles per page
copy_page function '2.4 non MMX' took 17744 cycles per page
copy_page function '2.4 MMX fallback' took 17713 cycles per page
copy_page function '2.4 MMX version' took 16427 cycles per page
copy_page function 'faster_copy' took 9823 cycles per page
copy_page function 'even_faster' took 10266 cycles per page
copy_page function 'no_prefetch' took 8451 cycles per page
willy@pcw:c$ ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 16409 cycles per page
copy_page function '2.4 non MMX' took 17547 cycles per page
copy_page function '2.4 MMX fallback' took 17516 cycles per page
copy_page function '2.4 MMX version' took 16354 cycles per page
copy_page function 'faster_copy' took 9807 cycles per page
copy_page function 'even_faster' took 10219 cycles per page
copy_page function 'no_prefetch' took 8442 cycles per page

--- several minutes later ---

willy@pcw:c$ ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 18140 cycles per page
copy_page function '2.4 non MMX' took 20370 cycles per page
copy_page function '2.4 MMX fallback' took 20361 cycles per page
copy_page function '2.4 MMX version' took 18086 cycles per page
copy_page function 'faster_copy' took 10231 cycles per page
copy_page function 'even_faster' took 10457 cycles per page
copy_page function 'no_prefetch' took 8456 cycles per page

=> it seems that the memory areas have changed and that it is a bit
slower now. But as you can see, no_prefetch is stable. Only "common"
functions get slower.

So I tried to allocate hundreds of MB of RAM to swap a bit, then free it.
The results look better again :

willy@pcw:c$ ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 16135 cycles per page
copy_page function '2.4 non MMX' took 17863 cycles per page
copy_page function '2.4 MMX fallback' took 17866 cycles per page
copy_page function '2.4 MMX version' took 16057 cycles per page
copy_page function 'faster_copy' took 9669 cycles per page
copy_page function 'even_faster' took 10176 cycles per page
copy_page function 'no_prefetch' took 8433 cycles per page

=> "common" implementations seem to really suffer from physical location.

Other data :
------------

willy@pcw:c$ cat /proc/pci
Bus 0, device 0, function 0:
Host bridge: Advanced Micro Devices [AMD] AMD-760 MP [IGD4-2P] System Controller (rev 17).
Master Capable. Latency=32.
Prefetchable 32 bit memory at 0xfc000000 [0xfdffffff].
Prefetchable 32 bit memory at 0xfb800000 [0xfb800fff].
I/O at 0xe800 [0xe803].

willy@pcw:c$ cat /proc/cpuinfo
processor : 0
vendor_id : AuthenticAMD
cpu family : 6
model : 6
model name : AMD Athlon(TM) MP 1800+
stepping : 2
cpu MHz : 1546.000
cache size : 256 KB
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 mmx fxsr sse syscall mmxext 3dnowext 3dnow
bogomips : 3080.19

processor : 1
vendor_id : AuthenticAMD
cpu family : 6
model : 6
model name : AMD Athlon(TM) MP 1800+
stepping : 2
cpu MHz : 1546.000
cache size : 256 KB
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 mmx fxsr sse syscall mmxext 3dnowext 3dnow
bogomips : 3086.74


Cheers,
Willy

2002-10-24 20:55:45

by Dieter Nützel

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

Am Donnerstag, 24. Oktober 2002 22:51 schrieb Dieter Nützel:
> Robert Love wrote:
> > The majority of the program is inline assembly so I do not think
> > compiler is playing a huge role here.
>
> I think they are...
>
> > Regardless, the numbers are all pretty uniform in saying the new no
> prefetch method is superior so it's a moot point.
>
> But all "your" numbers are slow.
> Look at mine with the "right" (TM) flags ;-)
>
> processor : 0
> vendor_id : AuthenticAMD
> cpu family : 6
> model : 6
> model name : AMD Athlon(tm) MP 1900+

Oops, lost something during cut'n paste:

dual Athlon MP 1900+
MSI MS-6501 Rev 1.0 (aka K7D Master-L), AMD 760MPX, BIOS 1.5
2x 512MB DDR266, CL2, unregistered, NO ECC (stinky "normal" stuff ;-)

Cheers,
Dieter

2002-10-24 21:10:13

by Willy Tarreau

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

On Thu, Oct 24, 2002 at 10:51:26PM +0200, Dieter Nützel wrote:
> copy_page by 'even_faster' took 5616 cycles (1112.9 MB/s)

something bothers me here : with PC2100 RAM, you copy 1113 MB/s, that
is 1113 MB in + 1113 MB out !

I tried your code and got somewhat the same results (dual xp1800+, pc2100, 760MPX).
but I pasted the no_prefetch_copy_page() function into it and now it says that I
copy 1455 MB/s ! I didn't look deep through the code, but I suspect that there's
some static work that is not accounted, or a subtract between two counters, or
something like that.

Cheers,
Willy

2002-10-24 21:40:54

by Josh Fryman

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation


several reports herein. first, machine specs. then, multiple compiler
outputs with different compiler versions. no real substantial variation
regardless of flags for best-case time.

machine is also loaded running services like web server, ssh sessions, etc.
not a heavy load, but may be a slight impact.

machine specs:
1.33 GHz Athlon (non-XP)
Asus A7V333 motherboard (Fast memory settings)
512 (2x256) MB DDR-SDRAM Crucial (Cas 2)


++++++++++++++
/proc/cpuinfo:
--------------

processor : 0
vendor_id : AuthenticAMD
cpu family : 6
model : 4
model name : AMD Athlon(tm) Processor
stepping : 4
cpu MHz : 1332.992
cache size : 256 KB
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge
mca cmov pat pse36 mmx fxsr syscall mmxext 3dnowext 3dnow
bogomips : 2660.76


++++++++++
/proc/pci:
----------

PCI devices found:
Bus 0, device 0, function 0:
Host bridge: VIA Technologies, Inc. VT8367 [KT266] (rev 0).
Prefetchable 32 bit memory at 0xe0000000 [0xe7ffffff].
Bus 0, device 1, function 0:
PCI bridge: VIA Technologies, Inc. VT8367 [KT266 AGP] (rev 0).
Master Capable. No bursts. Min Gnt=8.
Bus 0, device 5, function 0:
Multimedia audio controller: C-Media Electronics Inc CM8738 (rev
16). IRQ 10.
Master Capable. Latency=32. Min Gnt=2.Max Lat=24.
I/O at 0xd800 [0xd8ff].
Bus 0, device 6, function 0:
RAID bus controller: Promise Technology, Inc. PDC20276 IDE (rev 1).
IRQ 5.
Master Capable. Latency=32. Min Gnt=4.Max Lat=18.
I/O at 0xd400 [0xd407].
I/O at 0xd000 [0xd003].
I/O at 0xb800 [0xb807].
I/O at 0xb400 [0xb403].
I/O at 0xb000 [0xb00f].
Non-prefetchable 32 bit memory at 0xdb800000 [0xdb803fff].
Bus 0, device 7, function 0:
FireWire (IEEE 1394): Texas Instruments TSB43AB21 IEEE-1394
Controller (PHY/Link) 1394a-2000 (rev 0). IRQ 10.
Master Capable. Latency=35. Min Gnt=2.Max Lat=4.
Non-prefetchable 32 bit memory at 0xdb000000 [0xdb0007ff].
Non-prefetchable 32 bit memory at 0xda800000 [0xda803fff].
Bus 0, device 9, function 0:
USB Controller: VIA Technologies, Inc. UHCI USB (rev 80).
IRQ 5.
Master Capable. Latency=32.
I/O at 0xa800 [0xa81f].
Bus 0, device 9, function 1:
USB Controller: VIA Technologies, Inc. UHCI USB (#2) (rev 80).
IRQ 11.
Master Capable. Latency=32.
I/O at 0xa400 [0xa41f].
Bus 0, device 17, function 2:
USB Controller: VIA Technologies, Inc. UHCI USB (#3) (rev 35).
IRQ 9.
Master Capable. Latency=32.
I/O at 0x8800 [0x881f].
Bus 0, device 17, function 3:
USB Controller: VIA Technologies, Inc. UHCI USB (#4) (rev 35).
IRQ 9.
Master Capable. Latency=32.
I/O at 0x8400 [0x841f].
Bus 0, device 9, function 2:
USB Controller: VIA Technologies, Inc. USB 2.0 (rev 81).
IRQ 10.
Master Capable. Latency=32.
Non-prefetchable 32 bit memory at 0xda000000 [0xda0000ff].
Bus 0, device 13, function 0:
Ethernet controller: Macronix, Inc. [MXIC] MX987x5 (rev 32).
IRQ 11.
Master Capable. Latency=32. Min Gnt=8.Max Lat=56.
I/O at 0xa000 [0xa0ff].
Non-prefetchable 32 bit memory at 0xd9800000 [0xd98000ff].
Bus 0, device 15, function 0:
Ethernet controller: Macronix, Inc. [MXIC] MX987x5 (#2) (rev 32).
IRQ 12.
Master Capable. Latency=32. Min Gnt=8.Max Lat=56.
I/O at 0x9400 [0x94ff].
Non-prefetchable 32 bit memory at 0xd8800000 [0xd88000ff].
Bus 0, device 14, function 0:
SCSI storage controller: Tekram Technology Co.,Ltd. TRM-S1040 (rev
1). IRQ 10.
Master Capable. Latency=32.
I/O at 0x9800 [0x98ff].
Non-prefetchable 32 bit memory at 0xd9000000 [0xd9000fff].
Bus 0, device 16, function 0:
Multimedia video controller: Brooktree Corporation Bt878 (rev 2).
IRQ 5.
Master Capable. Latency=32. Min Gnt=16.Max Lat=40.
Prefetchable 32 bit memory at 0xde000000 [0xde000fff].
Bus 0, device 16, function 1:
Multimedia controller: Brooktree Corporation Bt878 (rev 2).
IRQ 5.
Master Capable. Latency=32. Min Gnt=4.Max Lat=255.
Prefetchable 32 bit memory at 0xdd800000 [0xdd800fff].
Bus 0, device 17, function 0:
ISA bridge: PCI device 1106:3147 (VIA Technologies, Inc.) (rev 0).
Bus 0, device 17, function 1:
IDE interface: VIA Technologies, Inc. Bus Master IDE (rev 6).
Master Capable. Latency=32.
I/O at 0x9000 [0x900f].
Bus 1, device 0, function 0:
VGA compatible controller: nVidia Corporation Riva TnT [NV04] (rev
4). IRQ 11.
Master Capable. Latency=64. Min Gnt=5.Max Lat=1.
Non-prefetchable 32 bit memory at 0xdc000000 [0xdcffffff].
Prefetchable 32 bit memory at 0xdf000000 [0xdfffffff].

Default gcc is gcc 2.95.3:

chadh@goliath athlon $ gcc -v
Reading specs from /usr/lib/gcc-lib/i686-pc-linux-gnu/2.95.3/specs
gcc version 2.95.3 20010315 (release)

gcc-3.1 is gcc 3.1:

chadh@goliath athlon $ gcc-3.1 -v
Reading specs from /usr/lib/gcc-lib/i686-pc-linux-gnu/3.1/specs
Configured with: /var/tmp/portage/gcc-3.1-r7/work/gcc-3.1/configure --prefix=/usr --mandir=/usr/share/man --infodir=/usr/share/info --enable-shared --host=i686-pc-linux-gnu --build=i686-pc-linux-gnu --target=i686-pc-linux-gnu --enable-threads=posix --enable-long-long --enable-cstdio=stdio --enable-clocale=generic --disable-checking --with-gxx-include-dir=/usr/include/g++-v31 --with-local-prefix=/usr/local --with-system-zlib --enable-shared --enable-nls --without-included-gettext --program-suffix=-3.1
Thread model: posix
gcc version 3.1

Results:

gcc athlon.c
-----------

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13448 cycles per page
copy_page function '2.4 non MMX' took 28448 cycles per page
copy_page function '2.4 MMX fallback' took 28420 cycles per page
copy_page function '2.4 MMX version' took 13446 cycles per page
copy_page function 'faster_copy' took 8163 cycles per page
copy_page function 'even_faster' took 8213 cycles per page
copy_page function 'no_prefetch' took 6472 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13434 cycles per page
copy_page function '2.4 non MMX' took 28435 cycles per page
copy_page function '2.4 MMX fallback' took 28453 cycles per page
copy_page function '2.4 MMX version' took 13361 cycles per page
copy_page function 'faster_copy' took 8118 cycles per page
copy_page function 'even_faster' took 8082 cycles per page
copy_page function 'no_prefetch' took 6448 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13393 cycles per page
copy_page function '2.4 non MMX' took 28392 cycles per page
copy_page function '2.4 MMX fallback' took 28148 cycles per page
copy_page function '2.4 MMX version' took 13419 cycles per page
copy_page function 'faster_copy' took 8110 cycles per page
copy_page function 'even_faster' took 8204 cycles per page
copy_page function 'no_prefetch' took 6454 cycles per page


++++++++++++++++
gcc -O3 athlon.c
----------------

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 14060 cycles per page
copy_page function '2.4 non MMX' took 28371 cycles per page
copy_page function '2.4 MMX fallback' took 28396 cycles per page
copy_page function '2.4 MMX version' took 13405 cycles per page
copy_page function 'faster_copy' took 8212 cycles per page
copy_page function 'even_faster' took 8494 cycles per page
copy_page function 'no_prefetch' took 6090 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13406 cycles per page
copy_page function '2.4 non MMX' took 28389 cycles per page
copy_page function '2.4 MMX fallback' took 28452 cycles per page
copy_page function '2.4 MMX version' took 13404 cycles per page
copy_page function 'faster_copy' took 8439 cycles per page
copy_page function 'even_faster' took 8260 cycles per page
copy_page function 'no_prefetch' took 6124 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13393 cycles per page
copy_page function '2.4 non MMX' took 28324 cycles per page
copy_page function '2.4 MMX fallback' took 28338 cycles per page
copy_page function '2.4 MMX version' took 13399 cycles per page
copy_page function 'faster_copy' took 8431 cycles per page
copy_page function 'even_faster' took 8126 cycles per page
copy_page function 'no_prefetch' took 6122 cycles per page


+++++++++++++++++++++++++++++++++++++++
gcc -O3 -march=i686 -mcpu=i686 athlon.c
---------------------------------------

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13345 cycles per page
copy_page function '2.4 non MMX' took 28367 cycles per page
copy_page function '2.4 MMX fallback' took 28351 cycles per page
copy_page function '2.4 MMX version' took 13458 cycles per page
copy_page function 'faster_copy' took 8420 cycles per page
copy_page function 'even_faster' took 8260 cycles per page
copy_page function 'no_prefetch' took 6119 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13398 cycles per page
copy_page function '2.4 non MMX' took 28401 cycles per page
copy_page function '2.4 MMX fallback' took 28186 cycles per page
copy_page function '2.4 MMX version' took 14125 cycles per page
copy_page function 'faster_copy' took 8209 cycles per page
copy_page function 'even_faster' took 8306 cycles per page
copy_page function 'no_prefetch' took 6115 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13436 cycles per page
copy_page function '2.4 non MMX' took 28450 cycles per page
copy_page function '2.4 MMX fallback' took 28395 cycles per page
copy_page function '2.4 MMX version' took 13429 cycles per page
copy_page function 'faster_copy' took 8450 cycles per page
copy_page function 'even_faster' took 8283 cycles per page
copy_page function 'no_prefetch' took 6117 cycles per page


++++++++++++++++++++++++++++++++++
gcc -O3 -march=k6 -mcpu=k6 athlon.c
----------------------------------

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13369 cycles per page
copy_page function '2.4 non MMX' took 28292 cycles per page
copy_page function '2.4 MMX fallback' took 28058 cycles per page
copy_page function '2.4 MMX version' took 13381 cycles per page
copy_page function 'faster_copy' took 8461 cycles per page
copy_page function 'even_faster' took 8520 cycles per page
copy_page function 'no_prefetch' took 6113 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13414 cycles per page
copy_page function '2.4 non MMX' took 28120 cycles per page
copy_page function '2.4 MMX fallback' took 28994 cycles per page
copy_page function '2.4 MMX version' took 13391 cycles per page
copy_page function 'faster_copy' took 8238 cycles per page
copy_page function 'even_faster' took 8577 cycles per page
copy_page function 'no_prefetch' took 6136 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13489 cycles per page
copy_page function '2.4 non MMX' took 28185 cycles per page
copy_page function '2.4 MMX fallback' took 28417 cycles per page
copy_page function '2.4 MMX version' took 13464 cycles per page
copy_page function 'faster_copy' took 8277 cycles per page
copy_page function 'even_faster' took 8334 cycles per page
copy_page function 'no_prefetch' took 6132 cycles per page


++++++++++++++++
gcc-3.1 athlon.c
----------------

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13447 cycles per page
copy_page function '2.4 non MMX' took 28371 cycles per page
copy_page function '2.4 MMX fallback' took 28337 cycles per page
copy_page function '2.4 MMX version' took 13445 cycles per page
copy_page function 'faster_copy' took 8421 cycles per page
copy_page function 'even_faster' took 8535 cycles per page
copy_page function 'no_prefetch' took 6449 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13378 cycles per page
copy_page function '2.4 non MMX' took 28340 cycles per page
copy_page function '2.4 MMX fallback' took 28364 cycles per page
copy_page function '2.4 MMX version' took 13389 cycles per page
copy_page function 'faster_copy' took 8425 cycles per page
copy_page function 'even_faster' took 8498 cycles per page
copy_page function 'no_prefetch' took 6423 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13316 cycles per page
copy_page function '2.4 non MMX' took 28466 cycles per page
copy_page function '2.4 MMX fallback' took 28416 cycles per page
copy_page function '2.4 MMX version' took 13445 cycles per page
copy_page function 'faster_copy' took 8172 cycles per page
copy_page function 'even_faster' took 8322 cycles per page
copy_page function 'no_prefetch' took 6421 cycles per page


++++++++++++++++++++
gcc-3.1 -O3 athlon.c
--------------------

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13452 cycles per page
copy_page function '2.4 non MMX' took 28625 cycles per page
copy_page function '2.4 MMX fallback' took 28431 cycles per page
copy_page function '2.4 MMX version' took 13459 cycles per page
copy_page function 'faster_copy' took 8225 cycles per page
copy_page function 'even_faster' took 8250 cycles per page
copy_page function 'no_prefetch' took 6174 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13327 cycles per page
copy_page function '2.4 non MMX' took 28407 cycles per page
copy_page function '2.4 MMX fallback' took 28433 cycles per page
copy_page function '2.4 MMX version' took 13422 cycles per page
copy_page function 'faster_copy' took 8214 cycles per page
copy_page function 'even_faster' took 8517 cycles per page
copy_page function 'no_prefetch' took 6182 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13473 cycles per page
copy_page function '2.4 non MMX' took 28443 cycles per page
copy_page function '2.4 MMX fallback' took 28472 cycles per page
copy_page function '2.4 MMX version' took 13444 cycles per page
copy_page function 'faster_copy' took 8077 cycles per page
copy_page function 'even_faster' took 8479 cycles per page
copy_page function 'no_prefetch' took 6192 cycles per page


+++++++++++++++++++++++++++++++++++++++++++
gcc-3.1 -O3 -march=i686 -mcpu=i686 athlon.c
-------------------------------------------

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13424 cycles per page
copy_page function '2.4 non MMX' took 28320 cycles per page
copy_page function '2.4 MMX fallback' took 28360 cycles per page
copy_page function '2.4 MMX version' took 13308 cycles per page
copy_page function 'faster_copy' took 8437 cycles per page
copy_page function 'even_faster' took 8233 cycles per page
copy_page function 'no_prefetch' took 6132 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13414 cycles per page
copy_page function '2.4 non MMX' took 28406 cycles per page
copy_page function '2.4 MMX fallback' took 28379 cycles per page
copy_page function '2.4 MMX version' took 13397 cycles per page
copy_page function 'faster_copy' took 8202 cycles per page
copy_page function 'even_faster' took 8274 cycles per page
copy_page function 'no_prefetch' took 6182 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13361 cycles per page
copy_page function '2.4 non MMX' took 28395 cycles per page
copy_page function '2.4 MMX fallback' took 28371 cycles per page
copy_page function '2.4 MMX version' took 13416 cycles per page
copy_page function 'faster_copy' took 8271 cycles per page
copy_page function 'even_faster' took 8281 cycles per page
copy_page function 'no_prefetch' took 6186 cycles per page


++++++++++++++++++++++++++++++++++++++++++++++++
gcc-3.1 -O3 -march=athlon -mcpu=athlon athlon.c
------------------------------------------------

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13408 cycles per page
copy_page function '2.4 non MMX' took 28380 cycles per page
copy_page function '2.4 MMX fallback' took 28357 cycles per page
copy_page function '2.4 MMX version' took 13380 cycles per page
copy_page function 'faster_copy' took 8442 cycles per page
copy_page function 'even_faster' took 8080 cycles per page
copy_page function 'no_prefetch' took 6179 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13429 cycles per page
copy_page function '2.4 non MMX' took 28376 cycles per page
copy_page function '2.4 MMX fallback' took 28360 cycles per page
copy_page function '2.4 MMX version' took 14140 cycles per page
copy_page function 'faster_copy' took 8342 cycles per page
copy_page function 'even_faster' took 8231 cycles per page
copy_page function 'no_prefetch' took 6121 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13417 cycles per page
copy_page function '2.4 non MMX' took 28408 cycles per page
copy_page function '2.4 MMX fallback' took 28397 cycles per page
copy_page function '2.4 MMX version' took 13403 cycles per page
copy_page function 'faster_copy' took 8217 cycles per page
copy_page function 'even_faster' took 8493 cycles per page
copy_page function 'no_prefetch' took 6226 cycles per page


+++++++++++++++++++++++++++++++++++++++++++++++++++
gcc-3.1 -O3 -march=athlon-4 -mcpu=athlon-4 athlon.c
----------------------------------------------------

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13371 cycles per page
copy_page function '2.4 non MMX' took 28983 cycles per page
copy_page function '2.4 MMX fallback' took 28330 cycles per page
copy_page function '2.4 MMX version' took 13038 cycles per page
copy_page function 'faster_copy' took 8437 cycles per page
copy_page function 'even_faster' took 8509 cycles per page
copy_page function 'no_prefetch' took 6178 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13471 cycles per page
copy_page function '2.4 non MMX' took 28421 cycles per page
copy_page function '2.4 MMX fallback' took 28413 cycles per page
copy_page function '2.4 MMX version' took 13463 cycles per page
copy_page function 'faster_copy' took 8195 cycles per page
copy_page function 'even_faster' took 8508 cycles per page
copy_page function 'no_prefetch' took 6038 cycles per page

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 13408 cycles per page
copy_page function '2.4 non MMX' took 28326 cycles per page
copy_page function '2.4 MMX fallback' took 28357 cycles per page
copy_page function '2.4 MMX version' took 13410 cycles per page
copy_page function 'faster_copy' took 8202 cycles per page
copy_page function 'even_faster' took 8488 cycles per page
copy_page function 'no_prefetch' took 6174 cycles per page

2002-10-24 21:55:06

by Harm Verhagen

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

Athlon XP 1800+, VIA KT333, 256MB DDR2100

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 16180 cycles per page
copy_page function '2.4 non MMX' took 17913 cycles per page
copy_page function '2.4 MMX fallback' took 18610 cycles per page
copy_page function '2.4 MMX version' took 16200 cycles per page
copy_page function 'faster_copy' took 9908 cycles per page
copy_page function 'even_faster' took 10117 cycles per page
copy_page function 'no_prefetch' took 6993 cycles per page
[harm@pchome memcpy2]$ ./memcpy
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 16293 cycles per page
copy_page function '2.4 non MMX' took 17929 cycles per page
copy_page function '2.4 MMX fallback' took 18637 cycles per page
copy_page function '2.4 MMX version' took 16209 cycles per page
copy_page function 'faster_copy' took 9907 cycles per page
copy_page function 'even_faster' took 10122 cycles per page
copy_page function 'no_prefetch' took 6964 cycles per page

[harm@pchome memcpy2]$ cat /proc/cpuinfo
processor : 0
vendor_id : AuthenticAMD
cpu family : 6
model : 6
model name : AMD Athlon(TM) XP 1800+
stepping : 2
cpu MHz : 1532.941
cache size : 256 KB
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de tsc msr pae mce cx8 apic sep mtrr pge mca
cmov pat
pse36 mmx fxsr sse syscall mmxext 3dnowext 3dnow
bogomips : 3060.53




2002-10-24 22:12:41

by Tim Schmielau

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

> Attached is a test app that compares several memory copy implementations.
> Could you run it and report the results to me, together with cpu,
> chipset and memory type?

since everyone seems to CC: lkml in the reply...

Athlon-500, AMD-751 Irongate, PC800-222 ECC SDRAM

> ./a.out
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 9998 cycles per page
copy_page function '2.4 non MMX' took 15269 cycles per page
copy_page function '2.4 MMX fallback' took 15192 cycles per page
copy_page function '2.4 MMX version' took 10152 cycles per page
copy_page function 'faster_copy' took 10264 cycles per page
copy_page function 'even_faster' took 10013 cycles per page
copy_page function 'no_prefetch' took 11527 cycles per page
> ./a.out
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 9975 cycles per page
copy_page function '2.4 non MMX' took 15513 cycles per page
copy_page function '2.4 MMX fallback' took 15219 cycles per page
copy_page function '2.4 MMX version' took 10009 cycles per page
copy_page function 'faster_copy' took 10186 cycles per page
copy_page function 'even_faster' took 10088 cycles per page
copy_page function 'no_prefetch' took 11583 cycles per page
> ./a.out
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 9967 cycles per page
copy_page function '2.4 non MMX' took 15178 cycles per page
copy_page function '2.4 MMX fallback' took 15178 cycles per page
copy_page function '2.4 MMX version' took 10086 cycles per page
copy_page function 'faster_copy' took 10124 cycles per page
copy_page function 'even_faster' took 10025 cycles per page
copy_page function 'no_prefetch' took 11524 cycles per page
> lspci
00:00.0 Host bridge: Advanced Micro Devices [AMD] AMD-751 [Irongate]
System Controller (rev 23)
00:01.0 PCI bridge: Advanced Micro Devices [AMD] AMD-751 [Irongate] AGP
Bridge (rev 01)
00:04.0 Ethernet controller: 3Com Corporation 3c905B 100BaseTX [Cyclone]
(rev 64)
00:07.0 ISA bridge: VIA Technologies, Inc. VT82C686 [Apollo Super South]
(rev 14)
00:07.1 IDE interface: VIA Technologies, Inc. Bus Master IDE (rev 06)
00:07.2 USB Controller: VIA Technologies, Inc. UHCI USB (rev 06)
00:07.3 USB Controller: VIA Technologies, Inc. UHCI USB (rev 06)
00:07.4 SMBus: VIA Technologies, Inc. VT82C686 [Apollo Super ACPI] (rev
10)
01:05.0 VGA compatible controller: S3 Inc. 86c368 [Trio 3D/2X] (rev 02)
> cat /proc/cpuinfo
processor : 0
vendor_id : AuthenticAMD
cpu family : 6
model : 1
model name : AMD-K7(tm) Processor
stepping : 2
cpu MHz : 499.051
cache size : 512 KB
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 sep mtrr pge mca cmov
pat mmx syscall mmxext 3dnowext 3dnow
bogomips : 992.87


Tim

2002-10-24 23:11:01

by Hirokazu Takahashi

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

Hello,

> AMD recommends to perform memory copies with backward read operations
> instead of prefetch.
>
> http://208.15.46.63/events/gdc2002.htm
>
> Attached is a test app that compares several memory copy implementations.
> Could you run it and report the results to me, together with cpu,
> chipset and memory type?
>
> Please run 2 or 3 times.

Your test doesn't use the CPU's cache memory for either src or dst.
We should also try it with a smaller buffer.

2002-10-24 23:31:01

by Ryan Cumming

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

On October 24, 2002 10:15, Manfred Spraul wrote:
> Attached is a test app that compares several memory copy implementations.
> Could you run it and report the results to me, together with cpu,
> chipset and memory type?
>
> Please run 2 or 3 times.

Athlon XP 1800+, 512MB 133mhz SDRAM, Debian GCC 3.2.1 snapshot.

~$: gcc -march=athlon-xp -O2 athlon.c -o athlon
~$: ./athlon; ./athlon; ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 28161 cycles per page
copy_page function '2.4 non MMX' took 31398 cycles per page
copy_page function '2.4 MMX fallback' took 31442 cycles per page
copy_page function '2.4 MMX version' took 28130 cycles per page
copy_page function 'faster_copy' took 19112 cycles per page
copy_page function 'even_faster' took 17413 cycles per page
copy_page function 'no_prefetch' took 12708 cycles per page
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 28171 cycles per page
copy_page function '2.4 non MMX' took 31226 cycles per page
copy_page function '2.4 MMX fallback' took 31178 cycles per page
copy_page function '2.4 MMX version' took 28055 cycles per page
copy_page function 'faster_copy' took 17193 cycles per page
copy_page function 'even_faster' took 17287 cycles per page
copy_page function 'no_prefetch' took 12711 cycles per page
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 27955 cycles per page
copy_page function '2.4 non MMX' took 31069 cycles per page
copy_page function '2.4 MMX fallback' took 33483 cycles per page
copy_page function '2.4 MMX version' took 27917 cycles per page
copy_page function 'faster_copy' took 17120 cycles per page
copy_page function 'even_faster' took 17271 cycles per page
copy_page function 'no_prefetch' took 12712 cycles per page

- -Ryan
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.0 (GNU/Linux)

iD8DBQE9uIQlLGMzRzbJfbQRApi3AJ9+yQmGMk33Q7Ng1ze7jULIV+cEzACfQ79r
Q82U3yCZkppcUkr//3PXH+8=
=m1sv
-----END PGP SIGNATURE-----

2002-10-25 00:04:22

by Matthias Andree

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

-----BEGIN PGP SIGNED MESSAGE-----

On Thu, 24 Oct 2002, Manfred Spraul wrote:

> AMD recommends to perform memory copies with backward read operations
> instead of prefetch.
>
> http://208.15.46.63/events/gdc2002.htm
>
> Attached is a test app that compares several memory copy implementations.
> Could you run it and report the results to me, together with cpu,
> chipset and memory type?
>
> Please run 2 or 3 times.

Here you are, three runs, shown is: floor((n1+n2+n3)/3)
where n_i is the results (cycles/page) of the i-th run of the respective test

Duron 700, Via KT133, PC133 (manuf. date 12/2000): (machine was NOT idle)

'warm up run' 11445
'2.4 non MMX' 22374
'2.4 MMX fallback' 20075
'2.4 MMX version' 17611
'faster_copy' 11103
'even_faster' 7225
'no_prefetch' 6549

Athlon XP1600+, Via KT333, PC2100 (manuf. date 10/2002): (idle)

'warm up run' 14106
'2.4 non MMX' 15382
'2.4 MMX fallback' 15416
'2.4 MMX version' 14103
'faster_copy' 8793
'even_faster' 8797
'no_prefetch' 6351

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.0.7 (GNU/Linux)

iQCVAwUBPbiL5ydEoB0mv1ypAQEmpAP/UDkcx+UnItqTVcQzec3zDIUmrznZHkwa
0+cVgeMjg3e0QwIX85bmioicKlIw4WNz+AJZOyasFA+5VbxPBghkEkFOLzIzI9Bh
Eq2/uGNWrcwLfhIhsVgy0c/XgYLFoCY7mfH2oSs8+3TvIXIxhJoz7CsnaF+STk8e
wkWHfLN2+B0=
=4cF6
-----END PGP SIGNATURE-----

2002-10-25 08:30:27

by Luigi Genoni

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation


on a DUAL Athlon 2100+ (2GB RAM, kernel 2.4.19):

I get:

Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 21467 cycles per page
copy_page function '2.4 non MMX' took 24583 cycles per page
copy_page function '2.4 MMX fallback' took 24534 cycles per page
copy_page function '2.4 MMX version' took 21312 cycles per page
copy_page function 'faster_copy' took 12060 cycles per page
copy_page function 'even_faster' took 12152 cycles per page
copy_page function 'no_prefetch' took 10587 cycles per page


copy_page() tests
copy_page function 'warm up run' took 21458 cycles per page
copy_page function '2.4 non MMX' took 24592 cycles per page
copy_page function '2.4 MMX fallback' took 24585 cycles per page
copy_page function '2.4 MMX version' took 21334 cycles per page
copy_page function 'faster_copy' took 12090 cycles per page
copy_page function 'even_faster' took 12146 cycles per page
copy_page function 'no_prefetch' took 10583 cycles per page


copy_page() tests
copy_page function 'warm up run' took 21433 cycles per page
copy_page function '2.4 non MMX' took 24557 cycles per page
copy_page function '2.4 MMX fallback' took 24570 cycles per page
copy_page function '2.4 MMX version' took 21331 cycles per page
copy_page function 'faster_copy' took 12039 cycles per page
copy_page function 'even_faster' took 12126 cycles per page
copy_page function 'no_prefetch' took 10576 cycles per page



processor : 0
vendor_id : AuthenticAMD
cpu family : 6
model : 6
model name : AMD Athlon(tm) Processor
stepping : 2
cpu MHz : 1733.362
cache size : 256 KB
Physical processor ID : 0
Number of siblings : 1
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
cmov pat pse36 mmx fxsr sse syscall mmxext 3dnowext 3dnow
bogomips : 3432.80

processor : 1
vendor_id : AuthenticAMD
cpu family : 6
model : 6
model name : AMD Athlon(tm) Processor
stepping : 2
cpu MHz : 1733.362
cache size : 256 KB
Physical processor ID : 0
Number of siblings : 1
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
cmov pat pse36 mmx fxsr sse syscall mmxext 3dnowext 3dnow
bogomips : 3466.21


PCI devices found:
Bus 0, device 0, function 0:
Host bridge: Advanced Micro Devices [AMD] AMD-760 MP [IGD4-2P] System
Controller (rev 17).
Master Capable. Latency=64.
Prefetchable 32 bit memory at 0xf8000000 [0xfbffffff].
Prefetchable 32 bit memory at 0xf6200000 [0xf6200fff].
I/O at 0x1810 [0x1813].
Bus 0, device 1, function 0:
PCI bridge: Advanced Micro Devices [AMD] AMD-760 MP [IGD4-2P] AGP
Bridge (rev 0).
Master Capable. Latency=64. Min Gnt=4.
Bus 0, device 7, function 0:
ISA bridge: Advanced Micro Devices [AMD] AMD-768 [Opus] ISA (rev 5).
Bus 0, device 7, function 1:
IDE interface: Advanced Micro Devices [AMD] AMD-768 [Opus] IDE (rev
4).
Master Capable. Latency=64.
I/O at 0xf000 [0xf00f].
Bus 0, device 7, function 3:
Bridge: Advanced Micro Devices [AMD] AMD-768 [Opus] ACPI (rev 3).
Master Capable. Latency=64.
Bus 0, device 10, function 0:
SCSI storage controller: Adaptec AIC-7899P U160/m (rev 1).
IRQ 10.
Master Capable. Latency=72. Min Gnt=40.Max Lat=25.
I/O at 0x1000 [0x10ff].
Non-prefetchable 64 bit memory at 0xf4000000 [0xf4000fff].
Bus 0, device 10, function 1:
SCSI storage controller: Adaptec AIC-7899P U160/m (#2) (rev 1).
IRQ 11.
Master Capable. Latency=72. Min Gnt=40.Max Lat=25.
I/O at 0x1400 [0x14ff].
Non-prefetchable 64 bit memory at 0xf4001000 [0xf4001fff].
Bus 0, device 16, function 0:
PCI bridge: Advanced Micro Devices [AMD] AMD-768 [Opus] PCI (rev 5).
Master Capable. Latency=99. Min Gnt=12.
Bus 2, device 0, function 0:
USB Controller: Advanced Micro Devices [AMD] AMD-768 [Opus] USB (rev
7).
IRQ 10.
Master Capable. Latency=64. Max Lat=80.
Non-prefetchable 32 bit memory at 0xf4100000 [0xf4100fff].
Bus 2, device 7, function 0:
VGA compatible controller: ATI Technologies Inc Rage XL (rev 39).
Master Capable. Latency=66. Min Gnt=8.
Non-prefetchable 32 bit memory at 0xf5000000 [0xf5ffffff].
I/O at 0x2000 [0x20ff].
Non-prefetchable 32 bit memory at 0xf4101000 [0xf4101fff].
Bus 2, device 8, function 0:
Ethernet controller: 3Com Corporation 3c980-TX 10/100baseTX NIC
[Python-T] (rev 120).
IRQ 5.
Master Capable. Latency=80. Min Gnt=10.Max Lat=10.
I/O at 0x2400 [0x247f].
Non-prefetchable 32 bit memory at 0xf4102000 [0xf410207f].
Bus 2, device 9, function 0:
Ethernet controller: 3Com Corporation 3c980-TX 10/100baseTX NIC
[Python-T] (#2) (rev 120).
IRQ 9.
Master Capable. Latency=80. Min Gnt=10.Max Lat=10.
I/O at 0x2480 [0x24ff].
Non-prefetchable 32 bit memory at 0xf4102400 [0xf410247f].


2002-10-25 08:38:44

by Denis Vlasenko

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

On 24 October 2002 15:15, Manfred Spraul wrote:
> AMD recommends to perform memory copies with backward read operations
> instead of prefetch.
>
> http://208.15.46.63/events/gdc2002.htm
>
> Attached is a test app that compares several memory copy
> implementations. Could you run it and report the results to me,
> together with cpu, chipset and memory type?
>
> Please run 2 or 3 times.

There are a couple of wrinkles:

__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
Wrong. Parameter is output. Remove one ':', replace "m" -> "=m".

__asm__ __volatile__ (
" femms\n" : :
);
What for? frstor will nuke out MMX state anyway.

static void fast_copy_page(void *to, void *from)
has two fsaves and no frstor ;)

"run three times" - can program do that on its own and find minimum?

I modified your test to be able to run it on Celeron.
(#defined out femms, replaced prefetch -> prefetchnta). Results are below.

I think it is impossible to make a Best for all CPUs (tm) copy
function, we will never know... maybe Hammer will do the fastest copies
by rep movsb?

Btw, I used Arjan's program too, attaching my version...
Your method is indeed faster (see npf_copy.c).

We should avoid doing the same thing again and again...
There are several block ops in the kernel (memcpys,
csum_copy routines (see my other post)), can we
coordinate efforts to speed them up too?

FWIW, here's my results (Celeron 1200):
# ./a.out
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
copy_page() tests
copy_page function 'warm up run' took 42394 cycles per page
copy_page function '2.4 non MMX' took 41923 cycles per page
copy_page function '2.4 MMX fallback' took 41903 cycles per page
copy_page function '2.4 MMX version' took 43036 cycles per page
copy_page function 'faster_copy' took 28337 cycles per page
copy_page function 'even_faster' took 24632 cycles per page
copy_page function 'no_prefetch' took 23087 cycles per page
# ./a.out
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
copy_page() tests
copy_page function 'warm up run' took 45152 cycles per page
copy_page function '2.4 non MMX' took 44443 cycles per page
copy_page function '2.4 MMX fallback' took 45530 cycles per page
copy_page function '2.4 MMX version' took 45441 cycles per page
copy_page function 'faster_copy' took 29266 cycles per page
copy_page function 'even_faster' took 25849 cycles per page
copy_page function 'no_prefetch' took 24014 cycles per page
# ./a.out
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
copy_page() tests
copy_page function 'warm up run' took 45850 cycles per page
copy_page function '2.4 non MMX' took 44603 cycles per page
copy_page function '2.4 MMX fallback' took 45631 cycles per page
copy_page function '2.4 MMX version' took 57267 cycles per page
copy_page function 'faster_copy' took 29628 cycles per page
copy_page function 'even_faster' took 25989 cycles per page
copy_page function 'no_prefetch' took 23987 cycles per page


Attachments:
timing_clear_copy.tar.bz2 (3.57 kB)

2002-10-25 09:13:06

by Måns Rullgård

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

Dave Jones <[email protected]> writes:

> The functions being benchmarked are written in assembly.
> gcc will not change these in any way, making compiler flags
> or revision irrelevant.

Doesn't gcc schedule inline assembly instructions?

--
Måns Rullgård
[email protected]

2002-10-25 13:12:54

by Daniel Egger

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

Am Don, 2002-10-24 um 21.28 schrieb Manfred Spraul:

> It seems the via cpu doesn't support prefetchnta. Could you try the
> attached version?

egger@tanja:~$ ./via
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 24318 cycles per page
copy_page function '2.4 non MMX' took 35819 cycles per page
copy_page function '2.4 MMX fallback' took 35921 cycles per page
copy_page function '2.4 MMX version' took 24291 cycles per page
Illegal instruction

Unfortunately I have no space for gdb on it right now, so I cannot
easily debug where it crashes.

BTW: I did the same thing you did: removed the obviously
offending calls to the "fast" versions. I've no idea why the no_prefetch
version doesn't, though...

--
Servus,
Daniel


Attachments:
signature.asc (189.00 B)
Dies ist ein digital signierter Nachrichtenteil

2002-10-25 16:27:59

by Koke

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

El jue, 24-10-2002 a las 19:15, Manfred Spraul escribió:
> AMD recommends to perform memory copies with backward read operations
> instead of prefetch.
>
> http://208.15.46.63/events/gdc2002.htm
>
> Attached is a test app that compares several memory copy
implementations.
> Could you run it and report the results to me, together with cpu,
> chipset and memory type?
>
> Please run 2 or 3 times.
>

My machine: Athlon 1600XP, 512MB DDR, KT226A??

koke@tuxland:~/src/testing$ ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 15408 cycles per page
copy_page function '2.4 non MMX' took 16805 cycles per page
copy_page function '2.4 MMX fallback' took 16695 cycles per page
copy_page function '2.4 MMX version' took 15424 cycles per page
copy_page function 'faster_copy' took 9481 cycles per page
copy_page function 'even_faster' took 9354 cycles per page
copy_page function 'no_prefetch' took 6635 cycles per page
koke@tuxland:~/src/testing$ ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 15418 cycles per page
copy_page function '2.4 non MMX' took 16792 cycles per page
copy_page function '2.4 MMX fallback' took 16754 cycles per page
copy_page function '2.4 MMX version' took 15495 cycles per page
copy_page function 'faster_copy' took 9426 cycles per page
copy_page function 'even_faster' took 9490 cycles per page
copy_page function 'no_prefetch' took 6591 cycles per page
koke@tuxland:~/src/testing$ ./athlon
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 16485 cycles per page
copy_page function '2.4 non MMX' took 16759 cycles per page
copy_page function '2.4 MMX fallback' took 16769 cycles per page
copy_page function '2.4 MMX version' took 15377 cycles per page
copy_page function 'faster_copy' took 9732 cycles per page
copy_page function 'even_faster' took 12125 cycles per page
copy_page function 'no_prefetch' took 9439 cycles per page
koke@tuxland:~/src/testing$


> --
> Manfred
> ----
>

> /*
>
> (C) 2000 Arjan van de Ven and others licensed under the terms of the
GPL
>
>
> $Revision: 1.6 $
> */
>
> static char cvsid[] = "$Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp
$";
> #include <unistd.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
>
> /* The 2.4 kernel one, adapted for userspace */
>
> static void fast_clear_page(void *page)
> {
> int i;
> char fpu_save[108];
>
> __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
>
> __asm__ __volatile__ (
> " pxor %%mm0, %%mm0\n" : :
> );
>
> for(i=0;i<4096/128;i++)
> {
> __asm__ __volatile__ (
> " movq %%mm0, (%0)\n"
> " movq %%mm0, 8(%0)\n"
> " movq %%mm0, 16(%0)\n"
> " movq %%mm0, 24(%0)\n"
> " movq %%mm0, 32(%0)\n"
> " movq %%mm0, 40(%0)\n"
> " movq %%mm0, 48(%0)\n"
> " movq %%mm0, 56(%0)\n"
> " movq %%mm0, 64(%0)\n"
> " movq %%mm0, 72(%0)\n"
> " movq %%mm0, 80(%0)\n"
> " movq %%mm0, 88(%0)\n"
> " movq %%mm0, 96(%0)\n"
> " movq %%mm0, 104(%0)\n"
> " movq %%mm0, 112(%0)\n"
> " movq %%mm0, 120(%0)\n"
> : : "r" (page) : "memory");
> page+=128;
> }
> __asm__ __volatile__ (
> " femms\n" : :
> );
> __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
>
> }
>
> /* modified version for Athlon-family processors */
> static void faster_clear_page(void *page)
> {
> int i;
> char fpu_save[108];
>
> __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
> __asm__ __volatile__ (
> " pxor %%mm0, %%mm0\n" : :
> );
>
> for(i=0;i<4096/64;i++)
> {
> __asm__ __volatile__ (
> " movntq %%mm0, (%0)\n"
> " movntq %%mm0, 8(%0)\n"
> " movntq %%mm0, 16(%0)\n"
> " movntq %%mm0, 24(%0)\n"
> " movntq %%mm0, 32(%0)\n"
> " movntq %%mm0, 40(%0)\n"
> " movntq %%mm0, 48(%0)\n"
> " movntq %%mm0, 56(%0)\n"
> : : "r" (page) : "memory");
> page+=64;
> }
> __asm__ __volatile__ (
> " sfence \n "
> " femms\n" : :
> );
> __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
>
> }
>
> /* test version to go even faster... this might be the same as faster_
> * but serves as my playground.
> */
> static void even_faster_clear_page(void *page)
> {
> int i;
> char fpu_save[108];
> __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
>
> __asm__ __volatile__ (
> " pxor %%mm0, %%mm0\n" : :
> );
>
> for(i=0;i<4096/64;i++)
> {
> __asm__ __volatile__ (
> " movntq %%mm0, (%0)\n"
> " movntq %%mm0, 8(%0)\n"
> " movntq %%mm0, 16(%0)\n"
> " movntq %%mm0, 24(%0)\n"
> " movntq %%mm0, 32(%0)\n"
> " movntq %%mm0, 40(%0)\n"
> " movntq %%mm0, 48(%0)\n"
> " movntq %%mm0, 56(%0)\n"
> : : "r" (page) : "memory");
> page+=64;
> }
> __asm__ __volatile__ (
> " sfence \n "
> " femms\n" : :
> );
> __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
>
> }
>
> /* The "fallback" one as used by the kernel */
> static void slow_zero_page(void * page)
> {
> int d0, d1;
> __asm__ __volatile__( \
> "cld\n\t" \
> "rep ; stosl" \
> : "=&c" (d0), "=&D" (d1)
> :"a" (0),"1" (page),"0" (1024)
> :"memory");
> }
>
> static void slow_copy_page(void *to, void *from)
> {
> int d0, d1, d2;
> __asm__ __volatile__( \
> "cld\n\t" \
> "rep ; movsl" \
> : "=&c" (d0), "=&D" (d1), "=&S" (d2) \
> : "0" (1024),"1" ((long) to),"2" ((long) from) \
> : "memory");
> }
>
>
> /* 2.4 kernel mmx copy_page function */
> static void fast_copy_page(void *to, void *from)
> {
> int i;
> char fpu_save[108];
> __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
>
> __asm__ __volatile__ (
> "1: prefetch (%0)\n"
> " prefetch 64(%0)\n"
> " prefetch 128(%0)\n"
> " prefetch 192(%0)\n"
> " prefetch 256(%0)\n"
> : : "r" (from) );
>
> for(i=0; i<4096/64; i++)
> {
> __asm__ __volatile__ (
> "1: prefetch 320(%0)\n"
> "2: movq (%0), %%mm0\n"
> " movq 8(%0), %%mm1\n"
> " movq 16(%0), %%mm2\n"
> " movq 24(%0), %%mm3\n"
> " movq %%mm0, (%1)\n"
> " movq %%mm1, 8(%1)\n"
> " movq %%mm2, 16(%1)\n"
> " movq %%mm3, 24(%1)\n"
> " movq 32(%0), %%mm0\n"
> " movq 40(%0), %%mm1\n"
> " movq 48(%0), %%mm2\n"
> " movq 56(%0), %%mm3\n"
> " movq %%mm0, 32(%1)\n"
> " movq %%mm1, 40(%1)\n"
> " movq %%mm2, 48(%1)\n"
> " movq %%mm3, 56(%1)\n"
> : : "r" (from), "r" (to) : "memory");
> from+=64;
> to+=64;
> }
> __asm__ __volatile__ (
> " femms\n" : :
> );
> __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
>
> }
>
>
> /* Athlon improved version */
> static void faster_copy_page(void *to, void *from)
> {
> int i;
> char fpu_save[108];
>
> __asm__ __volatile__ (
> "1: prefetchnta (%0)\n"
> " prefetchnta 64(%0)\n"
> " prefetchnta 128(%0)\n"
> " prefetchnta 192(%0)\n"
> : : "r" (from) );
>
> __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
>
> for(i=0; i<4096/64; i++)
> {
> __asm__ __volatile__ (
> "1: prefetchnta 320(%0)\n"
> "2: movq (%0), %%mm0\n"
> " movq 8(%0), %%mm1\n"
> " movq 16(%0), %%mm2\n"
> " movq 24(%0), %%mm3\n"
> " movq 32(%0), %%mm4\n"
> " movq 40(%0), %%mm5\n"
> " movq 48(%0), %%mm6\n"
> " movq 56(%0), %%mm7\n"
> " movntq %%mm0, (%1)\n"
> " movntq %%mm1, 8(%1)\n"
> " movntq %%mm2, 16(%1)\n"
> " movntq %%mm3, 24(%1)\n"
> " movntq %%mm4, 32(%1)\n"
> " movntq %%mm5, 40(%1)\n"
> " movntq %%mm6, 48(%1)\n"
> " movntq %%mm7, 56(%1)\n"
> : : "r" (from), "r" (to) : "memory");
> from+=64;
> to+=64;
> }
> __asm__ __volatile__ (
> " femms \n "
> " sfence\n" : :
> );
> __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
>
> }
>
> /* test version to go even faster... this might be the same as faster_
> * but serves as my playground.
> */
> static void even_faster_copy_page(void *to, void *from)
> {
> int i;
> char fpu_save[108];
>
> __asm__ __volatile__ (
> "1: prefetchnta (%0)\n"
> " prefetchnta 64(%0)\n"
> " prefetchnta 128(%0)\n"
> " prefetchnta 192(%0)\n"
> : : "r" (from) );
>
> __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
>
> for(i=0; i<4096/64; i++)
> {
> __asm__ __volatile__ (
> " prefetchnta 256(%0)\n"
> " movq (%0), %%mm0\n"
> " movntq %%mm0, (%1)\n"
> " movq 8(%0), %%mm1\n"
> " movntq %%mm1, 8(%1)\n"
> " movq 16(%0), %%mm2\n"
> " movntq %%mm2, 16(%1)\n"
> " movq 24(%0), %%mm3\n"
> " movntq %%mm3, 24(%1)\n"
> " movq 32(%0), %%mm4\n"
> " movntq %%mm4, 32(%1)\n"
> " movq 40(%0), %%mm5\n"
> " movntq %%mm5, 40(%1)\n"
> " movq 48(%0), %%mm6\n"
> " movntq %%mm6, 48(%1)\n"
> " movq 56(%0), %%mm7\n"
> " movntq %%mm7, 56(%1)\n"
> : : "r" (from), "r" (to) : "memory");
> from+=64;
> to+=64;
> }
> __asm__ __volatile__ (
> " femms \n "
> " sfence\n" : :
> );
> __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
>
> }
>
>
> /*
> * This looks horribly ugly, but the compiler can optimize it totally,
> * as the count is constant.
> */
> static inline void * __constant_memcpy(void * to, const void * from,
size_t n)
> {
> switch (n) {
> case 0:
> return to;
> case 1:
> *(unsigned char *)to = *(const unsigned char
*)from;
> return to;
> case 2:
> *(unsigned short *)to = *(const unsigned short
*)from;
> return to;
> case 3:
> *(unsigned short *)to = *(const unsigned short
*)from;
> *(2+(unsigned char *)to) = *(2+(const unsigned
char *)from);
> return to;
> case 4:
> *(unsigned long *)to = *(const unsigned long
*)from;
> return to;
> case 6: /* for Ethernet addresses */
> *(unsigned long *)to = *(const unsigned long
*)from;
> *(2+(unsigned short *)to) = *(2+(const unsigned
short *)from);
> return to;
> case 8:
> *(unsigned long *)to = *(const unsigned long
*)from;
> *(1+(unsigned long *)to) = *(1+(const unsigned
long *)from);
> return to;
> case 12:
> *(unsigned long *)to = *(const unsigned long
*)from;
> *(1+(unsigned long *)to) = *(1+(const unsigned
long *)from);
> *(2+(unsigned long *)to) = *(2+(const unsigned
long *)from);
> return to;
> case 16:
> *(unsigned long *)to = *(const unsigned long
*)from;
> *(1+(unsigned long *)to) = *(1+(const unsigned
long *)from);
> *(2+(unsigned long *)to) = *(2+(const unsigned
long *)from);
> *(3+(unsigned long *)to) = *(3+(const unsigned
long *)from);
> return to;
> case 20:
> *(unsigned long *)to = *(const unsigned long
*)from;
> *(1+(unsigned long *)to) = *(1+(const unsigned
long *)from);
> *(2+(unsigned long *)to) = *(2+(const unsigned
long *)from);
> *(3+(unsigned long *)to) = *(3+(const unsigned
long *)from);
> *(4+(unsigned long *)to) = *(4+(const unsigned
long *)from);
> return to;
> }
> #define COMMON(x) \
> __asm__ __volatile__( \
> "rep ; movsl" \
> x \
> : "=&c" (d0), "=&D" (d1), "=&S" (d2) \
> : "0" (n/4),"1" ((long) to),"2" ((long) from) \
> : "memory");
> {
> int d0, d1, d2;
> switch (n % 4) {
> case 0: COMMON(""); return to;
> case 1: COMMON("\n\tmovsb"); return to;
> case 2: COMMON("\n\tmovsw"); return to;
> default: COMMON("\n\tmovsw\n\tmovsb"); return to;
> }
> }
>
> #undef COMMON
> }
>
>
> static void normal_copy_page(void *to, void *from)
> {
> __constant_memcpy(to,from,4096);
> }
>
>
> /*
> * This looks horribly ugly, but the compiler can optimize it totally,
> * as we by now know that both pattern and count is constant..
> */
> static inline void * __constant_c_and_count_memset(void * s, unsigned
long pattern, size_t count)
> {
> switch (count) {
> case 0:
> return s;
> case 1:
> *(unsigned char *)s = pattern;
> return s;
> case 2:
> *(unsigned short *)s = pattern;
> return s;
> case 3:
> *(unsigned short *)s = pattern;
> *(2+(unsigned char *)s) = pattern;
> return s;
> case 4:
> *(unsigned long *)s = pattern;
> return s;
> }
> #define COMMON(x) \
> __asm__ __volatile__( \
> "rep ; stosl" \
> x \
> : "=&c" (d0), "=&D" (d1) \
> : "a" (pattern),"0" (count/4),"1" ((long) s) \
> : "memory")
> {
> int d0, d1;
> switch (count % 4) {
> case 0: COMMON(""); return s;
> case 1: COMMON("\n\tstosb"); return s;
> case 2: COMMON("\n\tstosw"); return s;
> default: COMMON("\n\tstosw\n\tstosb"); return s;
> }
> }
>
> #undef COMMON
> }
>
> static void normal_clear_page(void *to)
> {
> __constant_c_and_count_memset(to,0,4096);
> }
>
> /* test version to see if we can go even faster */
> static void no_prefetch_copy_page(void *to, void *from) {
> int i, d1;
> char fpu_save[108];
>
> for (i=4096-256;i>=0;i-=256)
> __asm__ __volatile(
> "movl 192(%1,%2),%0\n"
> "movl 128(%1,%2),%0\n"
> "movl 64(%1,%2),%0\n"
> "movl 0(%1,%2),%0\n"
> : "=&r" (d1)
> : "r" (from), "r" (i));
>
> __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0])
);
>
> for(i=0; i<4096/64; i++) {
> __asm__ __volatile__ (
> " movq (%0), %%mm0\n"
> " movntq %%mm0, (%1)\n"
> " movq 8(%0), %%mm1\n"
> " movntq %%mm1, 8(%1)\n"
> " movq 16(%0), %%mm2\n"
> " movntq %%mm2, 16(%1)\n"
> " movq 24(%0), %%mm3\n"
> " movntq %%mm3, 24(%1)\n"
> " movq 32(%0), %%mm4\n"
> " movntq %%mm4, 32(%1)\n"
> " movq 40(%0), %%mm5\n"
> " movntq %%mm5, 40(%1)\n"
> " movq 48(%0), %%mm6\n"
> " movntq %%mm6, 48(%1)\n"
> " movq 56(%0), %%mm7\n"
> " movntq %%mm7, 56(%1)\n"
> : : "r" (from), "r" (to) : "memory");
> from+=64;
> to+=64;
> }
> __asm__ __volatile__ (
> " sfence \n "
> " emms\n"
> " frstor %0;\n" ::"m"(fpu_save[0]) );
> }
>
>
> #define rdtsc(low,high) \
> __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))
>
> typedef void (clear_func)(void *);
> typedef void (copy_func)(void *,void *);
>
> void test_one_clearpage(clear_func *func, char *name, char *Buffer)
> {
> char *temp;
> int i;
> unsigned int blow,bhigh,alow,ahigh;
> unsigned long long before,after;
>
> rdtsc(blow,bhigh);
> temp = Buffer;
> for (i=0;i<4*1024;i++) {
> func(temp);
> temp += 4096;
> }
> rdtsc(alow,ahigh);
> before = blow + (((long long)bhigh)<<32);
> after = alow +(((long long)ahigh)<<32);
> if (before>after) {
> printf("test invalid; timer overflow \n");
> return;
> }
> printf("clear_page function '%s'\t took %4lli cycles per
page\n",name,(after-before)/(4*1024) );
>
>
> }
>
> void test_one_copypage(copy_func *func, char *name, char *Buffer)
> {
> char *temp;
> int i;
> unsigned int blow,bhigh,alow,ahigh;
> unsigned long long before,after;
>
> sleep(1);
> rdtsc(blow,bhigh);
> temp = Buffer;
> for (i=0;i<2*1024;i++) {
> func(temp,temp+8*1024*1024);
> temp += 4096;
> }
> rdtsc(alow,ahigh);
> before = blow+ (((long long)bhigh)<<32);
> after = alow+(((long long)ahigh)<<32);
> if (before>after) {
> printf("test invalid; timer overflow \n");
> return;
> }
> printf("copy_page function '%s'\t took %4lli cycles per
page\n",name,(after-before)/(2*1024) );
>
>
> }
>
>
> void test_clearpage(char *Buffer)
> {
> printf("clear_page() tests \n");
>
> test_one_clearpage(fast_clear_page,"warm up run",Buffer);
> test_one_clearpage(normal_clear_page,"2.4 non MMX",Buffer);
> test_one_clearpage(slow_zero_page,"2.4 MMX fallback",Buffer);
> test_one_clearpage(fast_clear_page,"2.4 MMX version",Buffer);
>
test_one_clearpage(faster_clear_page,"faster_clear_page",Buffer);
>
test_one_clearpage(even_faster_clear_page,"even_faster_clear",Buffer);
> }
>
> void test_copypage(char *Buffer)
> {
> printf("copy_page() tests \n");
>
> test_one_copypage(fast_copy_page, "warm up run",Buffer);
> test_one_copypage(normal_copy_page,"2.4 non MMX",Buffer);
> test_one_copypage(slow_copy_page, "2.4 MMX fallback",Buffer);
> test_one_copypage(fast_copy_page, "2.4 MMX version",Buffer);
> test_one_copypage(faster_copy_page,"faster_copy",Buffer);
> test_one_copypage(even_faster_copy_page,"even_faster",Buffer);
> test_one_copypage(no_prefetch_copy_page,"no_prefetch",Buffer);
> }
>
> int main()
> {
> char *Buffer;
>
> Buffer = malloc(1024*1024*16);
> memset(Buffer,0xfe,1024*1024*16);
>
> printf("Athlon test program %s \n",cvsid);
>
> printf("\n");
> test_copypage(Buffer);
>
> free(Buffer);
>
> return 0;
> }
--
Jorge Bernal (Koke)
The software required Win95 or better, so I installed Linux
ICQ#: 63593654
MSN: koke_jb


Attachments:
signature.asc (232.00 B)
Esta parte del mensaje esta firmada digitalmente

2002-10-26 12:04:55

by Jurjen Oskam

[permalink] [raw]
Subject: Re: [CFT] faster athlon/duron memory copy implementation

On Thu, Oct 24, 2002 at 07:15:43PM +0200, Manfred Spraul wrote:

> Attached is a test app that compares several memory copy implementations.
> Could you run it and report the results to me, together with cpu,
> chipset and memory type?

joskam@hobbes:~> gcc -v
Reading specs from /usr/lib/gcc-lib/i486-suse-linux/2.95.3/specs
gcc version 2.95.3 20010315 (SuSE)
joskam@hobbes:~> gcc athlon.c
joskam@hobbes:~> ./a.out
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 19425 cycles per page
copy_page function '2.4 non MMX' took 22285 cycles per page
copy_page function '2.4 MMX fallback' took 21698 cycles per page
copy_page function '2.4 MMX version' took 19587 cycles per page
copy_page function 'faster_copy' took 11082 cycles per page
copy_page function 'even_faster' took 11203 cycles per page
copy_page function 'no_prefetch' took 7140 cycles per page
joskam@hobbes:~> ./a.out
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 19463 cycles per page
copy_page function '2.4 non MMX' took 22378 cycles per page
copy_page function '2.4 MMX fallback' took 21863 cycles per page
copy_page function '2.4 MMX version' took 19558 cycles per page
copy_page function 'faster_copy' took 11036 cycles per page
copy_page function 'even_faster' took 11292 cycles per page
copy_page function 'no_prefetch' took 7152 cycles per page
joskam@hobbes:~> ./a.out
Athlon test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $

copy_page() tests
copy_page function 'warm up run' took 21519 cycles per page
copy_page function '2.4 non MMX' took 22224 cycles per page
copy_page function '2.4 MMX fallback' took 21707 cycles per page
copy_page function '2.4 MMX version' took 19399 cycles per page
copy_page function 'faster_copy' took 11002 cycles per page
copy_page function 'even_faster' took 11211 cycles per page
copy_page function 'no_prefetch' took 7147 cycles per page
joskam@hobbes:~> cat /proc/cpuinfo
processor : 0
vendor_id : AuthenticAMD
cpu family : 6
model : 6
model name : AMD Athlon(TM) XP1700+
stepping : 2
cpu MHz : 1477.400
cache size : 256 KB
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 sep mtrr pge mca cmov
pat pse36 mmx fxsr sse syscall mmxext 3dnowext 3dnow
bogomips : 2949.12

This was run on an Asus A7V266-E motherboard with a KT266A chipset, with
512 MB of DDR SDRAM.


--
Jurjen Oskam

PGP Key available at http://www.stupendous.org/