This consists mainly of the optimized copy routine for PIII/P4.
It is basically identical to what was introduced in 2.5.45.
However, I tweaked it a bit: I removed the per-call access to 'movsl_mask',
which is unlikely to change at run time.
If anybody finds a better copy routine, there is no need to touch uaccess.h;
just edit this file.
Akira Tsukamoto
diff -Nur -X dontdiff linux-2.5.45/include/asm-i386/uaccess-intel.h linux-2.5.45-aki/include/asm-i386/uaccess-intel.h
--- linux-2.5.45/include/asm-i386/uaccess-intel.h Wed Dec 31 19:00:00 1969
+++ linux-2.5.45-aki/include/asm-i386/uaccess-intel.h Sat Nov 2 03:00:45 2002
@@ -0,0 +1,294 @@
+#ifndef __i386_UACCESS_INTEL_H
+#define __i386_UACCESS_INTEL_H
+/*
+ * PentiumIII/Pentium4 Copy To/From Userspace, taka version.
+ *
+ * Split into CPU specific files by Akira Tsukamoto to keep #ifdef noise down.
+ */
+
+#define MOVSL_MASK 7
+static inline int
+is_rep_movsl_faster(const void *a1, const void *a2, unsigned long n)
+{
+ if (n >= 64 && (((const long)a1 ^ (const long)a2) & MOVSL_MASK))
+ return 0;
+ return 1;
+}
+
+/* Using rep; movsl. */
+#define __copy_user_rep_movsl(to,from,size) \
+do { \
+ int __d0, __d1, __d2; \
+ __asm__ __volatile__( \
+ " cmp $7,%0\n" \
+ " jbe 1f\n" \
+ " movl %1,%0\n" \
+ " negl %0\n" \
+ " andl $7,%0\n" \
+ " subl %0,%3\n" \
+ "4: rep; movsb\n" \
+ " movl %3,%0\n" \
+ " shrl $2,%0\n" \
+ " andl $3,%3\n" \
+ " .align 2,0x90\n" \
+ "0: rep; movsl\n" \
+ " movl %3,%0\n" \
+ "1: rep; movsb\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "5: addl %3,%0\n" \
+ " jmp 2b\n" \
+ "3: lea 0(%3,%0,4),%0\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ ".section __ex_table,\"a\"\n" \
+ " .align 4\n" \
+ " .long 4b,5b\n" \
+ " .long 0b,3b\n" \
+ " .long 1b,2b\n" \
+ ".previous" \
+ : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
+ : "3"(size), "0"(size), "1"(to), "2"(from) \
+ : "memory"); \
+} while (0)
+
+#define __copy_user_zeroing_rep_movsl(to,from,size) \
+do { \
+ int __d0, __d1, __d2; \
+ __asm__ __volatile__( \
+ " cmp $7,%0\n" \
+ " jbe 1f\n" \
+ " movl %1,%0\n" \
+ " negl %0\n" \
+ " andl $7,%0\n" \
+ " subl %0,%3\n" \
+ "4: rep; movsb\n" \
+ " movl %3,%0\n" \
+ " shrl $2,%0\n" \
+ " andl $3,%3\n" \
+ " .align 2,0x90\n" \
+ "0: rep; movsl\n" \
+ " movl %3,%0\n" \
+ "1: rep; movsb\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "5: addl %3,%0\n" \
+ " jmp 6f\n" \
+ "3: lea 0(%3,%0,4),%0\n" \
+ "6: pushl %0\n" \
+ " pushl %%eax\n" \
+ " xorl %%eax,%%eax\n" \
+ " rep; stosb\n" \
+ " popl %%eax\n" \
+ " popl %0\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ ".section __ex_table,\"a\"\n" \
+ " .align 4\n" \
+ " .long 4b,5b\n" \
+ " .long 0b,3b\n" \
+ " .long 1b,6b\n" \
+ ".previous" \
+ : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
+ : "3"(size), "0"(size), "1"(to), "2"(from) \
+ : "memory"); \
+} while (0)
+
+
+/* Using bulk movl. */
+#define __copy_user_movl(to,from,size) \
+do { \
+ int d0, d1; \
+ __asm__ __volatile__( \
+ " .align 2,0x90\n" \
+ "0: movl 32(%4), %%eax\n" \
+ " cmpl $67, %0\n" \
+ " jbe 1f\n" \
+ " movl 64(%4), %%eax\n" \
+ " .align 2,0x90\n" \
+ "1: movl 0(%4), %%eax\n" \
+ " movl 4(%4), %%edx\n" \
+ "2: movl %%eax, 0(%3)\n" \
+ "21: movl %%edx, 4(%3)\n" \
+ " movl 8(%4), %%eax\n" \
+ " movl 12(%4),%%edx\n" \
+ "3: movl %%eax, 8(%3)\n" \
+ "31: movl %%edx, 12(%3)\n" \
+ " movl 16(%4), %%eax\n" \
+ " movl 20(%4), %%edx\n" \
+ "4: movl %%eax, 16(%3)\n" \
+ "41: movl %%edx, 20(%3)\n" \
+ " movl 24(%4), %%eax\n" \
+ " movl 28(%4), %%edx\n" \
+ "10: movl %%eax, 24(%3)\n" \
+ "51: movl %%edx, 28(%3)\n" \
+ " movl 32(%4), %%eax\n" \
+ " movl 36(%4), %%edx\n" \
+ "11: movl %%eax, 32(%3)\n" \
+ "61: movl %%edx, 36(%3)\n" \
+ " movl 40(%4), %%eax\n" \
+ " movl 44(%4), %%edx\n" \
+ "12: movl %%eax, 40(%3)\n" \
+ "71: movl %%edx, 44(%3)\n" \
+ " movl 48(%4), %%eax\n" \
+ " movl 52(%4), %%edx\n" \
+ "13: movl %%eax, 48(%3)\n" \
+ "81: movl %%edx, 52(%3)\n" \
+ " movl 56(%4), %%eax\n" \
+ " movl 60(%4), %%edx\n" \
+ "14: movl %%eax, 56(%3)\n" \
+ "91: movl %%edx, 60(%3)\n" \
+ " addl $-64, %0\n" \
+ " addl $64, %4\n" \
+ " addl $64, %3\n" \
+ " cmpl $63, %0\n" \
+ " ja 0b\n" \
+ "5: movl %0, %%eax\n" \
+ " shrl $2, %0\n" \
+ " andl $3, %%eax\n" \
+ " cld\n" \
+ "6: rep; movsl\n" \
+ " movl %%eax, %0\n" \
+ "7: rep; movsb\n" \
+ "8:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "9: lea 0(%%eax,%0,4),%0\n" \
+ " jmp 8b\n" \
+ ".previous\n" \
+ ".section __ex_table,\"a\"\n" \
+ " .align 4\n" \
+ " .long 2b,8b\n" \
+ " .long 21b,8b\n" \
+ " .long 3b,8b\n" \
+ " .long 31b,8b\n" \
+ " .long 4b,8b\n" \
+ " .long 41b,8b\n" \
+ " .long 10b,8b\n" \
+ " .long 51b,8b\n" \
+ " .long 11b,8b\n" \
+ " .long 61b,8b\n" \
+ " .long 12b,8b\n" \
+ " .long 71b,8b\n" \
+ " .long 13b,8b\n" \
+ " .long 81b,8b\n" \
+ " .long 14b,8b\n" \
+ " .long 91b,8b\n" \
+ " .long 6b,9b\n" \
+ " .long 7b,8b\n" \
+ ".previous" \
+ : "=&c"(size), "=&D" (d0), "=&S" (d1) \
+ : "1"(to), "2"(from), "0"(size) \
+ : "eax", "edx", "memory"); \
+} while (0)
+
+#define __copy_user_zeroing_movl(to,from,size) \
+do { \
+ int d0, d1; \
+ __asm__ __volatile__( \
+ " .align 2,0x90\n" \
+ "0: movl 32(%4), %%eax\n" \
+ " cmpl $67, %0\n" \
+ " jbe 2f\n" \
+ "1: movl 64(%4), %%eax\n" \
+ " .align 2,0x90\n" \
+ "2: movl 0(%4), %%eax\n" \
+ "21: movl 4(%4), %%edx\n" \
+ " movl %%eax, 0(%3)\n" \
+ " movl %%edx, 4(%3)\n" \
+ "3: movl 8(%4), %%eax\n" \
+ "31: movl 12(%4),%%edx\n" \
+ " movl %%eax, 8(%3)\n" \
+ " movl %%edx, 12(%3)\n" \
+ "4: movl 16(%4), %%eax\n" \
+ "41: movl 20(%4), %%edx\n" \
+ " movl %%eax, 16(%3)\n" \
+ " movl %%edx, 20(%3)\n" \
+ "10: movl 24(%4), %%eax\n" \
+ "51: movl 28(%4), %%edx\n" \
+ " movl %%eax, 24(%3)\n" \
+ " movl %%edx, 28(%3)\n" \
+ "11: movl 32(%4), %%eax\n" \
+ "61: movl 36(%4), %%edx\n" \
+ " movl %%eax, 32(%3)\n" \
+ " movl %%edx, 36(%3)\n" \
+ "12: movl 40(%4), %%eax\n" \
+ "71: movl 44(%4), %%edx\n" \
+ " movl %%eax, 40(%3)\n" \
+ " movl %%edx, 44(%3)\n" \
+ "13: movl 48(%4), %%eax\n" \
+ "81: movl 52(%4), %%edx\n" \
+ " movl %%eax, 48(%3)\n" \
+ " movl %%edx, 52(%3)\n" \
+ "14: movl 56(%4), %%eax\n" \
+ "91: movl 60(%4), %%edx\n" \
+ " movl %%eax, 56(%3)\n" \
+ " movl %%edx, 60(%3)\n" \
+ " addl $-64, %0\n" \
+ " addl $64, %4\n" \
+ " addl $64, %3\n" \
+ " cmpl $63, %0\n" \
+ " ja 0b\n" \
+ "5: movl %0, %%eax\n" \
+ " shrl $2, %0\n" \
+ " andl $3, %%eax\n" \
+ " cld\n" \
+ "6: rep; movsl\n" \
+ " movl %%eax,%0\n" \
+ "7: rep; movsb\n" \
+ "8:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "9: lea 0(%%eax,%0,4),%0\n" \
+ "16: pushl %0\n" \
+ " pushl %%eax\n" \
+ " xorl %%eax,%%eax\n" \
+ " rep; stosb\n" \
+ " popl %%eax\n" \
+ " popl %0\n" \
+ " jmp 8b\n" \
+ ".previous\n" \
+ ".section __ex_table,\"a\"\n" \
+ " .align 4\n" \
+ " .long 0b,16b\n" \
+ " .long 1b,16b\n" \
+ " .long 2b,16b\n" \
+ " .long 21b,16b\n" \
+ " .long 3b,16b\n" \
+ " .long 31b,16b\n" \
+ " .long 4b,16b\n" \
+ " .long 41b,16b\n" \
+ " .long 10b,16b\n" \
+ " .long 51b,16b\n" \
+ " .long 11b,16b\n" \
+ " .long 61b,16b\n" \
+ " .long 12b,16b\n" \
+ " .long 71b,16b\n" \
+ " .long 13b,16b\n" \
+ " .long 81b,16b\n" \
+ " .long 14b,16b\n" \
+ " .long 91b,16b\n" \
+ " .long 6b,9b\n" \
+ " .long 7b,16b\n" \
+ ".previous" \
+ : "=&c"(size), "=&D" (d0), "=&S" (d1) \
+ : "1"(to), "2"(from), "0"(size) \
+ : "eax", "edx", "memory"); \
+} while (0)
+
+/* These two will go inside copy_to/from_user() in usercopy.c */
+#define __do_copy_user(to,from,n) \
+do { \
+ if (is_rep_movsl_faster(to, from, n)) \
+ __copy_user_rep_movsl(to, from, n); \
+ else \
+ __copy_user_movl(to, from, n); \
+} while (0)
+
+#define __do_copy_user_zeroing(to,from,n) \
+do { \
+ if (is_rep_movsl_faster(to, from, n)) \
+ __copy_user_zeroing_rep_movsl(to,from,n); \
+ else \
+ __copy_user_zeroing_movl(to, from, n); \
+} while (0)
+
+#endif /* __i386_UACCESS_INTEL_H */
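For reference, here is a minimal sketch of how the two dispatch macros above could be wired into arch/i386/lib/usercopy.c, following the existing copy_to_user()/copy_from_user() convention in which the macro leaves the number of uncopied bytes in n. This is illustrative only, not part of the patch:

#include <linux/string.h>
#include <asm/uaccess.h>

unsigned long
copy_to_user(void *to, const void *from, unsigned long n)
{
	if (access_ok(VERIFY_WRITE, to, n))
		__do_copy_user(to, from, n);	/* n becomes the uncopied byte count */
	return n;
}

unsigned long
copy_from_user(void *to, const void *from, unsigned long n)
{
	if (access_ok(VERIFY_READ, from, n))
		__do_copy_user_zeroing(to, from, n);
	else
		memset(to, 0, n);		/* preserve the zeroing guarantee */
	return n;
}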
Akira Tsukamoto wrote:
>
> This consists mainly of the optimized copy routine for PIII/P4.
>
> It is basically identical to what was introduced in 2.5.45.
But you've inlined them again. Your patches increase my kernel
size by 17 kbytes, which is larger than my entire Layer 1 instruction
cache!
I'd prefer that we have these functions in .c, and laid out with
a minimum of C tricks. Because more work needs to be done on the
memory copy functions, and doing that in header files is a pain.
(That is, using the movnta instructions for well-aligned copies
and clears so that we don't read the destination memory while overwriting
it).
Hopefully, yes, we can end up removing the runtime-selectable alignment
mask. I left that in at present because it provides the infrastructure
for making other runtime-selectable decisions about how to perform
copies and clears. Distributors like to be able to ship a minimum
number of kernels (say, just a PII-compiled kernel) and we want those
to run as well as possible on PIII and P4.
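To make that concrete, here is a rough sketch of the kind of non-temporal copy being described. It is purely an illustration, not code from any patch in this thread; the function name is invented, and the SSE2 movnti form is used because it works on integer registers:

/*
 * Non-temporal copy of a 4-byte-aligned, 4-byte-multiple region.
 * movnti writes around the cache instead of pulling the destination
 * lines in first, which is the point being made above.
 */
static void nt_copy(void *to, const void *from, unsigned long bytes)
{
	unsigned long i;

	for (i = 0; i < bytes; i += 4) {
		unsigned int tmp = *(const unsigned int *)((const char *)from + i);

		/* store without allocating a destination cache line */
		__asm__ __volatile__("movnti %1, %0"
				     : "=m" (*(unsigned int *)((char *)to + i))
				     : "r" (tmp));
	}
	/* flush the weakly-ordered stores before anyone reads the result */
	__asm__ __volatile__("sfence" ::: "memory");
}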
Andrew Morton <[email protected]> writes:
> (That is, using the movnta instructions for well-aligned copies
> and clears so that we don't read the destination memory while overwriting
> it).
I did some experiments with movnta and it was nearly always a loss for
memcpy/copy_*_user type stuff. The reason is that it flushes the destination
out of cache and when you try to read it afterwards for some reason
(which happens often - e.g. most copy_*_user uses actually do access it
afterwards) then you eat a full cache miss for them and that is costly
and kills all other advantages.
It may be a win for direct copy-to-page cache and then page cache DMA
outside and page cache not mapped anywhere, but even then it's not completely
clear it's that helpful to have it not in cache. For example an Athlon
can serve a DMA directly out of its CPU caches, and that may be
faster than serving it out of RAM (Intel CPUs cannot, however).
-Andi
Andi Kleen wrote:
>
> Andrew Morton <[email protected]> writes:
>
> > (That is, using the movnta instructions for well-aligned copies
> > and clears so that we don't read the destination memory while overwriting
> > it).
>
> I did some experiments with movnta and it was nearly always a loss for
> memcpy/copy_*_user type stuff. The reason is that it flushes the destination
> out of cache and when you try to read it afterwards for some reason
> (which happens often - e.g. most copy_*_user uses actually do access it
> afterwards) then you eat a full cache miss for them and that is costly
> and kills all other advantages.
Oh. I was under the impression that the destination ended up in the
CPU caches.
Yes, if that's not the case then the whole thing is pretty useless.
> It may be a win for direct copy-to-page cache and then page cache DMA
> outside and page cache not mapped anywhere, but even then it's not completely
> clear it's that helpful to have it not in cache. For example an Athlon
> can serve a DMA directly out of its CPU caches, and that may be
> faster than serving it out of RAM (Intel CPUs cannot, however).
So it may be applicable to write(2) on intel.
> That depends on size. If you do a huge memcpy (say 1 MB) it still
> wins by a wide margin. Not that we do such huge operations often,
> but the code can check the size and pick different routines for small
> and big blocks.
The kernel never does such huge memcpys. It rarely handles any buffer
bigger than a page (4K).
-Andi
On 2 November 2002 08:58, Andi Kleen wrote:
> Andrew Morton <[email protected]> writes:
> > (That is, using the movnta instructions for well-aligned copies
> > and clears so that we don't read the destination memory while
> > overwriting it).
>
> I did some experiments with movnta and it was nearly always a loss for
> memcpy/copy_*_user type stuff. The reason is that it flushes the
> destination out of cache and when you try to read it afterwards for
> some reason (which happens often - e.g. most copy_*_user uses
> actually do access it afterwards) then you eat a full cache miss for
> them and that is costly and kills all other advantages.
That depends on size. If you do a huge memcpy (say 1 MB) it still
wins by a wide margin. Not that we do such huge operations often,
but the code can check the size and pick different routines for small
and big blocks.
--
vda
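Purely as an illustration of the size check being suggested above (nothing here is from the thread; nt_copy() is a made-up stand-in for whatever non-temporal routine gets chosen, and the 1 MB cut-off is an arbitrary placeholder, not a measured threshold):

#include <string.h>

#define NT_COPY_THRESHOLD	(1024UL * 1024UL)

void nt_copy(void *to, const void *from, unsigned long n);	/* hypothetical non-temporal copy */

static void copy_block(void *to, const void *from, unsigned long n)
{
	if (n >= NT_COPY_THRESHOLD)
		nt_copy(to, from, n);	/* huge block: stream past the cache */
	else
		memcpy(to, from, n);	/* ordinary cached copy otherwise */
}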
On Sat, 02 Nov 2002 02:32:32 -0800
Andrew Morton <[email protected]> mentioned:
> Akira Tsukamoto wrote:
> >
> But you've inlined them again. Your patches increase my kernel
> size by 17 kbytes, which is larger than my entire Layer 1 instruction
> cache!
It is because I was working on this patch based on 2.5.44 :)
As Andi mentioned, how about selecting based on whether CONFIG_SMALL is defined?
On 2 November 2002 10:09, Andi Kleen wrote:
> > That depends on size. If you do a huge memcpy (say 1 MB) it still
> > wins by a wide margin. Not that we do such huge operations often,
> > but the code can check the size and pick different routines for small
> > and big blocks.
>
> The kernel never does such huge memcpys. It rarely handles any
> buffer bigger than a page (4K).
Okay, I take that.
How did you determine that movntXX stores are a net loss?
I thought about it and didn't come to a working solution.
It's easy to time memcpy() but harder to measure subsequent
cache misses when the copied data gets accessed. We can read it
back after the memcpy and measure memcpy()+read, but does the entire
copy get used immediately after memcpy() in real-world usage?
We're in benchmarking hell :(
--
vda
> It's easy to time memcpy() but harder to measure subsequent
> cache misses when the copied data gets accessed. We can read it
> back after the memcpy and measure memcpy()+read, but does the entire
> copy get used immediately after memcpy() in real-world usage?
> We're in benchmarking hell :(
You test common operations, like pipe bandwidth or ioctls.
-Andi
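For what it's worth, that kind of pipe-bandwidth test fits in a few dozen lines of userspace code, in the spirit of lmbench's bw_pipe. Everything below is an illustrative sketch, not something posted in this thread:

#include <stdio.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/wait.h>

#define BUFSZ	(64 * 1024)
#define TOTAL	(256UL * 1024 * 1024)	/* bytes pushed through the pipe */

int main(void)
{
	static char buf[BUFSZ];
	int fd[2];
	unsigned long left;
	struct timeval t0, t1;
	double secs;

	if (pipe(fd) < 0)
		return 1;

	if (fork() == 0) {			/* child: drain the pipe */
		close(fd[1]);
		while (read(fd[0], buf, BUFSZ) > 0)
			;
		_exit(0);
	}

	close(fd[0]);
	gettimeofday(&t0, NULL);
	for (left = TOTAL; left; left -= BUFSZ)
		if (write(fd[1], buf, BUFSZ) != BUFSZ)	/* each write exercises copy_from_user */
			break;
	close(fd[1]);
	wait(NULL);
	gettimeofday(&t1, NULL);

	secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_usec - t0.tv_usec) / 1e6;
	printf("%.1f MB/s\n", TOTAL / secs / 1e6);
	return 0;
}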
Akira Tsukamoto wrote:
>
> On Sat, 02 Nov 2002 02:32:32 -0800
> Andrew Morton <[email protected]> mentioned:
>
> > Akira Tsukamoto wrote:
> > >
> > But you've inlined them again. Your patches increase my kernel
> > size by 17 kbytes, which is larger than my entire Layer 1 instruction
> > cache!
>
> It is because I was working on this patch based on 2.5.44 :)
>
> As Andi mentioned, how about selecting based on whether CONFIG_SMALL is defined?
Well you could make it dependent on CONFIG_SLOW_KERNEL ;)
It's an act of faith - we have no benchmarks on this. But it
goes like this:
- subroutine calls are fast
- cache misses are slow
- kernel has no right to be evicting user code from the CPU cache
- smaller is faster
- inlining to the point of increasing code size is probably wrong
On Sat, 02 Nov 2002 10:13:46 -0800
Andrew Morton <[email protected]> mentioned:
> Well you could make it dependent on CONFIG_SLOW_KERNEL ;)
From my patch, about the speed:
for PIII/4 CPU -> no change. using the same 2.5.45 copy.
for old i386 -> more optimal.
for Athlon -> 2.5.45 does not use unrolled copy for it either.
I am away for a while, but I grew up in Japan and just wanted to save
some room for an embedded system like the one below.
http://linux.ascii24.com/linux/news/today/2000/05/22/imageview/images461056.jpg.html
The physical size (of the device, not the kernel) is about 5cm*5cm.
Is your copy function suitable in this case?
On Sat, 02 Nov 2002 10:13:46 -0800
Andrew Morton <[email protected]> mentioned:
> Akira Tsukamoto wrote:
> > > size by 17 kbytes, which is larger than my entire Layer 1 instruction
> > > cache!
> >
> > It is because I was working on this patch based on 2.5.44 :)
Oh, oh,
I meant by the above that I *do* agree with you. I just wanted to say
that the two functions were kept inline because they had been inline from
before 2.4 up through 2.5.44, and when I received the faster-intel-copy from
Takahashi I was working on my patch against 2.5.44.
Just moving __copy_to/from_user into usercopy.c will decrease the
kernel size and should be almost the same as before.
Am I missing something?
> - cache misses are slow
> - kernel has no right to be evicting user code from the CPU cache
Is this relevant to this patch?
I did not change anything about that in my patch.
> - subroutine calls are fast
You mean almost no overhead.
> - smaller is faster
It could be called more efficient, but is it faster?
Is the code or binary size directly connected to this issue?
> - inlining to the point of increasing code size is probably wrong
I agree, as I said in my first comment.
On Sat, 02 Nov 2002 02:32:32 -0800
Andrew Morton <[email protected]> mentioned:
> I'd prefer that we have these functions in .c, and laid out with
> a minimum of C tricks. Because more work needs to be done on the
> memory copy functions, and doing that in header files is a pain.
Personally I don't mind putting everything in .h or in .c.
I just followed the convention that string.h and string-486.h use.
Isn't putting everything in .c also confusing?
Akira Tsukamoto wrote:
>
> > - smaller is faster
>
> It could be called more efficient, but is it faster?
Yes, faster. From a whole-system point of view, which is after
all what we care about.
Cache misses are terribly expensive.
> Is the code or binary size directly connected to this issue?
>
Sure. Large cache footprints in the kernel require more cache
reloads by the kernel and cause more cache reloads on return
to userspace. Thrashing.
> From my patch, about the speed:
> for PIII/4 CPU -> no change. using the same 2.5.45 copy.
> for old i386 -> more optimal.
> for Athlon -> 2.5.45 does not use unrolled copy for it either.
OK. Please integrate your patch into the current kernel's usercopy.c.
> I am away for a while, but I grew up in Japan and just wanted to save
> some room for an embedded system like the one below.
> http://linux.ascii24.com/linux/news/today/2000/05/22/imageview/images461056.jpg.html
> The physical size (of the device, not the kernel) is about 5cm*5cm.
> Is your copy function suitable in this case?
Well it won't hurt, but it looks like yours improves on it.
The thing which requires some thought is "should the decision
be made at compile time or runtime". For Athlon vs Intel
and i386 vs others, it should be performed at compile time.
On Sat, Nov 02, 2002 at 09:57:49PM -0500, Akira Tsukamoto wrote:
> Personally I don't mind putting everything in .h or in .c.
> I just followed the convention that string.h and string-486.h use.
> Isn't putting everything in .c also confusing?
As a sidenote, string-486.h has been disabled for years
(Right back to at least 2.0.39 which is the earliest tree I have
to hand right now). It should either be fixed, or considered
worth dropping imo.
Dave
--
| Dave Jones. http://www.codemonkey.org.uk
On Sun, Nov 03, 2002 at 09:24:21PM +0000, Dave Jones wrote:
> As a sidenote, string-486.h has been disabled for years
> (Right back to at least 2.0.39 which is the earliest tree I have
> to hand right now). It should either be fixed, or considered
> worth dropping imo.
--- v1.3.75/linux/include/asm-i386/string.h Wed Feb 28 11:50:11 1996
+++ linux/include/asm-i386/string.h Mon Mar 18 10:52:08 1996
@@ -6,8 +6,11 @@
* byte string operations. But on a 386 or a PPro the
* byte string ops are faster than doing it by hand
* (MUCH faster on a Pentium).
+ *
+ * Also, the byte strings actually work correctly. Forget
+ * the i486 routines for now as they may be broken..
*/
-#if CPU == 486 || CPU == 586
+#if FIXED_486_STRING && (CPU == 486 || CPU == 586)
#include <asm/string-486.h>
#else
On Sat, 02 Nov 2002 20:04:32 -0800
Andrew Morton <[email protected]> mentioned:
> > From my patch, about the speed:
> > for PIII/4 CPU -> no change. using the same 2.5.45 copy.
> > for old i386 -> more optimal.
> > for Athlon -> 2.5.45 does not use unrolled copy for it either.
>
> OK. Please integrate your patch into the current kernel's usercopy.c.
I will make a revised patch, removing the inlining and putting the functions inside usercopy.c.
> The thing which requires some thought is "should the decision
> be made at compile time or runtime". For Athlon vs Intel
> and i386 vs others, it should be performed at compile time.
I ran faster_intel_copy on my Athlon and it works OK and is much, much faster,
so how about grouping by CPU type, roughly as sketched below:
generic i386/i486:
    use the original REP MOVSL copy
generic i586:
    keep as in the current 2.5.45
    (use the revised REP MOVSL copy)
generic i686:
    use the revised REP MOVSL copy and the unrolled MOVL copy
If SSE or 3DNow! variants come out, select them by MPENTIUMIII/4/K7.
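A hypothetical compile-time grouping along those lines (the CONFIG_M* symbols are the usual i386 processor-type options; the __arch_copy_user() name is invented for this sketch, and the SSE/3DNow! branch is only a placeholder):

#if defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUM4) || defined(CONFIG_MK7)
/* room to hang SSE/3DNow! variants here later */
# define __arch_copy_user(to, from, n)	__do_copy_user(to, from, n)
#elif defined(CONFIG_M686)
/* revised rep movsl plus the unrolled movl copy */
# define __arch_copy_user(to, from, n)	__do_copy_user(to, from, n)
#elif defined(CONFIG_M586) || defined(CONFIG_M586TSC) || defined(CONFIG_M586MMX)
/* revised rep movsl only */
# define __arch_copy_user(to, from, n)	__copy_user_rep_movsl(to, from, n)
#else
/* generic i386/i486: plain rep movsl copy */
# define __arch_copy_user(to, from, n)	__copy_user_rep_movsl(to, from, n)
#endif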