// $Header$
// Kernel Version:
// VERSION = 2
// PATCHLEVEL = 4
// SUBLEVEL = 1
// EXTRAVERSION =
--- 2.4/arch/i386/config.in Sat Feb 3 14:02:24 2001
+++ build-2.4/arch/i386/config.in Fri Feb 9 15:52:19 2001
@@ -91,6 +91,7 @@
define_bool CONFIG_X86_GOOD_APIC y
define_bool CONFIG_X86_PGE y
define_bool CONFIG_X86_USE_PPRO_CHECKSUM y
+ define_bool CONFIG_X86_USE_SSE y
fi
if [ "$CONFIG_MPENTIUM4" = "y" ]; then
define_int CONFIG_X86_L1_CACHE_SHIFT 7
@@ -98,6 +99,7 @@
define_bool CONFIG_X86_GOOD_APIC y
define_bool CONFIG_X86_PGE y
define_bool CONFIG_X86_USE_PPRO_CHECKSUM y
+ define_bool CONFIG_X86_USE_SSE y
fi
if [ "$CONFIG_MK6" = "y" ]; then
define_int CONFIG_X86_L1_CACHE_SHIFT 5
--- 2.4/arch/i386/kernel/i386_ksyms.c Sat Feb 3 14:02:24 2001
+++ build-2.4/arch/i386/kernel/i386_ksyms.c Fri Feb 9 15:52:19 2001
@@ -117,6 +117,11 @@
EXPORT_SYMBOL(mmx_copy_page);
#endif
+#ifdef CONFIG_X86_USE_SSE
+EXPORT_SYMBOL(sse_clear_page);
+EXPORT_SYMBOL(sse_copy_page);
+#endif
+
#ifdef CONFIG_SMP
EXPORT_SYMBOL(cpu_data);
EXPORT_SYMBOL(kernel_flag);
diff -urN --exclude .depend 2.4/arch/i386/lib/Makefile build-2.4/arch/i386/lib/Makefile
--- 2.4/arch/i386/lib/Makefile Sat Feb 3 14:02:24 2001
+++ build-2.4/arch/i386/lib/Makefile Fri Feb 9 15:52:19 2001
@@ -12,6 +12,7 @@
memcpy.o
obj-$(CONFIG_X86_USE_3DNOW) += mmx.o
+obj-$(CONFIG_X86_USE_SSE) += sse.o
obj-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
include $(TOPDIR)/Rules.make
diff -urN --exclude .depend 2.4/arch/i386/lib/sse.c build-2.4/arch/i386/lib/sse.c
--- 2.4/arch/i386/lib/sse.c Thu Jan 1 01:00:00 1970
+++ build-2.4/arch/i386/lib/sse.c Fri Feb 9 15:52:19 2001
@@ -0,0 +1,89 @@
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+
+#include <asm/i387.h>
+
+/*
+ * SSE library helper functions
+ *
+ * Copyright (C) 2001 Manfred Spraul
+ *
+ * Based on Intel sample code from
+ * Block Copy Using Pentium(R) III Streaming SIMD Extensions
+ * Revision 1.9
+ * January 12, 1999
+ *
+ */
+
+
+void sse_clear_page(void * page)
+{
+ int storage[4];
+ int d0, d1, d2;
+ __asm__ __volatile__(
+ "mov %%cr0, %2\n\t"
+ "clts\n\t"
+ "movups %%xmm0, (%3)\n\t"
+ "xorps %%xmm0, %%xmm0\n\t"
+ "xor %0, %0\n\t"
+ "1: movntps %%xmm0, (%1)\n\t"
+ "movntps %%xmm0, 16(%1)\n\t"
+ "movntps %%xmm0, 32(%1)\n\t"
+ "movntps %%xmm0, 48(%1)\n\t"
+ "movntps %%xmm0, 64(%1)\n\t"
+ "movntps %%xmm0, 80(%1)\n\t"
+ "movntps %%xmm0, 96(%1)\n\t"
+ "movntps %%xmm0, 112(%1)\n\t"
+ "add $128, %1\n\t"
+ "inc %0\n\t"
+ "cmp $32, %0\n\t"
+ "jne 1b\n\t"
+ "movups (%3), %%xmm0\n\t"
+ "sfence\n\t"
+ "mov %2, %%cr0\n\t"
+ : "=&r" (d0), "=&r" (d1), "=&r" (d2)
+ : "r" (&storage), "1" (page)
+ : "cc", "memory");
+}
+
+void sse_copy_page(void *to, void *from)
+{
+ int storage[16];
+ int d0, d1, d2, d3;
+ __asm__ __volatile__(
+ "mov %%cr0, %3\n\t" /* step 1: enable the FPU */
+ "clts\n\t"
+ "movups %%xmm0, (%4)\n\t" /* step 2: save the clobbered regs */
+ "movups %%xmm1, 16(%4)\n\t"
+ "movups %%xmm2, 32(%4)\n\t"
+ "movups %%xmm3, 48(%4)\n\t"
+ "mov (%2), %0\n\t" /* step 3: load the TLB */
+ "xor %0, %0\n\t" /* step 4: prefetch the page */
+ "1:prefetchnta (%2, %0)\n\t"
+ "prefetchnta 32(%2, %0)\n\t"
+ "add $64,%0\n\t"
+ "cmp $4096, %0\n\t"
+ "jne 1b\n\t"
+ "2: movaps (%2), %%xmm0\n\t" /* step 5: copy the page */
+ "movaps 16(%2), %%xmm1\n\t"
+ "movaps 32(%2), %%xmm2\n\t"
+ "movaps 48(%2), %%xmm3\n\t"
+ "add $64, %2\n\t"
+ "movntps %%xmm0, (%1)\n\t"
+ "movntps %%xmm1, 16(%1)\n\t"
+ "movntps %%xmm2, 32(%1)\n\t"
+ "movntps %%xmm3, 48(%1)\n\t"
+ "add $64, %1\n\t"
+ "sub $64, %0\n\t"
+ "jnz 2b\n\t"
+ "movups (%4), %%xmm0\n\t" /* step 6: restore the clobbered regs */
+ "movups 16(%4), %%xmm1\n\t"
+ "movups 32(%4), %%xmm2\n\t"
+ "movups 48(%4), %%xmm3\n\t"
+ "sfence\n\t"
+ "mov %3, %%cr0\n\t" /* step 7: restore cr0 */
+ : "=&r" (d0), "=&r" (d1), "=&r" (d2), "=&r" (d3)
+ : "r" (&storage), "1" (to), "2" (from)
+ : "cc", "memory");
+}
diff -urN 2.4/include/asm-i386/page.h build-2.4/include/asm-i386/page.h
--- 2.4/include/asm-i386/page.h Thu Jan 4 23:50:46 2001
+++ build-2.4/include/asm-i386/page.h Fri Feb 9 15:52:19 2001
@@ -11,7 +11,14 @@
#include <linux/config.h>
-#ifdef CONFIG_X86_USE_3DNOW
+#ifdef CONFIG_X86_USE_SSE
+
+#include <asm/sse.h>
+
+#define clear_page(page) sse_clear_page(page)
+#define copy_page(to,from) sse_copy_page(to,from)
+
+#elif defined(CONFIG_X86_USE_3DNOW)
#include <asm/mmx.h>
diff -urN 2.4/include/asm-i386/sse.h build-2.4/include/asm-i386/sse.h
--- 2.4/include/asm-i386/sse.h Thu Jan 1 01:00:00 1970
+++ build-2.4/include/asm-i386/sse.h Fri Feb 9 17:26:34 2001
@@ -0,0 +1,11 @@
+#ifndef _ASM_SSE_H
+#define _ASM_SSE_H
+
+/*
+ * SSE helper operations
+ */
+
+extern void sse_clear_page(void *page);
+extern void sse_copy_page(void *to, void *from);
+
+#endif
In article <[email protected]>,
Manfred Spraul <[email protected]> wrote:
>
>* use sse for normal memcopy. The main advantage of sse over mmx is
>that only the clobbered registers must be saved, not the full fpu state.
>
>* verify that the code doesn't break SSE enabled apps.
>I checked an sse enabled mp3 encoder and Mesa.
Ehh.. Did you try this with pending FPU exceptions that have not yet
triggered?
I have this strong suspicion that your kernel will lock up in a bad way
if you have somebody do something like divide by zero without actually
touching a single FP instruction after the divide (so that the error has
happened, but has not yet been raised as an exception).
And when it hits your SSE copy routines with the pending error, it will
likely loop forever on taking the fault in kernel space.
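To make the scenario concrete, here is a hypothetical user-space sketch (i386/gcc assumed, not from the thread) of how such a pending exception can be armed:

/*
 * Hypothetical sketch: unmask the x87 zero-divide exception, divide by
 * zero, and then deliberately execute no further FP instruction.  The
 * error is latched in the FPU status word but not yet delivered; it is
 * raised at the next waiting FP/WAIT instruction -- which could be one
 * the kernel executes in an FP-based copy routine.
 */
static void arm_pending_fpu_exception(void)
{
	unsigned short cw = 0x037f & ~0x0004;	/* default control word, #Z unmasked */
	double zero = 0.0, one = 1.0;

	__asm__ __volatile__(
		"fldcw %0\n\t"		/* unmask zero-divide */
		"fldl %2\n\t"		/* st(0) = 1.0 */
		"fdivl %1\n\t"		/* 1.0 / 0.0: error latched, not yet raised */
		: : "m" (cw), "m" (zero), "m" (one));
	/* deliberately no fwait here: the exception stays pending,
	   and the x87 stack is intentionally left dirty */
}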
Basically, kernel use of MMX and SSE is a lot harder to get right than
many people seem to realize. Why do you think I threw out all the
patches that tried to do this?
And no, the bug won't show up in any normal testing. You'll never know
about it until somebody malicious turns your machine into a doorstop.
Finally, did you actually see any performance gain in any benchmarks?
Linus
Linus Torvalds wrote:
>
> In article <[email protected]>,
> Manfred Spraul <[email protected]> wrote:
> >
> >* use sse for normal memcopy. The main advantage of sse over mmx is
> >that only the clobbered registers must be saved, not the full fpu state.
> >
> >* verify that the code doesn't break SSE enabled apps.
> >I checked an sse enabled mp3 encoder and Mesa.
>
> Ehh.. Did you try this with pending FPU exceptions that have not yet
> triggered?
There are other gotcha's here as well (speaking from experience :-(
> I have this strong suspicion that your kernel will lock up in a bad way
> if you have somebody do something like divide by zero without actually
> touching a single FP instruction after the divide (so that the error has
> happened, but has not yet been raised as an exception).
Or much worse, let the kernel mix-and-match SSE and MMX optimized routines
without doing full saves of the FPU on SSE routines, which leads to FPU saves
in MMX routines with kernel data in the SSE registers, which then shows up
when the app touches those SSE registers and you get user space corruption. My
code to handle this type of situation was *very* complex, and I don't think I
ever got it quite perfectly right without simply imposing a rule that the
kernel could never use both SSE and MMX instructions on the same CPU.
> And when it hits your SSE copy routines with the pending error, it will
> likely loop forever on taking the fault in kernel space.
>
> Basically, kernel use of MMX and SSE are a lot harder to get right than
> many people seem to realize. Why do you think I threw out all the
> patches that tried to do this?
>
> And no, the bug won't show up in any normal testing. You'll never know
> about it until somebody malicious turns your machine into a doorstop.
>
> Finally, did you actually see any performance gain in any benchmarks?
Now this I can comment on. Real life gain. We had to re-run a SpecWEB99 test
after taking the SSE instructions out of our (buggy) 2.2.14-5.0 kernel that
shipped in Red Hat Linux 6.2 (hanging bag over my head). We recompiled the
exact same kernel without the SSE stuff, then re-ran the SpecWEB99 test, and
there was a performance drop of roughly 2% overall from the change. So, yes,
they make an aggregate performance difference to the typical running of the
kernel (and this was without copy_page or zero_page being optimized, instead I
had done memset, memcpy, copy_*_user as optimized routines) and they make a
huge difference to benchmarks that target the areas they help the most (like
disk I/O benchmarks which would sometimes see a 40% or more boost in
performance).
--
Doug Ledford <[email protected]> http://people.redhat.com/dledford
Please check my web site for aic7xxx updates/answers before
e-mailing me about problems
Doug Ledford wrote:
>
> > I have this strong suspicion that your kernel will lock up in a bad way
> > if you have somebody do something like divide by zero without actually
> > touching a single FP instruction after the divide (so that the error has
> > happened, but has not yet been raised as an exception).
>
> Or much worse, let the kernel mix-and-match SSE and MMX optimized routines
> without doing full saves of the FPU on SSE routines, which leads to FPU saves
> in MMX routines with kernel data in the SSE registers, which then shows up
> when the app touches those SSE registers and you get user space corruption. My
> code to handle this type of situation was *very* complex, and I don't think I
> ever got it quite perfectly right without simply imposing a rule that the
> kernel could never use both SSE and MMX instructions on the same CPU.
>
I don't see that problem:
* sse_{copy,clear}_page() restore the sse registers before returning.
* the fpu saves into current->thread.i387.f{,x}save never happen from
interrupts.
How can kernel sse values end up in user space? I'm sure I overlook
something, but what?
--
Manfred
Manfred Spraul wrote:
>
> Doug Ledford wrote:
> >
> > > I have this strong suspicion that your kernel will lock up in a bad way
> > > if you have somebody do something like divide by zero without actually
> > > touching a single FP instruction after the divide (so that the error has
> > > happened, but has not yet been raised as an exception).
> >
> > Or much worse, let the kernel mix-and-match SSE and MMX optimized routines
> > without doing full saves of the FPU on SSE routines, which leads to FPU saves
> > in MMX routines with kernel data in the SSE registers, which then shows up
> > when the app touches those SSE registers and you get user space corruption. My
> > code to handle this type of situation was *very* complex, and I don't think I
> > ever got it quite perfectly right without simply imposing a rule that the
> > kernel could never use both SSE and MMX instructions on the same CPU.
> >
>
> I don't see that problem:
> * sse_{copy,clear}_page() restore the sse registers before returning.
> * the fpu saves into current->thread.i387.f{,x}save never happen from
> interrupts.
>
> How can kernel sse values end up in user space? I'm sure I overlook
> something, but what?
It's not whether or not your particular code does it. It's whether or not it
can happen in the framework within which you are using the FPU regs. No, with
just copy/clear page using these things it won't happen. But if you add an
SSE zero page function, who's to say that we shouldn't add a memset routine,
or a copy_*_user routines, or copy_csum* routines that also use the SSE regs?
And once you add those various routines, are they all going to be safe with
respect to each other (the tricky ones here are if you add the copy_*_user
stuff since they can pagefault in the middle of the operation)? Plus, if you
have both SSE and MMX and maybe even 3DNow operations, are you going to pick
the fastest of each on each processor that supports each type, meaning you may
use one SSE and one MMX routine on the same processor if the MMX routine just
happens to beat out the SSE routine for a particular task? Once you take all
these things into consideration, the question becomes what limit are you going
to place on kernel SSE register usage? Because if you don't severely limit its
usage, then you have to handle all the odd scenarios above, and that's where
it gets very difficult to get it right. So, that's the policy decision that
needs to be made (and Linus typically has made it very difficult to get this
stuff accepted into the kernel, which is an implicit statement of that policy)
before a person can decide if your patch is sufficient, or if it needs
additional protection from other possible SSE/MMX using routines.
--
Doug Ledford <[email protected]> http://people.redhat.com/dledford
Please check my web site for aic7xxx updates/answers before
e-mailing me about problems
Doug Ledford wrote:
>
> It's not whether or not your particular code does it. It's whether or not it
> can happen in the framework within which you are using the FPU regs. No, with
> just copy/clear page using these things it won't happen. But if you add an
> SSE zero page function, who's to say that we shouldn't add a memset routine,
> or a copy_*_user routines, or copy_csum* routines that also use the SSE regs?
> And once you add those various routines, are they all going to be safe with
> respect to each other (the tricky ones here are if you add the copy_*_user
> stuff since they can pagefault in the middle of the operation)?
copy_*_user is probably not worth the effort for a Pentium III, but even
for that function I don't see a problem with SSE, as long as
* the clobbered registers are stored on the stack (and not in
thread.i387.fxsave)
* the SSE/SSE2 instructions can't cause SIMD exceptions.
* no one saves the fpu state into thread.i387.fxsave from interrupts /
softirqs. Currently it's impossible, but I haven't checked Montavista's
preemptive kernel scheduler.
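For illustration only, a minimal sketch of the save-to-stack discipline assumed
above (hypothetical helper names, not part of the posted patch; it mirrors the
cr0/movups pattern used in sse_copy_page):

/* Storage for the clobbered registers lives on the caller's stack. */
struct sse_scratch {
	unsigned long cr0;
	unsigned char xmm[32];		/* room for %xmm0 and %xmm1 */
};

static inline void kernel_sse_begin(struct sse_scratch *s)
{
	__asm__ __volatile__(
		"mov %%cr0, %0\n\t"		/* remember TS */
		"clts\n\t"			/* allow SSE without a #NM fault */
		"movups %%xmm0, (%1)\n\t"	/* save only the clobbered regs */
		"movups %%xmm1, 16(%1)\n\t"
		: "=&r" (s->cr0)
		: "r" (s->xmm)
		: "memory");
}

static inline void kernel_sse_end(struct sse_scratch *s)
{
	__asm__ __volatile__(
		"movups (%1), %%xmm0\n\t"	/* restore the clobbered regs */
		"movups 16(%1), %%xmm1\n\t"
		"sfence\n\t"
		"mov %0, %%cr0\n\t"		/* restore TS */
		: : "r" (s->cr0), "r" (s->xmm)
		: "memory");
}

Anything between the two calls must not schedule() or otherwise cause the FPU
state to be saved, which is exactly why copy_*_user is harder than
clear_page/copy_page.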
> So, that's the policy decision that
> needs to be made (and Linus typically has made it very difficult to get this
> stuff accepted into the kernel, which is an implicit statement of that policy)
> before a person can decide if your patch is sufficient, or if it needs
> additional protection from other possible SSE/MMX using routines.
>
The policy decision was already made: someone added SSE support for
raid5 xor - and that's part of 2.4.1, whereas I proposed a beta patch.
Now back to raid5: in which context are the xor functions called?
If they are called from irq or softirq context then the MMX
implementation would contain a bug:
>>>
#define FPU_SAVE							\
  do {									\
	if (!(current->flags & PF_USEDFPU))				\
		__asm__ __volatile__ (" clts;\n");			\
	__asm__ __volatile__ ("fsave %0; fwait": "=m"(fpu_save[0]));	\
  } while (0)
<<<<<<<
PF_USEDFPU does not atomically follow the TS bit in %%cr0.
The SSE code is not affected: it relies on %%cr0 and doesn't use
current->flags.
OTOH if they are called from process context then these functions might
cause bugs with Montavista's preemptive kernel scheduling: what if the
scheduler is called in the middle of a raid checksum?
--
Manfred
Manfred Spraul wrote:
>
> copy_*_user is probably not worth the effort for a Pentium III, but even
> for that function I don't see a problem with SSE, as long as
>
> * the clobbered registers are stored on the stack (and not in
> thread.i387.fxsave)
> * the SSE/SSE2 instructions can't cause SIMD exceptions.
> * no one saves the fpu state into thread.i387.fxsave from interrupts /
> softirqs. Currently it's impossible, but I haven't checked Montavista's
> preemptive kernel scheduler.
>
I overlooked one restriction:
* you must not schedule() with the "wrong" sse registers: switch_to()
saves into i387.fxsave.
This means that copy_*_user isn't that simple.
--
Manfred
--- 2.4/mm/filemap.c Wed Feb 14 10:51:42 2001
+++ build-2.4/mm/filemap.c Wed Feb 14 22:11:44 2001
@@ -1248,6 +1248,20 @@
size = count;
kaddr = kmap(page);
+ if (size > 128) {
+ int i;
+ __asm__ __volatile__(
+ "mov %1, %0\n\t"
+ : "=r" (i)
+ : "r" (kaddr+offset)); /* load tlb entry */
+ for(i=0;i<size;i+=64) {
+ __asm__ __volatile__(
+ "prefetchnta (%1, %0)\n\t"
+ "prefetchnta 32(%1, %0)\n\t"
+ : /* no output */
+ : "r" (i), "r" (kaddr+offset));
+ }
+ }
left = __copy_to_user(desc->buf, kaddr + offset, size);
kunmap(page);
Manfred Spraul wrote:
>
> Intel Pentium III and P 4 have hardcoded "fast stringcopy" operations
> that invalidate whole cachelines during write (documented in the most
> obvious place: multiprocessor management, memory ordering)
Which are dramatically slower than a simple `mov' loop for just
about all alignments, except for source and dest both eight-byte
aligned.
For example, copying an uncached source to an uncached dest,
with the source misaligned, my PIII Coppermine does 108 MBytes/sec
with `rep;movsl' and 149 MBytes/sec with an open-coded variant
of our copy_csum routines. That's a lot. Similar results
on a PII and a PIII Katmai.
On the K6-2, however, the string operation is almost always
a win.
It seems that a good approximation for our bulk-copy strategy is:
if (AMD) {
string_copy();
} else if (intel) {
if ((source|dest) & 7)
duff_copy();
else
string_copy();
} else {
quack();
}
This will make our Intel copies 20-40% faster than
at present, depending upon the distribution of
alignments. (And for networking, the distribution
is pretty much uniform).
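For reference, a rough sketch of what the duff_copy() above might look like
(hypothetical, not the kernel's actual routine): an open-coded, unrolled dword
loop instead of rep;movsl, written here as plain user-space C so it could be
dropped into a benchmark harness.

#include <stddef.h>
#include <string.h>

/* Unrolled dword copy; on PII/PIII this tends to beat "rep; movsl"
 * when source and dest are not both 8-byte aligned. */
static void duff_copy(void *to, const void *from, size_t n)
{
	unsigned long *d = to;
	const unsigned long *s = from;
	size_t words = n / sizeof(long);

	while (words >= 4) {			/* unrolled by four */
		d[0] = s[0];
		d[1] = s[1];
		d[2] = s[2];
		d[3] = s[3];
		d += 4;
		s += 4;
		words -= 4;
	}
	while (words--)
		*d++ = *s++;
	memcpy(d, s, n & (sizeof(long) - 1));	/* trailing bytes */
}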
Somewhere on my to-do list is getting lots of people to
test lots of architectures with lots of combinations of
[source/dest][cached/uncached] at lots of alignments
to confirm if this will work.
If you have time, could you please grab
http://www.uow.edu.au/~andrewm/linux/cptimer.tar.gz
and teach it how to do SSE copies, in preparation for this
great event?
Thanks.
Hi!
> --- 2.4/mm/filemap.c Wed Feb 14 10:51:42 2001
> +++ build-2.4/mm/filemap.c Wed Feb 14 22:11:44 2001
> @@ -1248,6 +1248,20 @@
> size = count;
>
> kaddr = kmap(page);
> + if (size > 128) {
> + int i;
> + __asm__ __volatile__(
> + "mov %1, %0\n\t"
> + : "=r" (i)
> + : "r" (kaddr+offset)); /* load tlb entry */
> + for(i=0;i<size;i+=64) {
> + __asm__ __volatile__(
> + "prefetchnta (%1, %0)\n\t"
> + "prefetchnta 32(%1, %0)\n\t"
> + : /* no output */
> + : "r" (i), "r" (kaddr+offset));
> + }
> + }
> left = __copy_to_user(desc->buf, kaddr + offset, size);
> kunmap(page);
This seems bogus -- you need to handle faults --
i.e. __prefetchnta_to_user() ;-).
Pavel
--
I'm [email protected]. "In my country we have almost anarchy and I don't care."
Panos Katsaloulis describing me w.r.t. patents at [email protected]
> > + __asm__ __volatile__(
> > + "mov %1, %0\n\t"
> > + : "=r" (i)
> > + : "r" (kaddr+offset)); /* load tlb entry */
> > + for(i=0;i<size;i+=64) {
> > + __asm__ __volatile__(
> > + "prefetchnta (%1, %0)\n\t"
> > + "prefetchnta 32(%1, %0)\n\t"
> > + : /* no output */
> > + : "r" (i), "r" (kaddr+offset));
> > + }
> > + }
> > left = __copy_to_user(desc->buf, kaddr + offset, size);
> > kunmap(page);
>
> This seems bogus -- you need to handle faults --
> i.e. __prefetchnta_to_user() ;-).
It wants wrapping nicely. A generic prefetch and prefetchw does help some other
cases (scheduler for one).
Does the prefetch instruction fault on PIII/PIV then - the K7 one appears not
to be a source of faults
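Presumably the "wrapping nicely" would look something like this (a sketch with
assumed names, selected by the existing config options and compiling away where
the instruction is absent):

/*
 * Hypothetical generic wrappers: prefetchnta is SSE (PIII and later),
 * prefetchw is 3DNow!/K7.  On other configurations they are no-ops.
 */
static inline void prefetch(const void *x)
{
#ifdef CONFIG_X86_USE_SSE
	__asm__ __volatile__("prefetchnta (%0)" : : "r" (x));
#endif
}

static inline void prefetchw(const void *x)
{
#ifdef CONFIG_X86_USE_3DNOW
	__asm__ __volatile__("prefetchw (%0)" : : "r" (x));
#endif
}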
> > > + __asm__ __volatile__(
> > > + "mov %1, %0\n\t"
> > > + : "=r" (i)
> > > + : "r" (kaddr+offset)); /* load tlb entry */
> > > + for(i=0;i<size;i+=64) {
> > > + __asm__ __volatile__(
> > > + "prefetchnta (%1, %0)\n\t"
> > > + "prefetchnta 32(%1, %0)\n\t"
> > > + : /* no output */
> > > + : "r" (i), "r" (kaddr+offset));
> > > + }
> > > + }
> > > left = __copy_to_user(desc->buf, kaddr + offset, size);
> > > kunmap(page);
> >
> > This seems bogus -- you need to handle faults --
> > i.e. __prefetchnta_to_user() ;-).
>
> It wants wrapping nicely. A generic prefetch and prefetchw does help some other
> cases (scheduler for one).
>
> Does the prefetch instruction fault on PIII/PIV then - the K7 one appears not
> to be a source of faults
My fault. I was told that prefetch instructions are always
non-faulting.
Pavel
--
The best software in life is free (not shareware)! Pavel
GCM d? s-: !g p?:+ au- a--@ w+ v- C++@ UL+++ L++ N++ E++ W--- M- Y- R+
> > Does the prefetch instruction fault on PIII/PIV then - the K7 one appears not
> > to be a source of faults
>
> My fault. I was told that prefetch instructions are always
> non-faulting.
I also thought it was non-faulting.
Pavel Machek wrote:
>
> > > > + __asm__ __volatile__(
> > > > + "mov %1, %0\n\t"
> > > > + : "=r" (i)
> > > > + : "r" (kaddr+offset)); /* load tlb entry */
> > > > + for(i=0;i<size;i+=64) {
> > > > + __asm__ __volatile__(
> > > > + "prefetchnta (%1, %0)\n\t"
> > > > + "prefetchnta 32(%1, %0)\n\t"
> > > > + : /* no output */
> > > > + : "r" (i), "r" (kaddr+offset));
> > > > + }
> > > > + }
> > > > left = __copy_to_user(desc->buf, kaddr + offset, size);
> > > > kunmap(page);
> > >
> > > This seems bogus -- you need to handle faults --
> > > i.e. __prefetchnta_to_user() ;-).
> >
Ahm. That's file_read_actor, not file_write_actor ;-)
I'm prefetching the kernel space buffer.
> > It wants wrapping nicely. A generic prefetch and prefetchw does help some other
> > cases (scheduler for one).
> >
> > Does the prefetch instruction fault on PIII/PIV then - the K7 one appears not
> > to be a source of faults
>
> My fault. I was told that prefetch instructions are always
> non-faulting.
>
But there is another problem:
The tlb preloading with a simple 'mov' is not enough:
the Pentium III cpu decodes the 'mov', begins to load the tlb entry -
this will take at least several dozen cpu ticks.
But the cpu continues to decode further instructions. It sees the
'prefetchnta', notices that the tlb entry is not loaded and ignores the
next prefetchnta's (a prefetch without a tlb entry is turned into a NOP).
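For what it's worth, a hypothetical C-level sketch of the intent (one real load
per 4K page to populate the TLB, then the non-temporal prefetches); as described
above the loads may still be in flight when the first prefetchnta instructions
are decoded, so whether this actually helps would need measuring:

/*
 * Sketch only, not from the posted patch: touch each 4K page with a
 * real load so the TLB entry is (eventually) present, then issue the
 * prefetches.  A prefetchnta that misses the TLB is silently dropped,
 * so this is best-effort at most.
 */
static inline void prefetchnta_range(const char *kaddr, unsigned long size)
{
	unsigned long i;

	for (i = 0; i < size; i += 4096)
		(void)*(volatile const char *)(kaddr + i);	/* TLB fill */

	for (i = 0; i < size; i += 64)
		__asm__ __volatile__("prefetchnta (%0)"
				     : : "r" (kaddr + i));
}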
--
Manfred