Chris Lesiak reported that changes to i386's __memcpy() broke his device
because it can't handle byte moves and the new code uses them for
all trailing bytes when the length is not divisible by four. The old
code tried to use a 16-bit move and/or a byte move as needed.
H. Peter Anvin:
"There are only a few semantics that make sense: fixed 8, 16, 32, or 64
bits, plus "optimal"; the latter to be used for anything that doesn't
require a specific transfer size. Logically, an unqualified
"memcpy_to/fromio" should be the optimal size (as few transfers as
possible)"
So add back the old code as __minimal_memcpy and have IO transfers
use that.
Signed-off-by: Chuck Ebbert <[email protected]>
---
include/asm-i386/io.h | 4 ++--
include/asm-i386/string.h | 21 +++++++++++++++++++++
2 files changed, 23 insertions(+), 2 deletions(-)
--- 2.6.17-rc5-32.orig/include/asm-i386/io.h
+++ 2.6.17-rc5-32/include/asm-i386/io.h
@@ -202,11 +202,11 @@ static inline void memset_io(volatile vo
}
static inline void memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
{
- __memcpy(dst, (void __force *) src, count);
+ __minimal_memcpy(dst, (void __force *) src, count);
}
static inline void memcpy_toio(volatile void __iomem *dst, const void *src, int count)
{
- __memcpy((void __force *) dst, src, count);
+ __minimal_memcpy((void __force *) dst, src, count);
}
/*
--- 2.6.17-rc5-32.orig/include/asm-i386/string.h
+++ 2.6.17-rc5-32/include/asm-i386/string.h
@@ -220,6 +220,28 @@ return (to);
}
/*
+ * Do memcpy with as few moves as possible (for transfers to/from IO space.)
+ */
+static inline void * __minimal_memcpy(void * to, const void * from, size_t n)
+{
+int d0, d1, d2;
+__asm__ __volatile__(
+ "rep ; movsl\n\t"
+ "testb $2,%b4\n\t"
+ "jz 1f\n\t"
+ "movsw\n"
+ "1:\n\t"
+ "testb $1,%b4\n\t"
+ "jz 2f\n\t"
+ "movsb\n"
+ "2:"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ :"0" (n/4), "q" (n), "1" ((long) to), "2" ((long) from)
+ : "memory");
+return to;
+}
+
+/*
* This looks ugly, but the compiler can optimize it totally,
* as the count is constant.
*/
--
Chuck
One concern: this is not the standard return value for memcpy(). It either needs a
comment to that effect (stating it returns a pointer to the end of the area), or just make
it return void.
Also, the formatting looks nonstandard.
>
> /*
> + * Do memcpy with as few moves as possible (for transfers to/from IO space.)
> + */
> +static inline void * __minimal_memcpy(void * to, const void * from, size_t n)
> +{
> +int d0, d1, d2;
> +__asm__ __volatile__(
> + "rep ; movsl\n\t"
> + "testb $2,%b4\n\t"
> + "jz 1f\n\t"
> + "movsw\n"
> + "1:\n\t"
> + "testb $1,%b4\n\t"
> + "jz 2f\n\t"
> + "movsb\n"
> + "2:"
> + : "=&c" (d0), "=&D" (d1), "=&S" (d2)
> + :"0" (n/4), "q" (n), "1" ((long) to), "2" ((long) from)
> + : "memory");
> +return to;
> +}
-hpa
/*
* arch/i386/lib/memcpy_io.S
*
* The most general form of memory copy to/from I/O space, used for
* devices which can handle arbitrary transactions with appropriate
* handling of byte enables. The goal is to produce the minimum
* number of naturally aligned transactions on the bus.
*/
#include <linux/config.h>
.globl memcpy_toio
	.type memcpy_toio, @function
/*
 * memcpy_toio(void __iomem *dst, const void *src, int count)
 *
 * Copy to I/O space using the minimum number of naturally aligned
 * bus transactions: align the destination with at most one movsb and
 * one movsw, bulk-copy with rep movsl, then finish the tail with at
 * most one movsw and one movsb.
 */
memcpy_toio:
	pushl	%edi
	pushl	%esi
#ifdef CONFIG_REGPARM
	/* regparm(3): dst in %eax, src in %edx, count already in %ecx */
	movl	%eax, %edi
	movl	%edx, %esi
#else
	/*
	 * Stack calling convention.  BUGFIX: the original loaded dst/src
	 * into %eax/%edx and never transferred them to %edi/%esi, so the
	 * string instructions used stale registers.  Load them directly
	 * into the registers movs* actually uses.
	 */
	movl	12(%esp), %edi
	movl	16(%esp), %esi
	movl	20(%esp), %ecx
#endif
	jecxz	1f			/* count == 0: nothing to do */
	testl	$1, %edi		/* align destination to 2 bytes */
	jz	2f
	movsb
	decl	%ecx
2:
	cmpl	$2, %ecx		/* fewer than 2 bytes left? */
	jb	3f
	testl	$2, %edi		/* align destination to 4 bytes */
	jz	4f
	movsw
	decl	%ecx
	decl	%ecx
4:
	movl	%ecx, %edx		/* save remaining count for the tail */
	shrl	$2, %ecx
	jz	5f
	rep ; movsl			/* bulk 32-bit transfers */
5:
	movl	%edx, %ecx
	testb	$2, %cl			/* 16-bit tail transfer */
	jz	3f
	movsw
3:
	testb	$1, %cl			/* final byte, if count was odd */
	jz	1f
	movsb
1:
	pop	%esi
	pop	%edi
	ret
	.size memcpy_toio, .-memcpy_toio
.globl memcpy_fromio
	.type memcpy_fromio, @function
/*
 * memcpy_fromio(void *dst, const void __iomem *src, int count)
 *
 * Copy from I/O space using the minimum number of naturally aligned
 * bus transactions: align the *source* (the I/O side) with at most one
 * movsb and one movsw, bulk-copy with rep movsl, then finish the tail
 * with at most one movsw and one movsb.
 *
 * BUGFIX: the original declared ".globl memcpy_toio" here (copy-paste
 * typo), leaving memcpy_fromio without external linkage.
 */
memcpy_fromio:
	pushl	%edi
	pushl	%esi
#ifdef CONFIG_REGPARM
	/* regparm(3): dst in %eax, src in %edx, count already in %ecx */
	movl	%eax, %edi
	movl	%edx, %esi
#else
	/*
	 * Stack calling convention.  BUGFIX: the original loaded dst/src
	 * into %eax/%edx and never transferred them to %edi/%esi, so the
	 * string instructions used stale registers.  Load them directly
	 * into the registers movs* actually uses.
	 */
	movl	12(%esp), %edi
	movl	16(%esp), %esi
	movl	20(%esp), %ecx
#endif
	jecxz	1f			/* count == 0: nothing to do */
	testl	$1, %esi		/* align source (I/O side) to 2 bytes */
	jz	2f
	movsb
	decl	%ecx
2:
	cmpl	$2, %ecx		/* fewer than 2 bytes left? */
	jb	3f
	testl	$2, %esi		/* align source to 4 bytes */
	jz	4f
	movsw
	decl	%ecx
	decl	%ecx
4:
	movl	%ecx, %edx		/* save remaining count for the tail */
	shrl	$2, %ecx
	jz	5f
	rep ; movsl			/* bulk 32-bit transfers */
5:
	movl	%edx, %ecx
	testb	$2, %cl			/* 16-bit tail transfer */
	jz	3f
	movsw
3:
	testb	$1, %cl			/* final byte, if count was odd */
	jz	1f
	movsb
1:
	pop	%esi
	pop	%edi
	ret
	.size memcpy_fromio, .-memcpy_fromio
On Tue, 30 May 2006, Chuck Ebbert wrote:
> Chris Lesiak reported that changes to i386's __memcpy() broke his device
> because it can't handle byte moves and the new code uses them for
> all trailing bytes when the length is not divisible by four. The old
> code tried to use a 16-bit move and/or a byte move as needed.
>
> H. Peter Anvin:
> "There are only a few semantics that make sense: fixed 8, 16, 32, or 64
> bits, plus "optimal"; the latter to be used for anything that doesn't
> require a specific transfer size. Logically, an unqualified
> "memcpy_to/fromio" should be the optimal size (as few transfers as
> possible)"
>
> So add back the old code as __minimal_memcpy and have IO transfers
> use that.
>
> Signed-off-by: Chuck Ebbert <[email protected]>
>
> ---
>
> include/asm-i386/io.h | 4 ++--
> include/asm-i386/string.h | 21 +++++++++++++++++++++
> 2 files changed, 23 insertions(+), 2 deletions(-)
>
> --- 2.6.17-rc5-32.orig/include/asm-i386/io.h
> +++ 2.6.17-rc5-32/include/asm-i386/io.h
> @@ -202,11 +202,11 @@ static inline void memset_io(volatile vo
> }
> static inline void memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
> {
> - __memcpy(dst, (void __force *) src, count);
> + __minimal_memcpy(dst, (void __force *) src, count);
> }
> static inline void memcpy_toio(volatile void __iomem *dst, const void *src, int count)
> {
> - __memcpy((void __force *) dst, src, count);
> + __minimal_memcpy((void __force *) dst, src, count);
> }
>
> /*
> --- 2.6.17-rc5-32.orig/include/asm-i386/string.h
> +++ 2.6.17-rc5-32/include/asm-i386/string.h
> @@ -220,6 +220,28 @@ return (to);
> }
>
> /*
> + * Do memcpy with as few moves as possible (for transfers to/from IO space.)
> + */
> +static inline void * __minimal_memcpy(void * to, const void * from, size_t n)
> +{
> +int d0, d1, d2;
> +__asm__ __volatile__(
> + "rep ; movsl\n\t"
> + "testb $2,%b4\n\t"
> + "jz 1f\n\t"
> + "movsw\n"
> + "1:\n\t"
> + "testb $1,%b4\n\t"
> + "jz 2f\n\t"
> + "movsb\n"
> + "2:"
> + : "=&c" (d0), "=&D" (d1), "=&S" (d2)
> + :"0" (n/4), "q" (n), "1" ((long) to), "2" ((long) from)
> + : "memory");
> +return to;
> +}
> +
> +/*
> * This looks ugly, but the compiler can optimize it totally,
> * as the count is constant.
> */
> + "rep ; movsl\n\t"
^^^^^^^^^^^^^^^^^^^^^^^^
Huh, you copied stuff into three variables, then you needed to make
memory accesses to those variables, and you claim "the compiler can
optimize it totally". Sure. I got a bridge I'd like to sell you.
Please, instead of guessing, why don't you measure the exact number of
CPU cycles with rdtsc?
> --
> Chuck
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
Cheers,
Dick Johnson
Penguin : Linux version 2.6.16.4 on an i686 machine (5592.73 BogoMips).
New book: http://www.AbominableFirebug.com/
_
****************************************************************
The information transmitted in this message is confidential and may be privileged. Any review, retransmission, dissemination, or other use of this information by persons or entities other than the intended recipient is prohibited. If you are not the intended recipient, please notify Analogic Corporation immediately - by replying to this message or by sending an email to [email protected] - and destroy all copies of this information, including any attachments, without reading or disclosing them.
Thank you.