After some more review comments from Roland, Andrew and Chris Hellwig,
here is a reworked set of 32-bit MMIO copy patches.
These use CONFIG_GENERIC_RAW_MEMCPY_IO to determine whether an arch should use
the generic __raw_memcpy_toio32 routine or its own specialised version.
We provide a specialised implementation for x86_64.
These patches should apply cleanly against current -git, and have been
tested on i386 and x86_64.
The patch series is as follows:
raw_memcpy_io.patch
Introduce the generic MMIO 32-bit copy routine.
x86_64-memcpy32.patch
Add memcpy32 routine to x86_64.
arch-specific-raw_memcpy_io.patch
Get each arch to use generic memcpy_io code, except x86_64, which
uses memcpy32.
Signed-off-by: Bryan O'Sullivan <[email protected]>
Introduce an x86_64-specific memcpy32 routine. The routine is similar
to memcpy, but is guaranteed to work in units of 32 bits at a time.
Signed-off-by: Bryan O'Sullivan <[email protected]>
diff -r 2d4af213d9c5 -r b4863171295f arch/x86_64/kernel/x8664_ksyms.c
--- a/arch/x86_64/kernel/x8664_ksyms.c Tue Jan 10 11:52:46 2006 -0800
+++ b/arch/x86_64/kernel/x8664_ksyms.c Tue Jan 10 11:52:48 2006 -0800
@@ -164,6 +164,8 @@
EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(__memcpy);
+EXPORT_SYMBOL_GPL(memcpy32);
+
#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
/* prototypes are wrong, these are assembly with custom calling functions */
extern void rwsem_down_read_failed_thunk(void);
diff -r 2d4af213d9c5 -r b4863171295f arch/x86_64/lib/Makefile
--- a/arch/x86_64/lib/Makefile Tue Jan 10 11:52:46 2006 -0800
+++ b/arch/x86_64/lib/Makefile Tue Jan 10 11:52:48 2006 -0800
@@ -9,4 +9,4 @@
lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
usercopy.o getuser.o putuser.o \
thunk.o clear_page.o copy_page.o bitstr.o bitops.o
-lib-y += memcpy.o memmove.o memset.o copy_user.o
+lib-y += memcpy.o memcpy32.o memmove.o memset.o copy_user.o
diff -r 2d4af213d9c5 -r b4863171295f include/asm-x86_64/string.h
--- a/include/asm-x86_64/string.h Tue Jan 10 11:52:46 2006 -0800
+++ b/include/asm-x86_64/string.h Tue Jan 10 11:52:48 2006 -0800
@@ -45,6 +45,9 @@
#define __HAVE_ARCH_MEMMOVE
void * memmove(void * dest,const void *src,size_t count);
+/* copy data, 32 bits at a time */
+void memcpy32(void *dst, const void *src, size_t count);
+
/* Use C out of line version for memcmp */
#define memcmp __builtin_memcmp
int memcmp(const void * cs,const void * ct,size_t count);
diff -r 2d4af213d9c5 -r b4863171295f arch/x86_64/lib/memcpy32.S
--- /dev/null Thu Jan 1 00:00:00 1970 +0000
+++ b/arch/x86_64/lib/memcpy32.S Tue Jan 10 11:52:48 2006 -0800
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2006 PathScale, Inc. All Rights Reserved.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * Registers used below:
+ * dst - rdi
+ * src - rsi
+ * count - rdx
+ */
+
+/**
+ * memcpy32 - copy data, in units of 32 bits at a time
+ * @dst: destination (must be 32-bit aligned)
+ * @src: source (must be 32-bit aligned)
+ * @count: number of 32-bit quantities to copy
+ */
+ .globl memcpy32
+memcpy32:
+ movl %edx,%ecx
+ shrl $1,%ecx
+ andl $1,%edx
+ rep movsq
+ movl %edx,%ecx
+ rep movsd
+ ret
This arch-independent routine copies data to a memory-mapped I/O region,
using 32-bit accesses. It does not guarantee access ordering, nor does
it perform a memory barrier afterwards. This style of access is required
by some devices.
Signed-off-by: Bryan O'Sullivan <[email protected]>
diff -r 48616306e7bd -r 2d4af213d9c5 lib/Makefile
--- a/lib/Makefile Tue Jan 10 10:41:42 2006 +0800
+++ b/lib/Makefile Tue Jan 10 11:52:46 2006 -0800
@@ -21,6 +21,7 @@
lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o
lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
+lib-$(CONFIG_GENERIC_RAW_MEMCPY_IO) += raw_memcpy_io.o
obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
diff -r 48616306e7bd -r 2d4af213d9c5 lib/raw_memcpy_io.c
--- /dev/null Thu Jan 1 00:00:00 1970 +0000
+++ b/lib/raw_memcpy_io.c Tue Jan 10 11:52:46 2006 -0800
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2006 PathScale, Inc. All Rights Reserved.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/types.h>
+#include <asm/io.h>
+
+/**
+ * __raw_memcpy_toio32 - copy data to MMIO space, in 32-bit units
+ * @to: destination, in MMIO space (must be 32-bit aligned)
+ * @from: source (must be 32-bit aligned)
+ * @count: number of 32-bit quantities to copy
+ *
+ * Copy data from kernel space to MMIO space, in units of 32 bits at a
+ * time. Order of access is not guaranteed, nor is a memory barrier
+ * performed afterwards.
+ */
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count)
+{
+ u32 __iomem *dst = to;
+ const u32 *src = from;
+ size_t i;
+
+ for (i = 0; i < count; i++)
+ __raw_writel(*src++, dst++);
+}
Most arches use the generic routine. x86_64 uses memcpy32 instead;
this is substantially faster, even over a bus that is much slower than
the CPU.
Signed-off-by: Bryan O'Sullivan <[email protected]>
diff -r b4863171295f -r 5673a186625f arch/alpha/Kconfig
--- a/arch/alpha/Kconfig Tue Jan 10 11:52:48 2006 -0800
+++ b/arch/alpha/Kconfig Tue Jan 10 11:52:51 2006 -0800
@@ -42,6 +42,10 @@
default y
config GENERIC_IRQ_PROBE
+ bool
+ default y
+
+config GENERIC_RAW_MEMCPY_IO
bool
default y
diff -r b4863171295f -r 5673a186625f arch/arm/Kconfig
--- a/arch/arm/Kconfig Tue Jan 10 11:52:48 2006 -0800
+++ b/arch/arm/Kconfig Tue Jan 10 11:52:51 2006 -0800
@@ -59,6 +59,10 @@
config GENERIC_BUST_SPINLOCK
bool
+
+config GENERIC_RAW_MEMCPY_IO
+ bool
+ default y
config ARCH_MAY_HAVE_PC_FDC
bool
diff -r b4863171295f -r 5673a186625f arch/arm26/Kconfig
--- a/arch/arm26/Kconfig Tue Jan 10 11:52:48 2006 -0800
+++ b/arch/arm26/Kconfig Tue Jan 10 11:52:51 2006 -0800
@@ -33,6 +33,10 @@
config FORCE_MAX_ZONEORDER
int
default 9
+
+config GENERIC_RAW_MEMCPY_IO
+ bool
+ default y
config RWSEM_GENERIC_SPINLOCK
bool
diff -r b4863171295f -r 5673a186625f arch/cris/Kconfig
--- a/arch/cris/Kconfig Tue Jan 10 11:52:48 2006 -0800
+++ b/arch/cris/Kconfig Tue Jan 10 11:52:51 2006 -0800
@@ -17,6 +17,10 @@
bool
config GENERIC_CALIBRATE_DELAY
+ bool
+ default y
+
+config GENERIC_RAW_MEMCPY_IO
bool
default y
diff -r b4863171295f -r 5673a186625f arch/frv/Kconfig
--- a/arch/frv/Kconfig Tue Jan 10 11:52:48 2006 -0800
+++ b/arch/frv/Kconfig Tue Jan 10 11:52:51 2006 -0800
@@ -24,6 +24,10 @@
config GENERIC_CALIBRATE_DELAY
bool
default n
+
+config GENERIC_RAW_MEMCPY_IO
+ bool
+ default y
config GENERIC_HARDIRQS
bool
diff -r b4863171295f -r 5673a186625f arch/h8300/Kconfig
--- a/arch/h8300/Kconfig Tue Jan 10 11:52:48 2006 -0800
+++ b/arch/h8300/Kconfig Tue Jan 10 11:52:51 2006 -0800
@@ -30,6 +30,10 @@
default n
config GENERIC_CALIBRATE_DELAY
+ bool
+ default y
+
+config GENERIC_RAW_MEMCPY_IO
bool
default y
diff -r b4863171295f -r 5673a186625f arch/i386/Kconfig
--- a/arch/i386/Kconfig Tue Jan 10 11:52:48 2006 -0800
+++ b/arch/i386/Kconfig Tue Jan 10 11:52:51 2006 -0800
@@ -34,6 +34,10 @@
default y
config GENERIC_IOMAP
+ bool
+ default y
+
+config GENERIC_RAW_MEMCPY_IO
bool
default y
diff -r b4863171295f -r 5673a186625f arch/ia64/Kconfig
--- a/arch/ia64/Kconfig Tue Jan 10 11:52:48 2006 -0800
+++ b/arch/ia64/Kconfig Tue Jan 10 11:52:51 2006 -0800
@@ -47,6 +47,10 @@
default y
config GENERIC_IOMAP
+ bool
+ default y
+
+config GENERIC_RAW_MEMCPY_IO
bool
default y
diff -r b4863171295f -r 5673a186625f arch/m32r/Kconfig
--- a/arch/m32r/Kconfig Tue Jan 10 11:52:48 2006 -0800
+++ b/arch/m32r/Kconfig Tue Jan 10 11:52:51 2006 -0800
@@ -25,6 +25,10 @@
default y
config GENERIC_IRQ_PROBE
+ bool
+ default y
+
+config GENERIC_RAW_MEMCPY_IO
bool
default y
diff -r b4863171295f -r 5673a186625f arch/m68k/Kconfig
--- a/arch/m68k/Kconfig Tue Jan 10 11:52:48 2006 -0800
+++ b/arch/m68k/Kconfig Tue Jan 10 11:52:51 2006 -0800
@@ -18,6 +18,10 @@
bool
config GENERIC_CALIBRATE_DELAY
+ bool
+ default y
+
+config GENERIC_RAW_MEMCPY_IO
bool
default y
diff -r b4863171295f -r 5673a186625f arch/m68knommu/Kconfig
--- a/arch/m68knommu/Kconfig Tue Jan 10 11:52:48 2006 -0800
+++ b/arch/m68knommu/Kconfig Tue Jan 10 11:52:51 2006 -0800
@@ -26,6 +26,10 @@
default n
config GENERIC_CALIBRATE_DELAY
+ bool
+ default y
+
+config GENERIC_RAW_MEMCPY_IO
bool
default y
diff -r b4863171295f -r 5673a186625f arch/mips/Kconfig
--- a/arch/mips/Kconfig Tue Jan 10 11:52:48 2006 -0800
+++ b/arch/mips/Kconfig Tue Jan 10 11:52:51 2006 -0800
@@ -800,6 +800,10 @@
bool
config GENERIC_CALIBRATE_DELAY
+ bool
+ default y
+
+config GENERIC_RAW_MEMCPY_IO
bool
default y
diff -r b4863171295f -r 5673a186625f arch/parisc/Kconfig
--- a/arch/parisc/Kconfig Tue Jan 10 11:52:48 2006 -0800
+++ b/arch/parisc/Kconfig Tue Jan 10 11:52:51 2006 -0800
@@ -37,6 +37,10 @@
config GENERIC_IRQ_PROBE
def_bool y
+
+config GENERIC_RAW_MEMCPY_IO
+ bool
+ default y
# unless you want to implement ACPI on PA-RISC ... ;-)
config PM
diff -r b4863171295f -r 5673a186625f arch/powerpc/Kconfig
--- a/arch/powerpc/Kconfig Tue Jan 10 11:52:48 2006 -0800
+++ b/arch/powerpc/Kconfig Tue Jan 10 11:52:51 2006 -0800
@@ -38,6 +38,10 @@
default y
config GENERIC_CALIBRATE_DELAY
+ bool
+ default y
+
+config GENERIC_RAW_MEMCPY_IO
bool
default y
diff -r b4863171295f -r 5673a186625f arch/ppc/Kconfig
--- a/arch/ppc/Kconfig Tue Jan 10 11:52:48 2006 -0800
+++ b/arch/ppc/Kconfig Tue Jan 10 11:52:51 2006 -0800
@@ -16,6 +16,10 @@
bool
config RWSEM_XCHGADD_ALGORITHM
+ bool
+ default y
+
+config GENERIC_RAW_MEMCPY_IO
bool
default y
diff -r b4863171295f -r 5673a186625f arch/s390/Kconfig
--- a/arch/s390/Kconfig Tue Jan 10 11:52:48 2006 -0800
+++ b/arch/s390/Kconfig Tue Jan 10 11:52:51 2006 -0800
@@ -20,6 +20,10 @@
config GENERIC_BUST_SPINLOCK
bool
+
+config GENERIC_RAW_MEMCPY_IO
+ bool
+ default y
mainmenu "Linux Kernel Configuration"
diff -r b4863171295f -r 5673a186625f arch/sh/Kconfig
--- a/arch/sh/Kconfig Tue Jan 10 11:52:48 2006 -0800
+++ b/arch/sh/Kconfig Tue Jan 10 11:52:51 2006 -0800
@@ -30,6 +30,10 @@
default y
config GENERIC_CALIBRATE_DELAY
+ bool
+ default y
+
+config GENERIC_RAW_MEMCPY_IO
bool
default y
diff -r b4863171295f -r 5673a186625f arch/sh64/Kconfig
--- a/arch/sh64/Kconfig Tue Jan 10 11:52:48 2006 -0800
+++ b/arch/sh64/Kconfig Tue Jan 10 11:52:51 2006 -0800
@@ -34,6 +34,10 @@
config GENERIC_ISA_DMA
bool
+
+config GENERIC_RAW_MEMCPY_IO
+ bool
+ default y
source init/Kconfig
diff -r b4863171295f -r 5673a186625f arch/sparc/Kconfig
--- a/arch/sparc/Kconfig Tue Jan 10 11:52:48 2006 -0800
+++ b/arch/sparc/Kconfig Tue Jan 10 11:52:51 2006 -0800
@@ -152,6 +152,10 @@
bool
config GENERIC_CALIBRATE_DELAY
+ bool
+ default y
+
+config GENERIC_RAW_MEMCPY_IO
bool
default y
diff -r b4863171295f -r 5673a186625f arch/sparc64/Kconfig
--- a/arch/sparc64/Kconfig Tue Jan 10 11:52:48 2006 -0800
+++ b/arch/sparc64/Kconfig Tue Jan 10 11:52:51 2006 -0800
@@ -166,6 +166,10 @@
bool
default y
+config GENERIC_RAW_MEMCPY_IO
+ bool
+ default y
+
choice
prompt "SPARC64 Huge TLB Page Size"
depends on HUGETLB_PAGE
diff -r b4863171295f -r 5673a186625f arch/v850/Kconfig
--- a/arch/v850/Kconfig Tue Jan 10 11:52:48 2006 -0800
+++ b/arch/v850/Kconfig Tue Jan 10 11:52:51 2006 -0800
@@ -17,6 +17,9 @@
bool
default n
config GENERIC_CALIBRATE_DELAY
+ bool
+ default y
+config GENERIC_RAW_MEMCPY_IO
bool
default y
diff -r b4863171295f -r 5673a186625f arch/xtensa/Kconfig
--- a/arch/xtensa/Kconfig Tue Jan 10 11:52:48 2006 -0800
+++ b/arch/xtensa/Kconfig Tue Jan 10 11:52:51 2006 -0800
@@ -27,6 +27,10 @@
default y
config GENERIC_HARDIRQS
+ bool
+ default y
+
+config GENERIC_RAW_MEMCPY_IO
bool
default y
diff -r b4863171295f -r 5673a186625f include/asm-alpha/io.h
--- a/include/asm-alpha/io.h Tue Jan 10 11:52:48 2006 -0800
+++ b/include/asm-alpha/io.h Tue Jan 10 11:52:51 2006 -0800
@@ -504,6 +504,8 @@
extern void memcpy_toio(volatile void __iomem *, const void *, long);
extern void _memset_c_io(volatile void __iomem *, unsigned long, long);
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
+
static inline void memset_io(volatile void __iomem *addr, u8 c, long len)
{
_memset_c_io(addr, 0x0101010101010101UL * c, len);
diff -r b4863171295f -r 5673a186625f include/asm-arm/io.h
--- a/include/asm-arm/io.h Tue Jan 10 11:52:48 2006 -0800
+++ b/include/asm-arm/io.h Tue Jan 10 11:52:51 2006 -0800
@@ -189,6 +189,8 @@
#define memset_io(c,v,l) _memset_io(__mem_pci(c),(v),(l))
#define memcpy_fromio(a,c,l) _memcpy_fromio((a),__mem_pci(c),(l))
#define memcpy_toio(c,a,l) _memcpy_toio(__mem_pci(c),(a),(l))
+
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
#define eth_io_copy_and_sum(s,c,l,b) \
eth_copy_and_sum((s),__mem_pci(c),(l),(b))
diff -r b4863171295f -r 5673a186625f include/asm-cris/io.h
--- a/include/asm-cris/io.h Tue Jan 10 11:52:48 2006 -0800
+++ b/include/asm-cris/io.h Tue Jan 10 11:52:51 2006 -0800
@@ -121,6 +121,8 @@
#define memcpy_fromio(a,b,c) memcpy((a),(void *)(b),(c))
#define memcpy_toio(a,b,c) memcpy((void *)(a),(b),(c))
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
+
/*
* Again, CRIS does not require mem IO specific function.
*/
diff -r b4863171295f -r 5673a186625f include/asm-frv/io.h
--- a/include/asm-frv/io.h Tue Jan 10 11:52:48 2006 -0800
+++ b/include/asm-frv/io.h Tue Jan 10 11:52:51 2006 -0800
@@ -127,6 +127,8 @@
memcpy((void __force *) dst, src, count);
}
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
+
static inline uint8_t inb(unsigned long addr)
{
return __builtin_read8((void *)addr);
diff -r b4863171295f -r 5673a186625f include/asm-h8300/io.h
--- a/include/asm-h8300/io.h Tue Jan 10 11:52:48 2006 -0800
+++ b/include/asm-h8300/io.h Tue Jan 10 11:52:51 2006 -0800
@@ -209,6 +209,8 @@
#define memcpy_fromio(a,b,c) memcpy((a),(void *)(b),(c))
#define memcpy_toio(a,b,c) memcpy((void *)(a),(b),(c))
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
+
#define mmiowb()
#define inb(addr) ((h8300_buswidth(addr))?readw((addr) & ~1) & 0xff:readb(addr))
diff -r b4863171295f -r 5673a186625f include/asm-i386/io.h
--- a/include/asm-i386/io.h Tue Jan 10 11:52:48 2006 -0800
+++ b/include/asm-i386/io.h Tue Jan 10 11:52:51 2006 -0800
@@ -203,6 +203,8 @@
{
__memcpy((void __force *) dst, src, count);
}
+
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
/*
* ISA space is 'always mapped' on a typical x86 system, no need to
diff -r b4863171295f -r 5673a186625f include/asm-ia64/io.h
--- a/include/asm-ia64/io.h Tue Jan 10 11:52:48 2006 -0800
+++ b/include/asm-ia64/io.h Tue Jan 10 11:52:51 2006 -0800
@@ -444,6 +444,8 @@
extern void memcpy_toio(volatile void __iomem *dst, const void *src, long n);
extern void memset_io(volatile void __iomem *s, int c, long n);
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
+
#define dma_cache_inv(_start,_size) do { } while (0)
#define dma_cache_wback(_start,_size) do { } while (0)
#define dma_cache_wback_inv(_start,_size) do { } while (0)
diff -r b4863171295f -r 5673a186625f include/asm-m32r/io.h
--- a/include/asm-m32r/io.h Tue Jan 10 11:52:48 2006 -0800
+++ b/include/asm-m32r/io.h Tue Jan 10 11:52:51 2006 -0800
@@ -216,6 +216,8 @@
memcpy((void __force *) dst, src, count);
}
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
+
/*
* Convert a physical pointer to a virtual kernel pointer for /dev/mem
* access
diff -r b4863171295f -r 5673a186625f include/asm-m68knommu/io.h
--- a/include/asm-m68knommu/io.h Tue Jan 10 11:52:48 2006 -0800
+++ b/include/asm-m68knommu/io.h Tue Jan 10 11:52:51 2006 -0800
@@ -113,6 +113,8 @@
#define memcpy_fromio(a,b,c) memcpy((a),(void *)(b),(c))
#define memcpy_toio(a,b,c) memcpy((void *)(a),(b),(c))
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
+
#define inb(addr) readb(addr)
#define inw(addr) readw(addr)
#define inl(addr) readl(addr)
diff -r b4863171295f -r 5673a186625f include/asm-mips/io.h
--- a/include/asm-mips/io.h Tue Jan 10 11:52:48 2006 -0800
+++ b/include/asm-mips/io.h Tue Jan 10 11:52:51 2006 -0800
@@ -534,6 +534,8 @@
memcpy((void __force *) dst, src, count);
}
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
+
/*
* Memory Mapped I/O
*/
diff -r b4863171295f -r 5673a186625f include/asm-parisc/io.h
--- a/include/asm-parisc/io.h Tue Jan 10 11:52:48 2006 -0800
+++ b/include/asm-parisc/io.h Tue Jan 10 11:52:51 2006 -0800
@@ -294,6 +294,8 @@
void memcpy_fromio(void *dst, const volatile void __iomem *src, int count);
void memcpy_toio(volatile void __iomem *dst, const void *src, int count);
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
+
/* Support old drivers which don't ioremap.
* NB this interface is scheduled to disappear in 2.5
*/
diff -r b4863171295f -r 5673a186625f include/asm-powerpc/io.h
--- a/include/asm-powerpc/io.h Tue Jan 10 11:52:48 2006 -0800
+++ b/include/asm-powerpc/io.h Tue Jan 10 11:52:51 2006 -0800
@@ -64,6 +64,8 @@
#define memcpy_fromio(a,b,c) iSeries_memcpy_fromio((a), (b), (c))
#define memcpy_toio(a,b,c) iSeries_memcpy_toio((a), (b), (c))
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
+
#define inb(addr) readb(((void __iomem *)(long)(addr)))
#define inw(addr) readw(((void __iomem *)(long)(addr)))
#define inl(addr) readl(((void __iomem *)(long)(addr)))
diff -r b4863171295f -r 5673a186625f include/asm-ppc/io.h
--- a/include/asm-ppc/io.h Tue Jan 10 11:52:48 2006 -0800
+++ b/include/asm-ppc/io.h Tue Jan 10 11:52:51 2006 -0800
@@ -369,6 +369,8 @@
}
#endif
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
+
#define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void __force *)(void __iomem *)(b),(c),(d))
/*
diff -r b4863171295f -r 5673a186625f include/asm-s390/io.h
--- a/include/asm-s390/io.h Tue Jan 10 11:52:48 2006 -0800
+++ b/include/asm-s390/io.h Tue Jan 10 11:52:51 2006 -0800
@@ -99,6 +99,8 @@
#define memcpy_fromio(a,b,c) memcpy((a),__io_virt(b),(c))
#define memcpy_toio(a,b,c) memcpy(__io_virt(a),(b),(c))
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
+
#define inb_p(addr) readb(addr)
#define inb(addr) readb(addr)
diff -r b4863171295f -r 5673a186625f include/asm-sh/io.h
--- a/include/asm-sh/io.h Tue Jan 10 11:52:48 2006 -0800
+++ b/include/asm-sh/io.h Tue Jan 10 11:52:51 2006 -0800
@@ -177,6 +177,8 @@
extern void memcpy_toio(unsigned long, const void *, unsigned long);
extern void memset_io(unsigned long, int, unsigned long);
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
+
/* SuperH on-chip I/O functions */
static __inline__ unsigned char ctrl_inb(unsigned long addr)
{
diff -r b4863171295f -r 5673a186625f include/asm-sh64/io.h
--- a/include/asm-sh64/io.h Tue Jan 10 11:52:48 2006 -0800
+++ b/include/asm-sh64/io.h Tue Jan 10 11:52:51 2006 -0800
@@ -125,6 +125,8 @@
void memcpy_toio(void __iomem *to, const void *from, long count);
void memcpy_fromio(void *to, void __iomem *from, long count);
+
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
#define mmiowb()
diff -r b4863171295f -r 5673a186625f include/asm-sparc/io.h
--- a/include/asm-sparc/io.h Tue Jan 10 11:52:48 2006 -0800
+++ b/include/asm-sparc/io.h Tue Jan 10 11:52:51 2006 -0800
@@ -239,6 +239,8 @@
#define memcpy_toio(d,s,sz) _memcpy_toio(d,s,sz)
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
+
#ifdef __KERNEL__
/*
diff -r b4863171295f -r 5673a186625f include/asm-sparc64/io.h
--- a/include/asm-sparc64/io.h Tue Jan 10 11:52:48 2006 -0800
+++ b/include/asm-sparc64/io.h Tue Jan 10 11:52:51 2006 -0800
@@ -440,6 +440,8 @@
#define memcpy_toio(d,s,sz) _memcpy_toio(d,s,sz)
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
+
static inline int check_signature(void __iomem *io_addr,
const unsigned char *signature,
int length)
diff -r b4863171295f -r 5673a186625f include/asm-v850/io.h
--- a/include/asm-v850/io.h Tue Jan 10 11:52:48 2006 -0800
+++ b/include/asm-v850/io.h Tue Jan 10 11:52:51 2006 -0800
@@ -130,6 +130,8 @@
#define memcpy_fromio(dst, src, len) memcpy (dst, (void *)src, len)
#define memcpy_toio(dst, src, len) memcpy ((void *)dst, src, len)
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
+
/*
* Convert a physical pointer to a virtual kernel pointer for /dev/mem
* access
diff -r b4863171295f -r 5673a186625f include/asm-x86_64/io.h
--- a/include/asm-x86_64/io.h Tue Jan 10 11:52:48 2006 -0800
+++ b/include/asm-x86_64/io.h Tue Jan 10 11:52:51 2006 -0800
@@ -252,6 +252,14 @@
__memcpy_toio((unsigned long)to,from,len);
}
+#include <asm/string.h>
+
+/* See lib/raw_memcpy_io.c for kernel doc. */
+static inline void __raw_memcpy_toio32(void __iomem *dst, const void *src, size_t count)
+{
+ memcpy32((void __force *) dst, src, count);
+}
+
void memset_io(volatile void __iomem *a, int b, size_t c);
/*
diff -r b4863171295f -r 5673a186625f include/asm-xtensa/io.h
--- a/include/asm-xtensa/io.h Tue Jan 10 11:52:48 2006 -0800
+++ b/include/asm-xtensa/io.h Tue Jan 10 11:52:51 2006 -0800
@@ -159,6 +159,8 @@
#define memcpy_fromio(a,b,c) memcpy((a),(void *)(b),(c))
#define memcpy_toio(a,b,c) memcpy((void *)(a),(b),(c))
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
+
/* At this point the Xtensa doesn't provide byte swap instructions */
#ifdef __XTENSA_EB__
On Tuesday 10 January 2006 20:53, Bryan O'Sullivan wrote:
> Most arches use the generic routine. x86_64 uses memcpy32 instead;
> this is substantially faster, even over a bus that is much slower than
> the CPU.
So did you run numbers against the C implementation with -funroll-loops ?
What were the results?
-Andi
On Tue, 2006-01-10 at 21:08 +0100, Andi Kleen wrote:
> On Tuesday 10 January 2006 20:53, Bryan O'Sullivan wrote:
> > Most arches use the generic routine. x86_64 uses memcpy32 instead;
> > this is substantially faster, even over a bus that is much slower than
> > the CPU.
>
> So did you run numbers against the C implementation with -funroll-loops ?
> What were the results?
The C implementation is about 5% slower when copying over
HyperTransport.
<b
On Tuesday 10 January 2006 21:53, Bryan O'Sullivan wrote:
> Introduce an x86_64-specific memcpy32 routine. The routine is similar
> to memcpy, but is guaranteed to work in units of 32 bits at a time.
>
> Signed-off-by: Bryan O'Sullivan <[email protected]>
>
> diff -r 2d4af213d9c5 -r b4863171295f arch/x86_64/kernel/x8664_ksyms.c
> --- a/arch/x86_64/kernel/x8664_ksyms.c Tue Jan 10 11:52:46 2006 -0800
> +++ b/arch/x86_64/kernel/x8664_ksyms.c Tue Jan 10 11:52:48 2006 -0800
> @@ -164,6 +164,8 @@
> EXPORT_SYMBOL(memcpy);
> EXPORT_SYMBOL(__memcpy);
>
> +EXPORT_SYMBOL_GPL(memcpy32);
> +
> #ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
> /* prototypes are wrong, these are assembly with custom calling functions */
> extern void rwsem_down_read_failed_thunk(void);
> diff -r 2d4af213d9c5 -r b4863171295f arch/x86_64/lib/Makefile
> --- a/arch/x86_64/lib/Makefile Tue Jan 10 11:52:46 2006 -0800
> +++ b/arch/x86_64/lib/Makefile Tue Jan 10 11:52:48 2006 -0800
> @@ -9,4 +9,4 @@
> lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
> usercopy.o getuser.o putuser.o \
> thunk.o clear_page.o copy_page.o bitstr.o bitops.o
> -lib-y += memcpy.o memmove.o memset.o copy_user.o
> +lib-y += memcpy.o memcpy32.o memmove.o memset.o copy_user.o
> diff -r 2d4af213d9c5 -r b4863171295f include/asm-x86_64/string.h
> --- a/include/asm-x86_64/string.h Tue Jan 10 11:52:46 2006 -0800
> +++ b/include/asm-x86_64/string.h Tue Jan 10 11:52:48 2006 -0800
> @@ -45,6 +45,9 @@
> #define __HAVE_ARCH_MEMMOVE
> void * memmove(void * dest,const void *src,size_t count);
>
> +/* copy data, 32 bits at a time */
> +void memcpy32(void *dst, const void *src, size_t count);
> +
> /* Use C out of line version for memcmp */
> #define memcmp __builtin_memcmp
> int memcmp(const void * cs,const void * ct,size_t count);
> diff -r 2d4af213d9c5 -r b4863171295f arch/x86_64/lib/memcpy32.S
> --- /dev/null Thu Jan 1 00:00:00 1970 +0000
> +++ b/arch/x86_64/lib/memcpy32.S Tue Jan 10 11:52:48 2006 -0800
> @@ -0,0 +1,39 @@
> +/*
> + * Copyright 2006 PathScale, Inc. All Rights Reserved.
> + *
> + * This file is free software; you can redistribute it and/or modify
> + * it under the terms of version 2 of the GNU General Public License
> + * as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software Foundation,
> + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
> + */
> +
> +/*
> + * Registers used below:
> + * dst - rdi
> + * src - rsi
> + * count - rdx
> + */
> +
> +/**
> + * memcpy32 - copy data, in units of 32 bits at a time
> + * @dst: destination (must be 32-bit aligned)
> + * @src: source (must be 32-bit aligned)
> + * @count: number of 32-bit quantities to copy
> + */
> + .globl memcpy32
> +memcpy32:
> + movl %edx,%ecx
> + shrl $1,%ecx
> + andl $1,%edx
> + rep movsq
> + movl %edx,%ecx
> + rep movsd
> + ret
movsq is not a 32bit move, it's a 64 bit one.
There are three possibilities here:
1) I misunderstand what memcpy32 means (I understand it to mean "it guarantees
that all accesses will be strictly 32bit")
2) On all current x86_64 hardware each 64bit access from/to
IO mapped addresses is always converted to two 32bit accesses.
3) code is buggy
If it is (1) or (2), consider adding a comment to spare future
readers the confusion.
--
vda
On Thu, 2006-01-12 at 10:38 +0200, Denis Vlasenko wrote:
> 2) On all current x86_64 hardware each 64bit access from/to
> IO mapped addresses is always converted to two 32bit accesses.
This is true for 64-bit writes over Hypertransport (reads don't get
split up this way), but not for PCI-Express memory writes, which remain
atomic 64-bit. I'll be converting the 64-bit accesses to 32-bit, as you
and Andi suggested.
<b
On Thu, Jan 12, 2006 at 08:04:41AM -0800, Bryan O'Sullivan wrote:
> This is true for 64-bit writes over Hypertransport
is this something that will always be or just something current
hardware does?
On Friday 13 January 2006 11:56, Chris Wedgwood wrote:
> On Thu, Jan 12, 2006 at 08:04:41AM -0800, Bryan O'Sullivan wrote:
>
> > This is true for 64-bit writes over Hypertransport
>
> is this something that will always be or just something current
> hardware does?
Yes, why risk things going wrong?
Also you'll get shorter code. Instead of
> + .globl memcpy32
> +memcpy32:
> + movl %edx,%ecx
> + shrl $1,%ecx
> + andl $1,%edx
> + rep movsq
> + movl %edx,%ecx
> + rep movsd
> + ret
you need just
.globl memcpy32
memcpy32:
movl %edx,%ecx
rep movsd
ret
With properly written inline asm, the code will be
reduced to just "rep movsd".
--
vda
On Fri, 2006-01-13 at 12:24 +0200, Denis Vlasenko wrote:
> you need just
>
> .globl memcpy32
> memcpy32:
> movl %edx,%ecx
> rep movsd
> ret
This is what the current version of the patches in -mm does.
<b