Following some discussion with Roland, and patterned after the style
anointed by Linus last week, here is a new version of the 32-bit MMIO
copy routine needed by our InfiniPath device.
The name of the routine has changed from memcpy_toio32 to
__raw_memcpy_toio32. This reflects the basic nature of the routine;
it does not guarantee the order in which writes are performed, nor does
it perform a memory barrier after it is done.
The reason for this is that our chip treats the first and last writes
to some MMIO regions specially; our driver performs those directly using
writel, and uses __raw_memcpy_toio32 for the bits in between.
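As an illustration of that usage pattern, here is a toy sketch (hand-written
for this note, with invented names; it is not code from the driver):

/*
 * Hypothetical sketch: the first and last 32-bit words of a chunk go
 * through writel, which the chip treats specially, and the words in
 * between go through __raw_memcpy_toio32.  Since the raw copy makes no
 * ordering or barrier guarantees, the caller orders things explicitly.
 */
static void toy_send(u32 __iomem *piobuf, const u32 *buf, size_t dwords)
{
	writel(buf[0], piobuf);
	__raw_memcpy_toio32(piobuf + 1, buf + 1, dwords - 2);
	wmb();	/* make sure the middle words are posted before the last */
	writel(buf[dwords - 1], piobuf + dwords - 1);
}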
Regarding the specialised x86_64 implementation, Andi Kleen asked me
to perform some measurements of its performance impact. It makes a
difference of about 5% in performance on moderately large copies over
the HyperTransport bus, compared to the generic implementation.
<b
Introduce an x86_64-specific memcpy32 routine. The routine is similar
to memcpy, but is guaranteed to work in units of 32 bits at a time.
Signed-off-by: Bryan O'Sullivan <[email protected]>
diff -r d286502c3b3c -r 33790477a163 arch/x86_64/kernel/x8664_ksyms.c
--- a/arch/x86_64/kernel/x8664_ksyms.c Fri Jan 6 12:25:00 2006 -0800
+++ b/arch/x86_64/kernel/x8664_ksyms.c Fri Jan 6 12:25:02 2006 -0800
@@ -164,6 +164,8 @@
EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(__memcpy);
+EXPORT_SYMBOL_GPL(memcpy32);
+
#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
/* prototypes are wrong, these are assembly with custom calling functions */
extern void rwsem_down_read_failed_thunk(void);
diff -r d286502c3b3c -r 33790477a163 arch/x86_64/lib/Makefile
--- a/arch/x86_64/lib/Makefile Fri Jan 6 12:25:00 2006 -0800
+++ b/arch/x86_64/lib/Makefile Fri Jan 6 12:25:02 2006 -0800
@@ -9,4 +9,4 @@
lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
usercopy.o getuser.o putuser.o \
thunk.o clear_page.o copy_page.o bitstr.o bitops.o
-lib-y += memcpy.o memmove.o memset.o copy_user.o
+lib-y += memcpy.o memcpy32.o memmove.o memset.o copy_user.o
diff -r d286502c3b3c -r 33790477a163 include/asm-x86_64/string.h
--- a/include/asm-x86_64/string.h Fri Jan 6 12:25:00 2006 -0800
+++ b/include/asm-x86_64/string.h Fri Jan 6 12:25:02 2006 -0800
@@ -45,6 +45,15 @@
#define __HAVE_ARCH_MEMMOVE
void * memmove(void * dest,const void *src,size_t count);
+/*
+ * memcpy32 - copy data, 32 bits at a time
+ *
+ * @dst: destination (must be 32-bit aligned)
+ * @src: source (must be 32-bit aligned)
+ * @count: number of 32-bit quantities to copy
+ */
+void memcpy32(void *dst, const void *src, size_t count);
+
/* Use C out of line version for memcmp */
#define memcmp __builtin_memcmp
int memcmp(const void * cs,const void * ct,size_t count);
diff -r d286502c3b3c -r 33790477a163 arch/x86_64/lib/memcpy32.S
--- /dev/null Thu Jan 1 00:00:00 1970 +0000
+++ b/arch/x86_64/lib/memcpy32.S Fri Jan 6 12:25:02 2006 -0800
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2006 PathScale, Inc. All Rights Reserved.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * memcpy32 - Copy a memory block, 32 bits at a time.
+ *
+ * This routine does not return anything.
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count (32-bit quantities to copy)
+ */
+
+ .globl memcpy32
+memcpy32:
+ movl %edx,%ecx
+ shrl $1,%ecx
+ andl $1,%edx
+ rep movsq
+ movl %edx,%ecx
+ rep movsd
+ ret
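For readers who do not read x86_64 assembly, the sequence above is roughly
equivalent to the following C sketch (illustrative only; the real work is
done by the rep movsq/movsd string instructions):

void memcpy32(void *dst, const void *src, size_t count)
{
	u64 *dq = dst;
	const u64 *sq = src;
	size_t qwords = count >> 1;	/* rep movsq: two 32-bit words at a time */

	while (qwords--)
		*dq++ = *sq++;
	if (count & 1)			/* rep movsd: one trailing 32-bit word */
		*(u32 *)dq = *(const u32 *)sq;
}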
Most arches use the generic routine from <asm-generic/raw_memcpy_toio32.h>, while
x86_64 uses memcpy32, which is substantially faster, even on a bus
that is substantially slower than the CPU.
Signed-off-by: Bryan O'Sullivan <[email protected]>
diff -r 33790477a163 -r 9e06b832c26c include/asm-alpha/io.h
--- a/include/asm-alpha/io.h Fri Jan 6 12:25:02 2006 -0800
+++ b/include/asm-alpha/io.h Fri Jan 6 12:25:04 2006 -0800
@@ -504,6 +504,8 @@
extern void memcpy_toio(volatile void __iomem *, const void *, long);
extern void _memset_c_io(volatile void __iomem *, unsigned long, long);
+#include <asm-generic/raw_memcpy_toio32.h>
+
static inline void memset_io(volatile void __iomem *addr, u8 c, long len)
{
_memset_c_io(addr, 0x0101010101010101UL * c, len);
diff -r 33790477a163 -r 9e06b832c26c include/asm-arm/io.h
--- a/include/asm-arm/io.h Fri Jan 6 12:25:02 2006 -0800
+++ b/include/asm-arm/io.h Fri Jan 6 12:25:04 2006 -0800
@@ -184,6 +184,8 @@
#define memset_io(c,v,l) _memset_io(__mem_pci(c),(v),(l))
#define memcpy_fromio(a,c,l) _memcpy_fromio((a),__mem_pci(c),(l))
#define memcpy_toio(c,a,l) _memcpy_toio(__mem_pci(c),(a),(l))
+
+#include <asm-generic/raw_memcpy_toio32.h>
#define eth_io_copy_and_sum(s,c,l,b) \
eth_copy_and_sum((s),__mem_pci(c),(l),(b))
diff -r 33790477a163 -r 9e06b832c26c include/asm-cris/io.h
--- a/include/asm-cris/io.h Fri Jan 6 12:25:02 2006 -0800
+++ b/include/asm-cris/io.h Fri Jan 6 12:25:04 2006 -0800
@@ -121,6 +121,8 @@
#define memcpy_fromio(a,b,c) memcpy((a),(void *)(b),(c))
#define memcpy_toio(a,b,c) memcpy((void *)(a),(b),(c))
+#include <asm-generic/raw_memcpy_toio32.h>
+
/*
* Again, CRIS does not require mem IO specific function.
*/
diff -r 33790477a163 -r 9e06b832c26c include/asm-frv/io.h
--- a/include/asm-frv/io.h Fri Jan 6 12:25:02 2006 -0800
+++ b/include/asm-frv/io.h Fri Jan 6 12:25:04 2006 -0800
@@ -124,6 +124,8 @@
memcpy((void __force *) dst, src, count);
}
+#include <asm-generic/raw_memcpy_toio32.h>
+
static inline uint8_t inb(unsigned long addr)
{
return __builtin_read8((void *)addr);
diff -r 33790477a163 -r 9e06b832c26c include/asm-h8300/io.h
--- a/include/asm-h8300/io.h Fri Jan 6 12:25:02 2006 -0800
+++ b/include/asm-h8300/io.h Fri Jan 6 12:25:04 2006 -0800
@@ -209,6 +209,8 @@
#define memcpy_fromio(a,b,c) memcpy((a),(void *)(b),(c))
#define memcpy_toio(a,b,c) memcpy((void *)(a),(b),(c))
+#include <asm-generic/raw_memcpy_toio32.h>
+
#define mmiowb()
#define inb(addr) ((h8300_buswidth(addr))?readw((addr) & ~1) & 0xff:readb(addr))
diff -r 33790477a163 -r 9e06b832c26c include/asm-i386/io.h
--- a/include/asm-i386/io.h Fri Jan 6 12:25:02 2006 -0800
+++ b/include/asm-i386/io.h Fri Jan 6 12:25:04 2006 -0800
@@ -203,6 +203,8 @@
{
__memcpy((void __force *) dst, src, count);
}
+
+#include <asm-generic/raw_memcpy_toio32.h>
/*
* ISA space is 'always mapped' on a typical x86 system, no need to
diff -r 33790477a163 -r 9e06b832c26c include/asm-ia64/io.h
--- a/include/asm-ia64/io.h Fri Jan 6 12:25:02 2006 -0800
+++ b/include/asm-ia64/io.h Fri Jan 6 12:25:04 2006 -0800
@@ -443,6 +443,8 @@
extern void memcpy_toio(volatile void __iomem *dst, const void *src, long n);
extern void memset_io(volatile void __iomem *s, int c, long n);
+#include <asm-generic/raw_memcpy_toio32.h>
+
#define dma_cache_inv(_start,_size) do { } while (0)
#define dma_cache_wback(_start,_size) do { } while (0)
#define dma_cache_wback_inv(_start,_size) do { } while (0)
diff -r 33790477a163 -r 9e06b832c26c include/asm-m32r/io.h
--- a/include/asm-m32r/io.h Fri Jan 6 12:25:02 2006 -0800
+++ b/include/asm-m32r/io.h Fri Jan 6 12:25:04 2006 -0800
@@ -216,6 +216,8 @@
memcpy((void __force *) dst, src, count);
}
+#include <asm-generic/raw_memcpy_toio32.h>
+
/*
* Convert a physical pointer to a virtual kernel pointer for /dev/mem
* access
diff -r 33790477a163 -r 9e06b832c26c include/asm-m68knommu/io.h
--- a/include/asm-m68knommu/io.h Fri Jan 6 12:25:02 2006 -0800
+++ b/include/asm-m68knommu/io.h Fri Jan 6 12:25:04 2006 -0800
@@ -113,6 +113,8 @@
#define memcpy_fromio(a,b,c) memcpy((a),(void *)(b),(c))
#define memcpy_toio(a,b,c) memcpy((void *)(a),(b),(c))
+#include <asm-generic/raw_memcpy_toio32.h>
+
#define inb(addr) readb(addr)
#define inw(addr) readw(addr)
#define inl(addr) readl(addr)
diff -r 33790477a163 -r 9e06b832c26c include/asm-mips/io.h
--- a/include/asm-mips/io.h Fri Jan 6 12:25:02 2006 -0800
+++ b/include/asm-mips/io.h Fri Jan 6 12:25:04 2006 -0800
@@ -534,6 +534,8 @@
memcpy((void __force *) dst, src, count);
}
+#include <asm-generic/raw_memcpy_toio32.h>
+
/*
* Memory Mapped I/O
*/
diff -r 33790477a163 -r 9e06b832c26c include/asm-parisc/io.h
--- a/include/asm-parisc/io.h Fri Jan 6 12:25:02 2006 -0800
+++ b/include/asm-parisc/io.h Fri Jan 6 12:25:04 2006 -0800
@@ -294,6 +294,8 @@
void memcpy_fromio(void *dst, const volatile void __iomem *src, int count);
void memcpy_toio(volatile void __iomem *dst, const void *src, int count);
+#include <asm-generic/raw_memcpy_toio32.h>
+
/* Support old drivers which don't ioremap.
* NB this interface is scheduled to disappear in 2.5
*/
diff -r 33790477a163 -r 9e06b832c26c include/asm-powerpc/io.h
--- a/include/asm-powerpc/io.h Fri Jan 6 12:25:02 2006 -0800
+++ b/include/asm-powerpc/io.h Fri Jan 6 12:25:04 2006 -0800
@@ -63,6 +63,8 @@
#define memcpy_fromio(a,b,c) iSeries_memcpy_fromio((a), (b), (c))
#define memcpy_toio(a,b,c) iSeries_memcpy_toio((a), (b), (c))
+#include <asm-generic/raw_memcpy_toio32.h>
+
#define inb(addr) readb(((void __iomem *)(long)(addr)))
#define inw(addr) readw(((void __iomem *)(long)(addr)))
#define inl(addr) readl(((void __iomem *)(long)(addr)))
diff -r 33790477a163 -r 9e06b832c26c include/asm-ppc/io.h
--- a/include/asm-ppc/io.h Fri Jan 6 12:25:02 2006 -0800
+++ b/include/asm-ppc/io.h Fri Jan 6 12:25:04 2006 -0800
@@ -367,6 +367,8 @@
}
#endif
+#include <asm-generic/raw_memcpy_toio32.h>
+
#define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void __force *)(void __iomem *)(b),(c),(d))
/*
diff -r 33790477a163 -r 9e06b832c26c include/asm-s390/io.h
--- a/include/asm-s390/io.h Fri Jan 6 12:25:02 2006 -0800
+++ b/include/asm-s390/io.h Fri Jan 6 12:25:04 2006 -0800
@@ -99,6 +99,8 @@
#define memcpy_fromio(a,b,c) memcpy((a),__io_virt(b),(c))
#define memcpy_toio(a,b,c) memcpy(__io_virt(a),(b),(c))
+#include <asm-generic/raw_memcpy_toio32.h>
+
#define inb_p(addr) readb(addr)
#define inb(addr) readb(addr)
diff -r 33790477a163 -r 9e06b832c26c include/asm-sh/io.h
--- a/include/asm-sh/io.h Fri Jan 6 12:25:02 2006 -0800
+++ b/include/asm-sh/io.h Fri Jan 6 12:25:04 2006 -0800
@@ -177,6 +177,8 @@
extern void memcpy_toio(unsigned long, const void *, unsigned long);
extern void memset_io(unsigned long, int, unsigned long);
+#include <asm-generic/raw_memcpy_toio32.h>
+
/* SuperH on-chip I/O functions */
static __inline__ unsigned char ctrl_inb(unsigned long addr)
{
diff -r 33790477a163 -r 9e06b832c26c include/asm-sh64/io.h
--- a/include/asm-sh64/io.h Fri Jan 6 12:25:02 2006 -0800
+++ b/include/asm-sh64/io.h Fri Jan 6 12:25:04 2006 -0800
@@ -125,6 +125,8 @@
void memcpy_toio(void __iomem *to, const void *from, long count);
void memcpy_fromio(void *to, void __iomem *from, long count);
+
+#include <asm-generic/raw_memcpy_toio32.h>
#define mmiowb()
diff -r 33790477a163 -r 9e06b832c26c include/asm-sparc/io.h
--- a/include/asm-sparc/io.h Fri Jan 6 12:25:02 2006 -0800
+++ b/include/asm-sparc/io.h Fri Jan 6 12:25:04 2006 -0800
@@ -239,6 +239,8 @@
#define memcpy_toio(d,s,sz) _memcpy_toio(d,s,sz)
+#include <asm-generic/raw_memcpy_toio32.h>
+
#ifdef __KERNEL__
/*
diff -r 33790477a163 -r 9e06b832c26c include/asm-sparc64/io.h
--- a/include/asm-sparc64/io.h Fri Jan 6 12:25:02 2006 -0800
+++ b/include/asm-sparc64/io.h Fri Jan 6 12:25:04 2006 -0800
@@ -440,6 +440,8 @@
#define memcpy_toio(d,s,sz) _memcpy_toio(d,s,sz)
+#include <asm-generic/raw_memcpy_toio32.h>
+
static inline int check_signature(void __iomem *io_addr,
const unsigned char *signature,
int length)
diff -r 33790477a163 -r 9e06b832c26c include/asm-v850/io.h
--- a/include/asm-v850/io.h Fri Jan 6 12:25:02 2006 -0800
+++ b/include/asm-v850/io.h Fri Jan 6 12:25:04 2006 -0800
@@ -130,6 +130,8 @@
#define memcpy_fromio(dst, src, len) memcpy (dst, (void *)src, len)
#define memcpy_toio(dst, src, len) memcpy ((void *)dst, src, len)
+#include <asm-generic/raw_memcpy_toio32.h>
+
/*
* Convert a physical pointer to a virtual kernel pointer for /dev/mem
* access
diff -r 33790477a163 -r 9e06b832c26c include/asm-x86_64/io.h
--- a/include/asm-x86_64/io.h Fri Jan 6 12:25:02 2006 -0800
+++ b/include/asm-x86_64/io.h Fri Jan 6 12:25:04 2006 -0800
@@ -252,6 +252,13 @@
__memcpy_toio((unsigned long)to,from,len);
}
+#include <asm/string.h>
+
+static inline void __raw_memcpy_toio32(void __iomem *dst, const void *src, size_t count)
+{
+ memcpy32((void __force *) dst, src, count);
+}
+
void memset_io(volatile void __iomem *a, int b, size_t c);
/*
diff -r 33790477a163 -r 9e06b832c26c include/asm-xtensa/io.h
--- a/include/asm-xtensa/io.h Fri Jan 6 12:25:02 2006 -0800
+++ b/include/asm-xtensa/io.h Fri Jan 6 12:25:04 2006 -0800
@@ -159,6 +159,8 @@
#define memcpy_fromio(a,b,c) memcpy((a),(void *)(b),(c))
#define memcpy_toio(a,b,c) memcpy((void *)(a),(b),(c))
+#include <asm-generic/raw_memcpy_toio32.h>
+
/* At this point the Xtensa doesn't provide byte swap instructions */
#ifdef __XTENSA_EB__
This arch-independent routine copies data to a memory-mapped I/O region,
using 32-bit accesses. It does not guarantee access ordering, nor does
it perform a memory barrier afterwards. This style of access is required
by some devices.
Signed-off-by: Bryan O'Sullivan <[email protected]>
diff -r ddf8ff83e4ac -r d286502c3b3c include/asm-generic/raw_memcpy_toio32.h
--- /dev/null Thu Jan 1 00:00:00 1970 +0000
+++ b/include/asm-generic/raw_memcpy_toio32.h Fri Jan 6 12:25:00 2006 -0800
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2006 PathScale, Inc. All Rights Reserved.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef _ASM_GENERIC_RAW_MEMCPYTOIO32_H
+#define _ASM_GENERIC_RAW_MEMCPYTOIO32_H
+
+/*
+ * __raw_memcpy_toio32 - copy data to MMIO space, in 32-bit units
+ *
+ * Order of access is not guaranteed, nor is a memory barrier performed
+ * afterwards. This is an arch-independent generic implementation.
+ *
+ * @to: destination, in MMIO space (must be 32-bit aligned)
+ * @from: source (must be 32-bit aligned)
+ * @count: number of 32-bit quantities to copy
+ */
+static inline void __raw_memcpy_toio32(void __iomem *to, const void *from,
+ size_t count)
+{
+	u32 __iomem *dst = to;
+	const u32 *src = from;
+ size_t i;
+
+ for (i = 0; i < count; i++)
+ __raw_writel(*src++, dst++);
+}
+
+#endif // _ASM_GENERIC_RAW_MEMCPYTOIO32_H
"Bryan O'Sullivan" <[email protected]> wrote:
>
> This arch-independent routine copies data to a memory-mapped I/O region,
> using 32-bit accesses. It does not guarantee access ordering, nor does
> it perform a memory barrier afterwards. This style of access is required
> by some devices.
Not providing ordering or barriers makes this a rather risky thing to export
- people might use it, find their driver "works" on one architecture, but
fails on another.
I guess the "__" is a decent warning of this, and the patch anticipates a
higher-level raw_memcpy_toio32() which implements those things, yes?
How come __raw_memcpy_toio32() is inlined?
Bryan> This arch-independent routine copies data to a
Bryan> memory-mapped I/O region, using 32-bit accesses. It does
Bryan> not guarantee access ordering, nor does it perform a memory
Bryan> barrier afterwards. This style of access is required by
Bryan> some devices.
Andrew> Not providing ordering or barriers makes this a rather
Andrew> risky thing to export - people might use it, find their
Andrew> driver "works" on one architecture, but fails on another.
Andrew> I guess the "__" is a decent warning of this, and the
Andrew> patch anticipates a higher-level raw_memcpy_toio32() which
Andrew> implements those things, yes?
I suggested the __raw_ name to parallel __raw_writel and friends. The
name for the version that ends with a write barrier would presumably
be just "memcpy_toio32()". Bryan could easily add that his patch as
well, even though the Pathscale driver will only use the __raw_ version.
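A hypothetical memcpy_toio32() along those lines could be as simple as the
following sketch (not part of any patch posted in this thread):

static inline void memcpy_toio32(void __iomem *to, const void *from,
				 size_t count)
{
	__raw_memcpy_toio32(to, from, count);
	wmb();	/* order the MMIO stores against whatever follows */
}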
Andrew> How come __raw_memcpy_toio32() is inlined?
That is a good question, especially since the optimized
x86_64-specific version is out-of-line. I suspect the answer is
mainly that that's the easiest way to stick it in a header in
include/asm-generic. I think it would be worth working a little
harder and making the generic version out-of-line.
On Tue, 2006-01-10 at 01:18 -0800, Andrew Morton wrote:
> "Bryan O'Sullivan" <[email protected]> wrote:
> >
> > This arch-independent routine copies data to a memory-mapped I/O region,
> > using 32-bit accesses. It does not guarantee access ordering, nor does
> > it perform a memory barrier afterwards. This style of access is required
> > by some devices.
>
> Not providing ordering or barriers makes this a rather risky thing to export
> - people might use it, find their driver "works" on one architecture, but
> fails on another.
The kdoc comments for the routine clearly state these limitations, so I
hope that between the comments and the naming, the risk is minimal.
> I guess the "__" is a decent warning of this, and the patch anticipates a
> higher-level raw_memcpy_toio32() which implements those things, yes?
It leaves room for it, yes, though I don't see much reason to add such a
routine until a driver specifically needs it.
> How come __raw_memcpy_toio32() is inlined?
There's no technical reason for it to be. I'm simply trying to find an
acceptable way to get the code into the tree that accommodates per-arch
implementations.
So let me rewind a little and state my problem.
My driver needs a copy routine that works in 32-bit chunks, writes to
MMIO space, doesn't care about ordering, and doesn't guarantee a memory
barrier. It also very much wants individual arches to be able to
implement this routine themselves; even though it's a small, simple
loop, we've benchmarked our x86_64 version as giving us 5% better
performance overall (i.e. visible to apps, not just microbenchmarks)
when doing reasonably large copies. I'd expect other arches to have
similar benefits.
I'm more than willing to recast the code into whatever form makes people
happy, but it would be greatly beneficial to me if it also met my
requirements.
So far, my attempts have been thus:
* out of line, with __HAVE_ARCH_XXX to avoid bloat on arches that
have specialised implementations - __HAVE_ARCH_XXX is out of
style
* out of line, unconditional - made people unhappy on bloat
grounds, since arches that have specialised implementations end
up with an extra unneeded routine
* inline, apparently in Linus's preferred style - an inline that
isn't really necessary
For a routine whose C implementation is six lines long, it's had an
impressive submission history :-)
What do you suggest I try next? I'll be happy to try a different tack.
<b
On Tue, 2006-01-10 at 06:55 -0800, Roland Dreier wrote:
> That is a good question, especially since the optimized
> x86_64-specific version is out-of-line. I suspect the answer is
> mainly that that's the easiest way to stick it in a header in
> include/asm-generic.
Yes, this is correct.
> I think it would be worth working a little
> harder and making the generic version out-of-line.
I'm fine with doing that, but I wonder what an appropriate way to do it
would be.
Really, we'd like the generic implementation to be declared in
asm-generic and defined in lib. Each arch that needed the generic
version could then have its arch/XXX/lib/Makefile modified to pull in
the generic version from lib, while arches that had special versions
could remain unencumbered.
The only problem with this is that it's an unusual approach, so I don't
have any obvious examples to copy. The closest I can think of is
arch/x86_64/kernel/Makefile, which pulls in routines from the i386 tree
like this:
bootflag-y += ../../i386/kernel/bootflag.o
So say arch/ia64/lib/Makefile, for example, could have a line like this:
obj-y += ../../../lib/raw_memcpy_toio32.o
Sam, do you think this is safe to do in general with respect to kbuild?
Additionally, does it meet everyone's needs in terms of being generic,
safe, in good style, and keeping bloat to a minimum?
<b
On Tue, 2006-01-10 at 08:07 -0800, Bryan O'Sullivan wrote:
> The only problem with this is that it's an unusual approach, so I don't
> have any obvious examples to copy.
It was easy to hack up a quick patch to try the idea out.
This single-arch patch is for review purposes only; I compile-tested it
on i386, and it has no obvious problems at compile- or link-time.
If it looks OK, I have a complete patch set that affects all arches.
The parts that affect arch/*/lib/Makefile and include/asm-*/io.h are
essentially identical in all cases.
<b
diff -r 48616306e7bd include/asm-generic/raw_memcpy_toio32.h
--- /dev/null Thu Jan 1 00:00:00 1970 +0000
+++ b/include/asm-generic/raw_memcpy_toio32.h Tue Jan 10 08:44:30 2006 -0800
@@ -0,0 +1,16 @@
+#ifndef _ASM_GENERIC_RAW_MEMCPYTOIO32_H
+#define _ASM_GENERIC_RAW_MEMCPYTOIO32_H
+
+/*
+ * __raw_memcpy_toio32 - copy data to MMIO space, in 32-bit units
+ *
+ * Order of access is not guaranteed, nor is a memory barrier performed
+ * afterwards. This is an arch-independent generic implementation.
+ *
+ * @to: destination, in MMIO space (must be 32-bit aligned)
+ * @from: source (must be 32-bit aligned)
+ * @count: number of 32-bit quantities to copy
+ */
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
+
+#endif // _ASM_GENERIC_RAW_MEMCPYTOIO32_H
--- /dev/null Thu Jan 1 00:00:00 1970 +0000
+++ b/lib/raw_memcpy_toio32.c Tue Jan 10 08:44:30 2006 -0800
@@ -0,0 +1,13 @@
+#include <linux/types.h>
+#include <asm/io.h>
+#include <asm-generic/raw_memcpy_toio32.h>
+
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count)
+{
+ u32 __iomem *dst = to;
+ const u32 *src = from;
+ size_t i;
+
+ for (i = 0; i < count; i++)
+ __raw_writel(*src++, dst++);
+}
--- a/arch/i386/lib/Makefile Tue Jan 10 08:44:35 2006 -0800
+++ b/arch/i386/lib/Makefile Tue Jan 10 08:44:44 2006 -0800
@@ -5,5 +5,6 @@
lib-y = checksum.o delay.o usercopy.o getuser.o putuser.o memcpy.o strstr.o \
bitops.o
+lib-y += ../../../lib/raw_memcpy_toio32.o
lib-$(CONFIG_X86_USE_3DNOW) += mmx.o
--- a/include/asm-i386/io.h Tue Jan 10 08:44:35 2006 -0800
+++ b/include/asm-i386/io.h Tue Jan 10 08:44:44 2006 -0800
@@ -203,6 +203,8 @@
{
__memcpy((void __force *) dst, src, count);
}
+
+#include <asm-generic/raw_memcpy_toio32.h>
/*
* ISA space is 'always mapped' on a typical x86 system, no need to
On Tue, Jan 10, 2006 at 08:07:56AM -0800, Bryan O'Sullivan wrote:
> I'm fine with doing that, but I wonder what an appropriate way to do it
> would be.
>
> Really, we'd like the generic implementation to be declared in
> asm-generic and defined in lib. Each arch that needed the generic
> version could then have its arch/XXX/lib/Makefile modified to pull in
> the generic version from lib, while arches that had special versions
> could remain unencumbered.
Or add a CONFIG_GENERIC_MEMCPY_IO that's non-user-visible and just set
by all the architectures that don't provide their own version. And once
we're at that level of complexity we should really add the _fromio version
as well ;-)
On Tue, 2006-01-10 at 17:07 +0000, Christoph Hellwig wrote:
> Or add a CONFIG_GENERIC_MEMCPY_IO that's non-user-visible and just set
> by all the architectures that don't provide their own version.
That's only very slightly different from the RFC patch I just posted.
If you think it would be a preferable way to express it, that's fine by
me. It would at least keep me from crapping on every arch's
lib/Makefile.
> And once
> we're at that level of complexity we should really add the _fromio version
> as well
I'm already being raked over the coals enough for a routine that has an
actual user waiting in the wings :-)
<b
On Tue, 2006-01-10 at 17:07 +0000, Christoph Hellwig wrote:
> Or add a CONFIG_GENERIC_MEMCPY_IO that's non-user-visible and just set
> by all the architectures that don't provide their own version.
Here's another i386-only review patch that does essentially that. It
looks cleaner to me than my previous patch from this morning.
(Copyrights and other arches omitted, for clarity.)
What do you think?
<b
diff -r 48616306e7bd lib/Makefile
--- a/lib/Makefile Tue Jan 10 10:41:42 2006 +0800
+++ b/lib/Makefile Tue Jan 10 09:32:53 2006 -0800
@@ -21,6 +21,7 @@
lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o
lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
+lib-$(CONFIG_GENERIC_RAW_MEMCPY_IO) += raw_memcpy_io.o
obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
--- /dev/null Thu Jan 1 00:00:00 1970 +0000
+++ b/include/asm-generic/raw_memcpy_io.h Tue Jan 10 09:32:53 2006 -0800
@@ -0,0 +1,16 @@
+#ifndef _ASM_GENERIC_RAW_MEMCPY_IO_H
+#define _ASM_GENERIC_RAW_MEMCPY_IO_H
+
+/*
+ * __raw_memcpy_toio32 - copy data to MMIO space, in 32-bit units
+ *
+ * Order of access is not guaranteed, nor is a memory barrier performed
+ * afterwards. This is an arch-independent generic implementation.
+ *
+ * @to: destination, in MMIO space (must be 32-bit aligned)
+ * @from: source (must be 32-bit aligned)
+ * @count: number of 32-bit quantities to copy
+ */
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
+
+#endif // _ASM_GENERIC_RAW_MEMCPY_IO_H
diff -r 48616306e7bd lib/raw_memcpy_io.c
--- /dev/null Thu Jan 1 00:00:00 1970 +0000
+++ b/lib/raw_memcpy_io.c Tue Jan 10 09:32:53 2006 -0800
@@ -0,0 +1,13 @@
+#include <linux/types.h>
+#include <asm/io.h>
+#include <asm-generic/raw_memcpy_io.h>
+
+void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count)
+{
+ u32 __iomem *dst = to;
+ const u32 *src = from;
+ size_t i;
+
+ for (i = 0; i < count; i++)
+ __raw_writel(*src++, dst++);
+}
--- a/include/asm-i386/io.h Tue Jan 10 09:32:58 2006 -0800
+++ b/include/asm-i386/io.h Tue Jan 10 09:35:16 2006 -0800
@@ -203,6 +203,8 @@
{
__memcpy((void __force *) dst, src, count);
}
+
+#include <asm-generic/raw_memcpy_io.h>
/*
* ISA space is 'always mapped' on a typical x86 system, no need to
--- a/arch/i386/Kconfig Tue Jan 10 09:32:58 2006 -0800
+++ b/arch/i386/Kconfig Tue Jan 10 09:35:16 2006 -0800
@@ -34,6 +34,10 @@
default y
config GENERIC_IOMAP
+ bool
+ default y
+
+config GENERIC_RAW_MEMCPY_IO
bool
default y
On Tue, Jan 10, 2006 at 09:49:46AM -0800, Bryan O'Sullivan wrote:
> On Tue, 2006-01-10 at 17:07 +0000, Christoph Hellwig wrote:
>
> > > Or add a CONFIG_GENERIC_MEMCPY_IO that's non-user-visible and just set
> > by all the architectures that don't provide their own version.
>
> Here's another i386-only review patch that does essentially that. It
> looks cleaner to me than my previous patch from this morning.
> (Copyrights and other arches omitted, for clarity.)
>
> What do you think?
>
> <b
>
> diff -r 48616306e7bd lib/Makefile
> --- a/lib/Makefile Tue Jan 10 10:41:42 2006 +0800
> +++ b/lib/Makefile Tue Jan 10 09:32:53 2006 -0800
> @@ -21,6 +21,7 @@
> lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
> lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o
> lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
> +lib-$(CONFIG_GENERIC_RAW_MEMCPY_IO) += raw_memcpy_io.o
> obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
> obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
>
> --- /dev/null Thu Jan 1 00:00:00 1970 +0000
> +++ b/include/asm-generic/raw_memcpy_io.h Tue Jan 10 09:32:53 2006 -0800
> @@ -0,0 +1,16 @@
> +#ifndef _ASM_GENERIC_RAW_MEMCPY_IO_H
> +#define _ASM_GENERIC_RAW_MEMCPY_IO_H
> +
> +/*
> + * __raw_memcpy_toio32 - copy data to MMIO space, in 32-bit units
> + *
> + * Order of access is not guaranteed, nor is a memory barrier performed
> + * afterwards. This is an arch-independent generic implementation.
> + *
> + * @to: destination, in MMIO space (must be 32-bit aligned)
> + * @from: source (must be 32-bit aligned)
> + * @count: number of 32-bit quantities to copy
> + */
This should be in the implementation file, not near the prototype.
And needs to start with /** to be valid kernel doc.
> +void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
and without that comment I'd suggest just adding this to every asm/io.h
instead of an asm-generic header for just one prototype.
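In other words, each architecture's asm/io.h would carry the one-line
prototype itself (a sketch of the suggestion, not a posted patch):

/* in every asm/io.h, instead of including an asm-generic header */
void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);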
On Tue, 2006-01-10 at 17:51 +0000, Christoph Hellwig wrote:
> This should be in the implementation file, not near the prototype.
> And needs to start with /** to be valid kernel doc.
OK, thanks.
> > +void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
>
> and without that comment I'd suggest just adding this to every asm/io.h
> instead of an asm-generic header for just one prototype.
OK, that header file will vanish.
<b
On Tue, 10 Jan 2006, Christoph Hellwig wrote:
> On Tue, Jan 10, 2006 at 09:49:46AM -0800, Bryan O'Sullivan wrote:
> > On Tue, 2006-01-10 at 17:07 +0000, Christoph Hellwig wrote:
> >
> > > Or add a CONFIG_GENERIC_MEMCPY_IO that's non-user-visible and just set
> > > by all the architectures that don't provide their own version.
> >
> > diff -r 48616306e7bd lib/Makefile
> > --- a/lib/Makefile Tue Jan 10 10:41:42 2006 +0800
> > +++ b/lib/Makefile Tue Jan 10 09:32:53 2006 -0800
> > @@ -21,6 +21,7 @@
> > lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
> > lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o
> > lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
> > +lib-$(CONFIG_GENERIC_RAW_MEMCPY_IO) += raw_memcpy_io.o
> > obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
> > obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
> >
> > --- /dev/null Thu Jan 1 00:00:00 1970 +0000
> > +++ b/include/asm-generic/raw_memcpy_io.h Tue Jan 10 09:32:53 2006 -0800
> > @@ -0,0 +1,16 @@
> > +#ifndef _ASM_GENERIC_RAW_MEMCPY_IO_H
> > +#define _ASM_GENERIC_RAW_MEMCPY_IO_H
> > +
> > +/*
> > + * __raw_memcpy_toio32 - copy data to MMIO space, in 32-bit units
> > + *
> > + * Order of access is not guaranteed, nor is a memory barrier performed
> > + * afterwards. This is an arch-independent generic implementation.
> > + *
> > + * @to: destination, in MMIO space (must be 32-bit aligned)
> > + * @from: source (must be 32-bit aligned)
> > + * @count: number of 32-bit quantities to copy
> > + */
>
> This should be in the implementation file, not near the prototype.
> And needs to start with /** to be valid kernel doc.
and the function args need to follow the first line, then a blank ('*')
line, then the longer function description/notes.
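Taken together, these comments mean the comment moves into the .c file and
ends up laid out roughly like this (a sketch of the expected kernel-doc
form, not a quote from any posted patch):

/**
 * __raw_memcpy_toio32 - copy data to MMIO space, in 32-bit units
 * @to: destination, in MMIO space (must be 32-bit aligned)
 * @from: source (must be 32-bit aligned)
 * @count: number of 32-bit quantities to copy
 *
 * Order of access is not guaranteed, nor is a memory barrier performed
 * afterwards.
 */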
> > +void __raw_memcpy_toio32(void __iomem *to, const void *from, size_t count);
>
> and without that comment I'd suggest just adding this to every asm/io.h
> instead of an asm-generic header for just one prototype.
--
~Randy
On Tue, Jan 10, 2006 at 08:07:56AM -0800, Bryan O'Sullivan wrote:
> The only problem with this is that it's an unusual approach, so I don't
> have any obvious examples to copy. The closest I can think of is
> arch/x86_64/kernel/Makefile, which pulls in routines from the i386 tree
> like this:
>
> bootflag-y += ../../i386/kernel/bootflag.o
>
> So say arch/ia64/lib/Makefile, for example, could have a line like this:
>
> obj-y += ../../../lib/raw_memcpy_toio32.o
>
> Sam, do you think this is safe to do in general with respect to kbuild?
It is safe if you pull in .o files from a directory that you otherwise
do not visit. But pulling in .o files that can/will be built
by another Makefile is doomed.
But seeing other mails in this thread the final solution does not use
this anyway.
Sam
"Bryan O'Sullivan" <[email protected]> wrote:
>
> OK, that header file will vanish.
>
It's kinda fun playing Bryan along like this ;)
One option is to just stick the thing in an existing lib/ or kernel/ file
and mark it __attribute__((weak)). That way architectures can override it
for free with no ifdefs, no Makefile trickery, no Kconfig trickery, etc.
attribute(weak) is creepily useful - I keep waiting for something to go
wrong with it.
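A sketch of the weak-symbol approach (hypothetical file placement; the
names are only suggestive):

/* lib/raw_memcpy_io.c: generic fallback, marked weak */
void __attribute__((weak)) __raw_memcpy_toio32(void __iomem *to,
					       const void *from,
					       size_t count)
{
	u32 __iomem *dst = to;
	const u32 *src = from;

	while (count--)
		__raw_writel(*src++, dst++);
}

/*
 * An architecture that wants its own version simply defines a non-weak
 * __raw_memcpy_toio32 with the same signature; the linker then prefers
 * the strong definition and drops the weak one.
 */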
On Tue, 2006-01-10 at 14:05 -0800, Andrew Morton wrote:
> It's kinda fun playing Bryan along like this ;)
A regular barrel of monkeys, indeed...
> One option is to just stick the thing in an existing lib/ or kernel/ file
> and mark it __attribute__((weak)). That way architectures can override it
> for free with no ifdefs, no Makefile trickery, no Kconfig trickery, etc.
I'm easy. Would you prefer to take that, or the Kconfig-trickery-based
patch series I already posted earlier?
<b
"Bryan O'Sullivan" <[email protected]> wrote:
>
> On Tue, 2006-01-10 at 14:05 -0800, Andrew Morton wrote:
>
> > It's kinda fun playing Bryan along like this ;)
>
> A regular barrel of monkeys, indeed...
>
> > One option is to just stick the thing in an existing lib/ or kernel/ file
> > and mark it __attribute__((weak)). That way architectures can override it
> > for free with no ifdefs, no Makefile trickery, no Kconfig trickery, etc.
>
> I'm easy. Would you prefer to take that, or the Kconfig-trickery-based
> patch series I already posted earlier?
>
Unless someone can think of a problem with attribute(weak), I think you'll
find that it's the simplest-by-far solution.
On Tue, 2006-01-10 at 15:32 -0800, Andrew Morton wrote:
> Unless someone can think of a problem with attribute(weak), I think you'll
> find that it's the simplest-by-far solution.
The only problem I can see with this is that on x86_64 and other
platforms that reimplement the routine as an inline function, I think
we'll be left with a small hunk of dead code in the form of the
out-of-line version in lib/ that never gets referenced.
Is this something people care about? If so, I could turn the config
setting in my last patch on its head, and use it to indicate that the
routine should *not* be built for a particular arch. This would make
lib/Makefile slightly uglier, but would avoid cluttering every other
arch's lib/Makefile and Kconfig file.
<b
On Wed, Jan 11, 2006 at 09:20:32AM -0800, Bryan O'Sullivan wrote:
> On Tue, 2006-01-10 at 15:32 -0800, Andrew Morton wrote:
>
> > Unless someone can think of a problem with attribute(weak), I think you'll
> > find that it's the simplest-by-far solution.
>
> The only problem I can see with this is that on x86_64 and other
> platforms that reimplement the routine as an inline function, I think
> we'll be left with a small hunk of dead code in the form of the
> out-of-line version in lib/ that never gets referenced.
If it is not referenced the linker should not pull it in from lib.a -
no?
Sam
Sam Ravnborg <[email protected]> wrote:
>
> On Wed, Jan 11, 2006 at 09:20:32AM -0800, Bryan O'Sullivan wrote:
> > On Tue, 2006-01-10 at 15:32 -0800, Andrew Morton wrote:
> >
> > > Unless someone can think of a problem with attribute(weak), I think you'll
> > > find that it's the simplest-by-far solution.
> >
> > The only problem I can see with this is that on x86_64 and other
> > platforms that reimplement the routine as an inline function, I think
> > we'll be left with a small hunk of dead code in the form of the
> > out-of-line version in lib/ that never gets referenced.
Sure, attribute(weak) assumes that nobody wants to implement the thing as an
inline. Are you sure that we want to do that? After all, copying memory is
sloooooow, and copying IO memory probably has even more "o"'s.
> If it is not referenced the linker should not pull it in from lib.a -
> no?
If it's in its own .o file. But it might be referenced from modules, so
we must always include it in vmlinux if we compiled it.
On Wed, 2006-01-11 at 09:30 -0800, Andrew Morton wrote:
> Sure, attribute(weak) assumes that nobody wants to implement the thing as an
> inline. Are you sure that we want to do that?
I'll take a look at whether the extra call/ret indirection affects
performance in a measurable fashion.
<b
Bryan> I'll take a look at whether the extra call/ret indirection
Bryan> affects performance in a measurable fashion.
Your current implementation is out-of-line, right?
I would be surprised if calling a function has any overhead on x86_64,
since the function call is a jump that can be predicted perfectly.
The only issue is the code to shuffle values into the right registers.
- R.
On Wed, 2006-01-11 at 10:49 -0800, Roland Dreier wrote:
> Your current implementation is out-of-line, right?
The memcpy32 routine is, but __raw_memcpy_toio32 simply calls it, so we
have two jump/ret pairs instead of one.
> I would be surprised if calling a function has any overhead on x86_64,
> since the function call is a jump that can be predicted perfectly.
Indeed.
<b
Bryan> The memcpy32 routine is, but __raw_memcpy_toio32 simply
Bryan> calls it, so we have two jump/ret pairs instead of one.
Oh, I think you're misunderstanding Andrew's idea. Just create a
generic __raw_memcpy_toio32() that is always compiled, but mark it
with attribute((weak)). Then x86_64 can define its own version of
__raw_memcpy_toio32(), which will override the weak generic version.
- R.
On Wed, 2006-01-11 at 11:01 -0800, Roland Dreier wrote:
> Oh, I think you're misunderstanding Andrew's idea. Just create a
> generic __raw_memcpy_toio32() that is always compiled, but mark it
> with attribute((weak)). Then x86_64 can define its own version of
> __raw_memcpy_toio32(), which will override the weak generic version.
No, I understood that. But my original x86_64 routine was inline, which
would have left the out-of-line version compiled, but not used, on
x86_64.
<b
On Tue, Jan 10, 2006 at 03:32:57PM -0800, Andrew Morton wrote:
> "Bryan O'Sullivan" <[email protected]> wrote:
> >
> > On Tue, 2006-01-10 at 14:05 -0800, Andrew Morton wrote:
> >
> > > > It's kinda fun playing Bryan along like this ;)
> >
> > A regular barrel of monkeys, indeed...
> >
> > > One option is to just stick the thing in an existing lib/ or kernel/ file
> > > and mark it __attribute__((weak)). That way architectures can override it
> > > for free with no ifdefs, no Makefile trickery, no Kconfig trickery, etc.
> >
> > I'm easy. Would you prefer to take that, or the Kconfig-trickery-based
> > patch series I already posted earlier?
> >
>
> Unless someone can think of a problem with attribute(weak), I think you'll
> find that it's the simplest-by-far solution.
__attribute__((weak)) can turn compile errors into runtime errors - you
won't notice at compile time if someone forgets to build the
non-weak version into the kernel (e.g. due to a typo in the Makefile).
Patch 05/17 from the 2.6.15.1 patchset contains a fix for such a bug
present in 2.6.15.
A variation of this problem can occur in cases like __raw_memcpy_toio32:
if the optimized non-weak version is accidentally left out of the build,
the kernel silently falls back to the non-optimized one. That's not fatal,
but it might take years until someone notices that there might be a few
percent of performance missing.
cu
Adrian