strncmp is defined in assembly for bootup, but it is not defined in the
normal running kernel. This patch takes the strncmp code from the bootup
and copies it to the kernel proper.
Signed-off-by: Steven Rostedt <[email protected]>
---
arch/powerpc/kernel/ppc_ksyms.c | 1 +
arch/powerpc/lib/string.S | 12 ++++++++++++
include/asm-powerpc/string.h | 2 ++
3 files changed, 15 insertions(+)
Index: linux-sched-devel.git/arch/powerpc/kernel/ppc_ksyms.c
===================================================================
--- linux-sched-devel.git.orig/arch/powerpc/kernel/ppc_ksyms.c 2008-02-27 14:01:38.000000000 -0800
+++ linux-sched-devel.git/arch/powerpc/kernel/ppc_ksyms.c 2008-02-29 07:24:22.000000000 -0800
@@ -78,6 +78,7 @@ EXPORT_SYMBOL(strncpy);
EXPORT_SYMBOL(strcat);
EXPORT_SYMBOL(strlen);
EXPORT_SYMBOL(strcmp);
+EXPORT_SYMBOL(strncmp);
EXPORT_SYMBOL(csum_partial);
EXPORT_SYMBOL(csum_partial_copy_generic);
Index: linux-sched-devel.git/arch/powerpc/lib/string.S
===================================================================
--- linux-sched-devel.git.orig/arch/powerpc/lib/string.S 2008-02-27 14:01:38.000000000 -0800
+++ linux-sched-devel.git/arch/powerpc/lib/string.S 2008-02-29 07:24:22.000000000 -0800
@@ -75,6 +75,18 @@ _GLOBAL(strcmp)
beq 1b
blr
+_GLOBAL(strncmp)
+ mtctr r5
+ addi r5,r3,-1
+ addi r4,r4,-1
+1: lbzu r3,1(r5)
+ cmpwi 1,r3,0
+ lbzu r0,1(r4)
+ subf. r3,r0,r3
+ beqlr 1
+ bdnzt eq,1b
+ blr
+
_GLOBAL(strlen)
addi r4,r3,-1
1: lbzu r0,1(r4)
Index: linux-sched-devel.git/include/asm-powerpc/string.h
===================================================================
--- linux-sched-devel.git.orig/include/asm-powerpc/string.h 2008-02-27 14:01:58.000000000 -0800
+++ linux-sched-devel.git/include/asm-powerpc/string.h 2008-02-29 07:24:22.000000000 -0800
@@ -7,6 +7,7 @@
#define __HAVE_ARCH_STRNCPY
#define __HAVE_ARCH_STRLEN
#define __HAVE_ARCH_STRCMP
+#define __HAVE_ARCH_STRNCMP
#define __HAVE_ARCH_STRCAT
#define __HAVE_ARCH_MEMSET
#define __HAVE_ARCH_MEMCPY
@@ -18,6 +19,7 @@ extern char * strcpy(char *,const char *
extern char * strncpy(char *,const char *, __kernel_size_t);
extern __kernel_size_t strlen(const char *);
extern int strcmp(const char *,const char *);
+extern int strncmp(const char *,const char *,__kernel_size_t);
extern char * strcat(char *, const char *);
extern void * memset(void *,int,__kernel_size_t);
extern void * memcpy(void *,const void *,__kernel_size_t);
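A rough C rendering of the assembly loop above, for illustration only: this
sketch is not part of the patch, and the name strncmp_asm_sketch is invented.
It mirrors the assembly's behaviour, with each byte loaded zero-extended by
lbzu and the count fed straight into the CTR-driven loop set up by
mtctr/bdnzt.

/*
 * Illustration only: a rough C equivalent of the strncmp loop above,
 * not the code that was posted or merged.  The bytes are loaded
 * zero-extended (lbzu), so the per-byte comparison is effectively
 * done on unsigned char values.
 */
int strncmp_asm_sketch(const char *s1, const char *s2, unsigned long len)
{
	unsigned long ctr = len;		/* mtctr r5 */
	int c1, c2;

	do {
		c1 = (unsigned char)*s1++;	/* lbzu r3,1(r5) */
		c2 = (unsigned char)*s2++;	/* lbzu r0,1(r4) */
		if (c1 == 0)			/* beqlr 1: hit the NUL in s1 */
			break;
	} while (--ctr != 0 && c1 == c2);	/* bdnzt eq,1b: like the CTR,
						 * a len of 0 wraps here */
	return c1 - c2;				/* subf. r3,r0,r3; blr */
}

Note that nothing special happens for a count of zero, which comes up later
in the thread.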
On Fri, 2008-02-29 at 11:04 -0500, Steven Rostedt wrote:
> strncmp is defined in assembly for bootup, but it is not defined in the
> normal running kernel. This patch takes the strncmp code from the bootup
> and copies it to the kernel proper.
>
> Signed-off-by: Steven Rostedt <[email protected]>
> ---
Do we have any indication that it performs better than the C one ?
Ben.
On Sat, 1 Mar 2008, Benjamin Herrenschmidt wrote:
>
> Do we have any indication that it performs better than the C one ?
See below.
>
> Ben.
>
> >
> > +_GLOBAL(strncmp)
> > + mtctr r5
> > + addi r5,r3,-1
> > + addi r4,r4,-1
> > +1: lbzu r3,1(r5)
> > + cmpwi 1,r3,0
> > + lbzu r0,1(r4)
> > + subf. r3,r0,r3
> > + beqlr 1
> > + bdnzt eq,1b
> > + blr
> > +
And here's the objdump of the C version:
0000000000000080 <.strncmp>:
80: fb e1 ff f0 std r31,-16(r1)
84: f8 21 ff c1 stdu r1,-64(r1)
88: 7c 69 1b 78 mr r9,r3
8c: 7c a0 2b 79 mr. r0,r5
90: 38 60 00 00 li r3,0
94: 7c 09 03 a6 mtctr r0
98: 7c 3f 0b 78 mr r31,r1
9c: 41 82 00 68 beq- 104 <.strncmp+0x84>
a0: 89 69 00 00 lbz r11,0(r9)
a4: 88 04 00 00 lbz r0,0(r4)
a8: 7c 00 58 50 subf r0,r0,r11
ac: 78 00 06 20 clrldi r0,r0,56
b0: 2f a0 00 00 cmpdi cr7,r0,0
b4: 7c 00 07 74 extsb r0,r0
b8: 7c 03 03 78 mr r3,r0
bc: 40 9e 00 48 bne- cr7,104 <.strncmp+0x84>
c0: 2f ab 00 00 cmpdi cr7,r11,0
c4: 41 9e 00 40 beq- cr7,104 <.strncmp+0x84>
c8: 38 84 00 01 addi r4,r4,1
cc: 38 69 00 01 addi r3,r9,1
d0: 42 40 00 30 bdz- 100 <.strncmp+0x80>
d4: 88 03 00 00 lbz r0,0(r3)
d8: 89 24 00 00 lbz r9,0(r4)
dc: 38 63 00 01 addi r3,r3,1
e0: 38 84 00 01 addi r4,r4,1
e4: 2f 20 00 00 cmpdi cr6,r0,0
e8: 7c 09 00 50 subf r0,r9,r0
ec: 78 00 06 20 clrldi r0,r0,56
f0: 2f a0 00 00 cmpdi cr7,r0,0
f4: 7c 00 07 74 extsb r0,r0
f8: 40 9e 00 08 bne- cr7,100 <.strncmp+0x80>
fc: 40 9a ff d4 bne+ cr6,d0 <.strncmp+0x50>
100: 7c 03 03 78 mr r3,r0
104: e8 21 00 00 ld r1,0(r1)
108: eb e1 ff f0 ld r31,-16(r1)
10c: 4e 80 00 20 blr
I'll let you decide ;-)
Even if it was logically faster (which I still doubt) it's a hell of a lot
of cache lines to waste.
-- Steve
On Fri, Feb 29, 2008 at 10:56:45PM -0500, Steven Rostedt wrote:
> Even if it was logically faster (which I still doubt) it's a hell of a lot
> of cache lines to waste.
Indeed, but there are some corner cases that the C code handles, like a
length of 0, which may lead to an infinite loop in the asm code.
OTOH, I'm a bit surprised by the extsb instructions in the compiler-generated
code. We don't compile with -fsigned-char, do we? The clrldi
instructions are also extremely stupid.
Now that I think a bit more about it, I believe that the C version is
incorrect: the clrldi/extsb dance takes a value between -255 and +255
and collapses it into the -128 to 127 range, meaning that the return
value may be wrong if we rely on the sign of the result. So unless I am
missing something, the problem is much more serious than just stupid code
(I just had a look at the libc version in C, and characters are cast to
unsigned char before the comparison).
Regards,
Gabriel
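To make the failure mode concrete, here is a small user-space sketch, not
from the thread: the name strncmp_truncating and the test bytes are invented
for illustration. It squeezes the per-byte difference through a signed char,
which is what the clrldi/extsb pair in the objdump above effectively does,
and the sign of the result flips.

#include <stdio.h>

/*
 * Illustration only: a strncmp whose per-byte difference is truncated
 * to a signed char, mimicking the clrldi/extsb sequence in the
 * compiler output quoted above.
 */
static int strncmp_truncating(const char *s1, const char *s2, unsigned long n)
{
	signed char res = 0;

	while (n--) {
		res = (unsigned char)*s1 - (unsigned char)*s2++;
		if (res != 0 || !*s1++)
			break;
	}
	return res;
}

int main(void)
{
	/* 0xC8 (200) vs 0x01: a conforming strncmp must return a
	 * positive value, because the bytes compare as unsigned char. */
	const char a[] = "\xc8";
	const char b[] = "\x01";

	/* 200 - 1 = 199, but 199 forced into a signed char comes out
	 * as -57 on the usual two's-complement targets: wrong sign. */
	printf("truncating strncmp: %d\n", strncmp_truncating(a, b, 1));
	return 0;
}

The same inputs fed to the C library's strncmp give a positive result, in
line with the unsigned char behaviour Andreas points out below.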
Gabriel Paubert <[email protected]> writes:
> Now that I think a bit more about it, I believe that the C version is
> incorrect: the clrldi/extsb dance takes a value between -255 and +255
> and collapses it into the -128 to 127 range, meaning that the return
> value may be wrong if we rely on the sign of the result. So unless I am
> missing something, the problem is much more serious than just stupid code
> (I just had a look at the libc version in C, and characters are cast to
> unsigned char before the comparison).
The latter is explicitly required by the C standard. I.e., even if your
characters are signed, they are always compared as unsigned by
strcmp/strncmp/memcmp.
Andreas.
--
Andreas Schwab, SuSE Labs, [email protected]
SuSE Linux Products GmbH, Maxfeldstraße 5, 90409 Nürnberg, Germany
PGP key fingerprint = 58CA 54C7 6D53 942B 1756 01D3 44D5 214B 8276 4ED5
"And now for something completely different."
>> Even if it was logically faster (which I still doubt) it's a hell of a lot
>> of cache lines to waste.
Yeah, 1 on 64-bit and 3 on 32-bit, that's a terrible lot.</sarcasm>
> Indeed, but there are some corner cases that the C code handles, like
> a length of 0, which may lead to an infinite loop in the asm code.
>
> OTOH, I'm a bit surprised by the extsb instructions in the
> compiler-generated code. We don't compile with -fsigned-char, do we?
> The clrldi instructions are also extremely stupid.
Those are both necessary to be equivalent to the C code, which uses
signed char explicitly. It is generally considered a Good Thing(tm)
for the compiler to generate assembler code equivalent to the C code,
even if the C code is wrong.
> Now that I think a bit more about it, I believe that the C version is
> incorrect
It is. It's a great entry for the IOCCC as well.
I just tested the following (can't guarantee it's correct, just a PoC):
int strncmp(const char *s1, const char *s2, unsigned long /*size_t*/ len)
{
	while (len--) {
		unsigned char c1, c2;
		c1 = *s1++;
		c2 = *s2++;
		int cmp = c1 - c2;
		if (cmp)
			return cmp;
		if (c1 == 0 || c2 == 0)
			break;
	}
	return 0;
}
which generates (with GCC-4.2.3)
strncmp:
addi 5,5,1
mtctr 5
.L2:
bdz .L11
lbz 0,0(3)
addi 3,3,1
lbz 9,0(4)
addi 4,4,1
cmpwi 7,0,0
subf. 0,9,0
cmpwi 6,9,0
bne- 0,.L4
beq- 7,.L4
bne+ 6,.L2
.L4:
mr 3,0
blr
.L11:
li 0,0
mr 3,0
blr
which isn't horrid, although it does some weirdish things obviously.
Current GCC-4.4.0 generates
strncmp:
addi 5,5,1
mr 10,3
mtctr 5
li 11,0
bdz .L7
.p2align 4,,15
.L4:
lbzx 0,10,11
lbzx 9,4,11
addi 11,11,1
subf. 3,9,0
cmpwi 6,9,0
cmpwi 7,0,0
bnelr 0
beqlr 7
beqlr 6
bdnz .L4
.L7:
li 3,0
blr
which is about as good as it can get (well, it didn't realise you
only need to test one of c1, c2 for zero. Did I say this was just
proof-of-concept code?)
Segher
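As a quick sanity check on the corner cases raised in the thread, here is a
user-space harness (not part of the thread). It runs Segher's
proof-of-concept, renamed strncmp_poc so it does not clash with the C
library, against libc's strncmp; only the sign of the result is compared,
since that is all the standard guarantees.

#include <stdio.h>
#include <string.h>

/* Segher's proof-of-concept, renamed for the comparison below. */
static int strncmp_poc(const char *s1, const char *s2, unsigned long len)
{
	while (len--) {
		unsigned char c1, c2;
		c1 = *s1++;
		c2 = *s2++;
		int cmp = c1 - c2;
		if (cmp)
			return cmp;
		if (c1 == 0 || c2 == 0)
			break;
	}
	return 0;
}

static void check(const char *a, const char *b, unsigned long n)
{
	int want = strncmp(a, b, n);	/* libc reference */
	int got = strncmp_poc(a, b, n);

	/* Compare only the signs: that is all strncmp() guarantees. */
	printf("n=%lu libc=%d poc=%d %s\n", n, want, got,
	       ((want > 0) == (got > 0) && (want < 0) == (got < 0))
	       ? "ok" : "MISMATCH");
}

int main(void)
{
	check("abc", "abc", 3);		/* equal strings */
	check("abc", "abd", 3);		/* differ in the last byte */
	check("abc", "abcd", 10);	/* one string is a prefix */
	check("\xc8", "\x01", 1);	/* high-bit byte: must be positive */
	check("abc", "xyz", 0);		/* zero count: must return 0 */
	return 0;
}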
Benjamin Herrenschmidt writes:
> Do we have any indication that it performs better than the C one ?
I would expect it to, given that the assembler one has two branches in
the per-byte loop compared to three in the C version.
Paul.
>> Do we have any indication that it performs better than the C one ?
>
> I would expect it to, given that the assembler one has two branches in
> the per-byte loop compared to 3 in the C version.
But really, does it matter for strncmp() in the kernel?
Anyway, this asm code has bugs, as do both the current C version in the
kernel, and the code I posted. We need to do better :-)
Segher
Segher Boessenkool writes:
> Anyway, this asm code has bugs, as do both the current C version in the
> kernel, and the code I posted. We need to do better :-)
The only bug I know of in the asm code is the behaviour when the count
is zero. Do you know of any other?
Paul.
>> Anyway, this asm code has bugs, as do both the current C version in
>> the
>> kernel, and the code I posted. We need to do better :-)
>
> The only bug I know of in the asm code is the behaviour when the count
> is zero. Do you know of any other?
No, that's the bug I meant. Sorry for using such inexact language.
Segher