This patchset provides two small optimisations of the IP checksum functions on powerpc.
Christophe Leroy (2):
powerpc: put csum_tcpudp_magic inline
powerpc: add support for csum_add()
arch/powerpc/include/asm/checksum.h | 37 ++++++++++++++++++++++++++++---------
arch/powerpc/lib/checksum_32.S | 16 ----------------
arch/powerpc/lib/checksum_64.S | 21 ---------------------
3 files changed, 28 insertions(+), 46 deletions(-)
--
2.1.0
csum_tcpudp_magic() is only a few instructions long and modifies very
few registers, so it is not worth keeping it as a separate function
and paying for the function call and the saving of volatile
registers.

This patch makes it inline, implemented on top of the existing
csum_tcpudp_nofold() function.
Signed-off-by: Christophe Leroy <[email protected]>
---
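(Note for reviewers, not part of the commit message: csum_fold() is
all that csum_tcpudp_magic() adds on top of csum_tcpudp_nofold().
Written out in plain C it does roughly the following; this is an
illustrative sketch only, not the kernel's actual implementation.)

/* Sketch: fold a 32-bit partial checksum to 16 bits and complement. */
static inline __sum16 csum_fold_sketch(__wsum sum)
{
	u32 s = (__force u32)sum;

	s = (s & 0xffff) + (s >> 16);	/* fold high half into low half */
	s = (s & 0xffff) + (s >> 16);	/* absorb the carry of the fold */
	return (__force __sum16)~s;	/* ones' complement the result */
}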
arch/powerpc/include/asm/checksum.h | 21 ++++++++++++---------
arch/powerpc/lib/checksum_32.S | 16 ----------------
arch/powerpc/lib/checksum_64.S | 21 ---------------------
3 files changed, 12 insertions(+), 46 deletions(-)
diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h
index 8251a3b..5e43d2d 100644
--- a/arch/powerpc/include/asm/checksum.h
+++ b/arch/powerpc/include/asm/checksum.h
@@ -20,15 +20,6 @@
extern __sum16 ip_fast_csum(const void *iph, unsigned int ihl);
/*
- * computes the checksum of the TCP/UDP pseudo-header
- * returns a 16-bit checksum, already complemented
- */
-extern __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
- unsigned short len,
- unsigned short proto,
- __wsum sum);
-
-/*
* computes the checksum of a memory block at buff, length len,
* and adds in "sum" (32-bit)
*
@@ -127,6 +118,18 @@ static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
#endif
}
+/*
+ * computes the checksum of the TCP/UDP pseudo-header
+ * returns a 16-bit checksum, already complemented
+ */
+static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
+ unsigned short len,
+ unsigned short proto,
+ __wsum sum)
+{
+ return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
+}
+
#endif
#endif /* __KERNEL__ */
#endif
diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
index e23a436..d6fab08 100644
--- a/arch/powerpc/lib/checksum_32.S
+++ b/arch/powerpc/lib/checksum_32.S
@@ -41,22 +41,6 @@ _GLOBAL(ip_fast_csum)
blr
/*
- * Compute checksum of TCP or UDP pseudo-header:
- * csum_tcpudp_magic(saddr, daddr, len, proto, sum)
- */
-_GLOBAL(csum_tcpudp_magic)
- rlwimi r5,r6,16,0,15 /* put proto in upper half of len */
- addc r0,r3,r4 /* add 4 32-bit words together */
- adde r0,r0,r5
- adde r0,r0,r7
- addze r0,r0 /* add in final carry */
- rlwinm r3,r0,16,0,31 /* fold two halves together */
- add r3,r0,r3
- not r3,r3
- srwi r3,r3,16
- blr
-
-/*
* computes the checksum of a memory block at buff, length len,
* and adds in "sum" (32-bit)
*
diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S
index 57a0720..f3ef354 100644
--- a/arch/powerpc/lib/checksum_64.S
+++ b/arch/powerpc/lib/checksum_64.S
@@ -45,27 +45,6 @@ _GLOBAL(ip_fast_csum)
blr
/*
- * Compute checksum of TCP or UDP pseudo-header:
- * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
- * No real gain trying to do this specially for 64 bit, but
- * the 32 bit addition may spill into the upper bits of
- * the doubleword so we still must fold it down from 64.
- */
-_GLOBAL(csum_tcpudp_magic)
- rlwimi r5,r6,16,0,15 /* put proto in upper half of len */
- addc r0,r3,r4 /* add 4 32-bit words together */
- adde r0,r0,r5
- adde r0,r0,r7
- rldicl r4,r0,32,0 /* fold 64 bit value */
- add r0,r4,r0
- srdi r0,r0,32
- rlwinm r3,r0,16,0,31 /* fold two halves together */
- add r3,r0,r3
- not r3,r3
- srwi r3,r3,16
- blr
-
-/*
* Computes the checksum of a memory block at buff, length len,
* and adds in "sum" (32-bit).
*
--
2.1.0
The generic C version of csum_add(), as defined in
include/net/checksum.h, compiles to the following assembly on ppc32:
0: 7c 04 1a 14 add r0,r4,r3
4: 7c 64 00 10 subfc r3,r4,r0
8: 7c 63 19 10 subfe r3,r3,r3
c: 7c 63 00 50 subf r3,r3,r0
and to the following on ppc64:
0xc000000000001af8 <+0>: add r3,r3,r4
0xc000000000001afc <+4>: cmplw cr7,r3,r4
0xc000000000001b00 <+8>: mfcr r4
0xc000000000001b04 <+12>: rlwinm r4,r4,29,31,31
0xc000000000001b08 <+16>: add r3,r4,r3
0xc000000000001b0c <+20>: clrldi r3,r3,32
0xc000000000001b10 <+24>: blr
include/net/checksum.h also allows an architecture to provide its own
implementation.

This patch provides an arch-specific inline csum_add() for powerpc.
Signed-off-by: Christophe Leroy <[email protected]>
---
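(Below the fold, for reference: the generic helper in
include/net/checksum.h that produces the assembly quoted above reads
roughly as follows; quoted from memory, so check the tree for the
exact current form.)

static inline __wsum csum_add(__wsum csum, __wsum addend)
{
	u32 res = (__force u32)csum;

	res += (__force u32)addend;
	/* (res < addend) is 1 iff the 32-bit addition wrapped */
	return (__force __wsum)(res + (res < (__force u32)addend));
}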
arch/powerpc/include/asm/checksum.h | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h
index 5e43d2d..e8d9ef4 100644
--- a/arch/powerpc/include/asm/checksum.h
+++ b/arch/powerpc/include/asm/checksum.h
@@ -130,6 +130,22 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
}
+#define HAVE_ARCH_CSUM_ADD
+static inline __wsum csum_add(__wsum csum, __wsum addend)
+{
+#ifdef __powerpc64__
+ u64 res = (__force u64)csum;
+
+ res += (__force u64)addend;
+ return (__force __wsum)((u32)res + (res >> 32));
+#else
+ asm("addc %0,%0,%1;"
+ "addze %0,%0;"
+ : "+r" (csum) : "r" (addend));
+ return csum;
+#endif
+}
+
#endif
#endif /* __KERNEL__ */
#endif
--
2.1.0
From: Linuxppc-dev Christophe Leroy
> Sent: 19 May 2015 16:19
...
> diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h
> index 5e43d2d..e8d9ef4 100644
> --- a/arch/powerpc/include/asm/checksum.h
> +++ b/arch/powerpc/include/asm/checksum.h
> @@ -130,6 +130,22 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
> return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
> }
>
> +#define HAVE_ARCH_CSUM_ADD
> +static inline __wsum csum_add(__wsum csum, __wsum addend)
> +{
> +#ifdef __powerpc64__
> + u64 res = (__force u64)csum;
> +
> + res += (__force u64)addend;
> + return (__force __wsum)((u32)res + (res >> 32));
> +#else
> + asm("addc %0,%0,%1;"
> + "addze %0,%0;"
> + : "+r" (csum) : "r" (addend));
> + return csum;
> +#endif
I'd have thought it better to test for the cpu type where you want the
'asm' variant, and then fall back on the C version for all others.
I know (well suspect) there are only two cases here.
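Something like the following untested sketch; the CONFIG_PPC32 gate is
purely illustrative, pick whatever cpu test actually matches the parts
that benefit:

#ifdef CONFIG_PPC32			/* illustrative gate only */
#define HAVE_ARCH_CSUM_ADD
static inline __wsum csum_add(__wsum csum, __wsum addend)
{
	asm("addc %0,%0,%1;"
	    "addze %0,%0;"
	    : "+r" (csum) : "r" (addend));
	return csum;
}
#endif	/* otherwise the generic C csum_add() is used */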
I'd also have thought that the 64bit C version above would be generally 'good'.
David
On Fri, 2015-05-22 at 15:57 +0000, David Laight wrote:
> From: Linuxppc-dev Christophe Leroy
> > Sent: 19 May 2015 16:19
> ...
> > diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h
> > index 5e43d2d..e8d9ef4 100644
> > --- a/arch/powerpc/include/asm/checksum.h
> > +++ b/arch/powerpc/include/asm/checksum.h
> > @@ -130,6 +130,22 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
> > return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
> > }
> >
> > +#define HAVE_ARCH_CSUM_ADD
> > +static inline __wsum csum_add(__wsum csum, __wsum addend)
> > +{
> > +#ifdef __powerpc64__
> > + u64 res = (__force u64)csum;
> > +
> > + res += (__force u64)addend;
> > + return (__force __wsum)((u32)res + (res >> 32));
> > +#else
> > + asm("addc %0,%0,%1;"
> > + "addze %0,%0;"
> > + : "+r" (csum) : "r" (addend));
> > + return csum;
> > +#endif
>
> I'd have thought it better to test for the cpu type where you want the
> 'asm' variant, and then fall back on the C version for all others.
> I know (well suspect) there are only two cases here.
Usually it's more readable to see "if (x) ... else ..." than
"if (!x) ... else ...", and 64-bit is what has a symbol defined.
> I'd also have thought that the 64bit C version above would be generally 'good'.
It doesn't generate the addc/addze sequence. At least with GCC 4.8.2,
it does something like:
mr tmp0, csum
li tmp1, 0
li tmp2, 0
addc tmp3, addend, tmp0
adde csum, tmp2, tmp1
add csum, csum, tmp3
-Scott
On Fri, May 22, 2015 at 02:32:42PM -0500, Scott Wood wrote:
> > I'd also have thought that the 64bit C version above would be generally 'good'.
>
> It doesn't generate the addc/addze sequence. At least with GCC 4.8.2,
> it does something like:
>
> mr tmp0, csum
> li tmp1, 0
> li tmp2, 0
> addc tmp3, addend, tmp0
> adde csum, tmp2, tmp1
> add csum, csum, tmp3
Right. Don't expect older compilers to do sane things here.
All this begs a question... If it is worth spending so much time
micro-optimising this, why not pick the low-hanging fruit first?
Having a 32-bit accumulator for ones' complement sums, on a 64-bit
system, is not such a great idea.
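E.g. something like this (illustrative only, not proposed code):
carries accumulate in the upper word, so you fold once at the end
instead of propagating a carry on every addition.

static u32 csum_words64(const u32 *p, int n)
{
	u64 acc = 0;

	while (n--)
		acc += *p++;			/* carries pile up in bits 32-63 */
	acc = (acc & 0xffffffffUL) + (acc >> 32);	/* fold upper word */
	acc = (acc & 0xffffffffUL) + (acc >> 32);	/* absorb fold carry */
	return acc;
}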
Segher
On Fri, 2015-05-22 at 16:39 -0500, Segher Boessenkool wrote:
> On Fri, May 22, 2015 at 02:32:42PM -0500, Scott Wood wrote:
> > > I'd also have thought that the 64bit C version above would be generally 'good'.
> >
> > It doesn't generate the addc/addze sequence. At least with GCC 4.8.2,
> > it does something like:
> >
> > mr tmp0, csum
> > li tmp1, 0
> > li tmp2, 0
> > addc tmp3, addend, tmp0
> > adde csum, tmp2, tmp1
> > add csum, csum, tmp3
>
> Right. Don't expect older compilers to do sane things here.
>
> All this begs a question... If it is worth spending so much time
> micro-optimising this, why not pick the low-hanging fruit first?
> Having a 32-bit accumulator for ones' complement sums, on a 64-bit
> system, is not such a great idea.
That would be a more intrusive change -- not (comparatively) low-hanging
fruit. Plus, the person submitting these patches is focused on 32-bit.
-Scott
From: Scott Wood ...
> > I'd also have thought that the 64bit C version above would be generally 'good'.
>
> It doesn't generate the addc/addze sequence. At least with GCC 4.8.2,
> it does something like:
>
> mr tmp0, csum
> li tmp1, 0
> li tmp2, 0
> addc tmp3, addend, tmp0
> adde csum, tmp2, tmp1
> add csum, csum, tmp3
I was thinking of all 64bit targets, not 32bit ones.
David
On Tue, 2015-05-26 at 13:57 +0000, David Laight wrote:
> From: Scott Wood ...
> > > I'd also have thought that the 64bit C version above would be generally 'good'.
> >
> > It doesn't generate the addc/addze sequence. At least with GCC 4.8.2,
> > it does something like:
> >
> > mr tmp0, csum
> > li tmp1, 0
> > li tmp2, 0
> > addc tmp3, addend, tmp0
> > adde csum, tmp2, tmp1
> > add csum, csum, tmp3
>
> I was thinking of all 64bit targets, not 32bit ones.
Oh, you mean move it out of arch/powerpc? Sounds reasonable, but
someone should probably check what the resulting code looks like on
other common arches. OTOH, if we're going to modify non-arch code,
that might be a good opportunity to implement Segher's suggestion and
move to a 64-bit accumulator.
-Scott