tree: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/core
head: d31c3c683ee668ba5d87c0730610442fd672525f
commit: d31c3c683ee668ba5d87c0730610442fd672525f [1/1] x86/csum: Rewrite/optimize csum_partial()
config: um-x86_64_defconfig (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce (this is a W=1 build):
# https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=d31c3c683ee668ba5d87c0730610442fd672525f
git remote add tip https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git
git fetch --no-tags tip x86/core
git checkout d31c3c683ee668ba5d87c0730610442fd672525f
# save the attached .config to linux build tree
make W=1 ARCH=um SUBARCH=x86_64
If you fix the issue, kindly add the following tag as appropriate:
Reported-by: kernel test robot <[email protected]>
All errors (new ones prefixed by >>):
arch/x86/um/../lib/csum-partial_64.c: In function 'csum_partial':
>> arch/x86/um/../lib/csum-partial_64.c:98:12: error: implicit declaration of function 'load_unaligned_zeropad' [-Werror=implicit-function-declaration]
98 | trail = (load_unaligned_zeropad(buff) << shift) >> shift;
| ^~~~~~~~~~~~~~~~~~~~~~
cc1: some warnings being treated as errors
vim +/load_unaligned_zeropad +98 arch/x86/um/../lib/csum-partial_64.c
23
24 /*
25 * Do a checksum on an arbitrary memory area.
26 * Returns a 32bit checksum.
27 *
28 * This isn't as time critical as it used to be because many NICs
29 * do hardware checksumming these days.
30 *
31 * Still, with CHECKSUM_COMPLETE this is called to compute
32 * checksums on IPv6 headers (40 bytes) and other small parts.
33 * it's best to have buff aligned on a 64-bit boundary
34 */
35 __wsum csum_partial(const void *buff, int len, __wsum sum)
36 {
37 u64 temp64 = (__force u64)sum;
38 unsigned odd, result;
39
40 odd = 1 & (unsigned long) buff;
41 if (unlikely(odd)) {
42 if (unlikely(len == 0))
43 return sum;
44 temp64 += (*(unsigned char *)buff << 8);
45 len--;
46 buff++;
47 }
48
49 while (unlikely(len >= 64)) {
50 asm("addq 0*8(%[src]),%[res]\n\t"
51 "adcq 1*8(%[src]),%[res]\n\t"
52 "adcq 2*8(%[src]),%[res]\n\t"
53 "adcq 3*8(%[src]),%[res]\n\t"
54 "adcq 4*8(%[src]),%[res]\n\t"
55 "adcq 5*8(%[src]),%[res]\n\t"
56 "adcq 6*8(%[src]),%[res]\n\t"
57 "adcq 7*8(%[src]),%[res]\n\t"
58 "adcq $0,%[res]"
59 : [res] "+r" (temp64)
60 : [src] "r" (buff)
61 : "memory");
62 buff += 64;
63 len -= 64;
64 }
65
66 if (len & 32) {
67 asm("addq 0*8(%[src]),%[res]\n\t"
68 "adcq 1*8(%[src]),%[res]\n\t"
69 "adcq 2*8(%[src]),%[res]\n\t"
70 "adcq 3*8(%[src]),%[res]\n\t"
71 "adcq $0,%[res]"
72 : [res] "+r" (temp64)
73 : [src] "r" (buff)
74 : "memory");
75 buff += 32;
76 }
77 if (len & 16) {
78 asm("addq 0*8(%[src]),%[res]\n\t"
79 "adcq 1*8(%[src]),%[res]\n\t"
80 "adcq $0,%[res]"
81 : [res] "+r" (temp64)
82 : [src] "r" (buff)
83 : "memory");
84 buff += 16;
85 }
86 if (len & 8) {
87 asm("addq 0*8(%[src]),%[res]\n\t"
88 "adcq $0,%[res]"
89 : [res] "+r" (temp64)
90 : [src] "r" (buff)
91 : "memory");
92 buff += 8;
93 }
94 if (len & 7) {
95 unsigned int shift = (8 - (len & 7)) * 8;
96 unsigned long trail;
97
> 98 trail = (load_unaligned_zeropad(buff) << shift) >> shift;
99
100 asm("addq %[trail],%[res]\n\t"
101 "adcq $0,%[res]"
102 : [res] "+r" (temp64)
103 : [trail] "r" (trail));
104 }
105 result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
106 if (unlikely(odd)) {
107 result = from32to16(result);
108 result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
109 }
110 return (__force __wsum)result;
111 }
112 EXPORT_SYMBOL(csum_partial);
113
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]
On Wed, Nov 17, 2021 at 10:46 AM kernel test robot <[email protected]> wrote:
>
> [snip - build report quoted above]
Hmmm... it seems we need to guard this with CONFIG_DCACHE_WORD_ACCESS ?
On Wed, Nov 17, 2021 at 10:55 AM Eric Dumazet <[email protected]> wrote:
>
> On Wed, Nov 17, 2021 at 10:46 AM kernel test robot <[email protected]> wrote:
> > [snip - build report]
>
> Hmmm... it seems we need to guard this with CONFIG_DCACHE_WORD_ACCESS ?
Perhaps something like the following ?
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index 5ec35626945b6db2f7f41c6d46d5e422810eac46..d419b9345d6dba2e924887671bc6f11c3e17ebd7
100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -91,12 +91,23 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
: "memory");
buff += 8;
}
- if (len & 7) {
- unsigned int shift = (8 - (len & 7)) * 8;
+ len &= 7;
+ if (len) {
unsigned long trail;
+#ifndef CONFIG_DCACHE_WORD_ACCESS
+ union {
+ unsigned long ulval;
+ u8 bytes[sizeof(long)];
+ } v;
- trail = (load_unaligned_zeropad(buff) << shift) >> shift;
+ v.ulval = 0;
+ memcpy(v.bytes, buff, len);
+ trail = v.ulval;
+#else
+ unsigned int shift = (sizeof(long) - len) * BITS_PER_BYTE;
+ trail = (load_unaligned_zeropad(buff) << shift) >> shift;
+#endif
asm("addq %[trail],%[res]\n\t"
"adcq $0,%[res]"
: [res] "+r" (temp64)
On Wed, Nov 17, 2021 at 11:40:35AM -0800, Eric Dumazet wrote:
> On Wed, Nov 17, 2021 at 10:55 AM Eric Dumazet <[email protected]> wrote:
> > [snip - build report and earlier discussion]
> > Hmmm... it seems we need to guard this with CONFIG_DCACHE_WORD_ACCESS ?
>
> Perhaps something like the following ?
Dear um folks, is this indeed the best solution? It's a bit sad to have
to add this to x86_64, but if that's the way it is...
>
> [snip - Eric's CONFIG_DCACHE_WORD_ACCESS patch, quoted above]
On Thu, 2021-11-18 at 17:00 +0100, Peter Zijlstra wrote:
> On Wed, Nov 17, 2021 at 11:40:35AM -0800, Eric Dumazet wrote:
> > [snip - build report and earlier discussion]
>
> Dear um folks, is this indeed the best solution? It's a bit sad to have
> to add this to x86_64, but if that's the way it is...
>
I guess we can add load_unaligned_zeropad() or even just somehow add the
include with it (asm/word-at-a-time.h) from x86?
johannes
On Thu, Nov 18, 2021 at 8:27 AM Johannes Berg <[email protected]> wrote:
>
> On Thu, 2021-11-18 at 17:00 +0100, Peter Zijlstra wrote:
> > > [snip - build report and earlier discussion]
> > Dear um folks, is this indeed the best solution? It's a bit sad to have
> > to add this to x86_64, but if that's the way it is...
> >
>
> I guess we can add load_unaligned_zeropad() or even just somehow add the
> include with it (asm/word-at-a-time.h) from x86?
Unless fixups can be handled, the signature of the function needs to
be different.
In UM, we would need to provide a number of bytes that can be read.
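A bounded variant along those lines could look roughly like the sketch below
(purely illustrative: the name and signature are made up, this is not an
existing kernel API):

/* Hypothetical sketch: a load_unaligned_zeropad()-like helper for
 * configurations without exception-table fixups.  The caller says how
 * many bytes are actually readable; the remaining bytes read back as
 * zero, matching the zero-pad semantics on little-endian x86. */
static inline unsigned long load_bounded_zeropad(const void *addr, size_t readable)
{
        union {
                unsigned long word;
                unsigned char bytes[sizeof(unsigned long)];
        } u = { .word = 0 };

        memcpy(u.bytes, addr,
               readable < sizeof(unsigned long) ? readable : sizeof(unsigned long));
        return u.word;
}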
On Thu, Nov 18, 2021 at 8:57 AM Eric Dumazet <[email protected]> wrote:
>
> Unless fixups can be handled, the signature of the function needs to
> be different.
>
> In UM, we would need to provide a number of bytes that can be read.
We can make this a bit less ugly of course.
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index 5ec35626945b6db2f7f41c6d46d5e422810eac46..7a3c4e7e05c4b21566e1ee3813a071509a9d54ff
100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -21,6 +21,25 @@ static inline unsigned short from32to16(unsigned a)
return b;
}
+
+static inline unsigned long load_partial_long(const void *buff, int len)
+{
+#ifndef CONFIG_DCACHE_WORD_ACCESS
+ union {
+ unsigned long ulval;
+ u8 bytes[sizeof(long)];
+ } v;
+
+ v.ulval = 0;
+ memcpy(v.bytes, buff, len);
+ return v.ulval;
+#else
+ unsigned int shift = (sizeof(long) - len) * BITS_PER_BYTE;
+
+ return (load_unaligned_zeropad(buff) << shift) >> shift;
+#endif
+}
+
/*
* Do a checksum on an arbitrary memory area.
* Returns a 32bit checksum.
@@ -91,11 +110,9 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
: "memory");
buff += 8;
}
- if (len & 7) {
- unsigned int shift = (8 - (len & 7)) * 8;
- unsigned long trail;
-
- trail = (load_unaligned_zeropad(buff) << shift) >> shift;
+ len &= 7;
+ if (len) {
+ unsigned long trail = load_partial_long(buff, len);
asm("addq %[trail],%[res]\n\t"
"adcq $0,%[res]"
From: Eric Dumazet <[email protected]>
[snip - duplicate of the load_partial_long() message quoted above]
Hi, I'm not sure if this is intentional or not, but I noticed that the output
of 'csum_partial' is different after this patch. I figured the checksum
algorithm is fixed, so I just wanted to mention it in case it's a bug. If not,
sorry for the spam.
Example on x86_64:
Buff: [ 87, b3, 92, b7, 8b, 53, 96, db, cd, 0f, 7e, 7e ]
len : 11
sum : 0
csum_partial new : 2480936615
csum_partial HEAD: 2472089390
On Wed, Nov 24, 2021 at 5:59 PM Noah Goldstein <[email protected]> wrote:
>
>
> Hi, I'm not sure if this is intentional or not, but I noticed that the output
> of 'csum_partial' is different after this patch. I figured the checksum
> algorithm is fixed, so I just wanted to mention it in case it's a bug. If not,
> sorry for the spam.
>
> Example on x86_64:
>
> Buff: [ 87, b3, 92, b7, 8b, 53, 96, db, cd, 0f, 7e, 7e ]
> len : 11
> sum : 0
>
> csum_partial new : 2480936615
> csum_partial HEAD: 2472089390
No worries.
skb->csum is 32bit, but really what matters is the 16bit folded value.
So make sure to apply csum_fold() before comparing the results.
A minimal C and generic version of csum_fold() would be something like
static unsigned short csum_fold(u32 csum)
{
u32 sum = csum;
sum = (sum & 0xffff) + (sum >> 16);
sum = (sum & 0xffff) + (sum >> 16);
return ~sum;
}
I bet that csum_fold(2480936615) == csum_fold(2472089390)
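For what it's worth, a quick standalone check of that bet (userspace sketch,
using the minimal csum_fold() above and the two values from the example):

#include <stdio.h>
#include <stdint.h>

/* Minimal, generic csum_fold(), as sketched above. */
static unsigned short csum_fold(uint32_t csum)
{
        uint32_t sum = csum;

        sum = (sum & 0xffff) + (sum >> 16);
        sum = (sum & 0xffff) + (sum >> 16);
        return (unsigned short)~sum;
}

int main(void)
{
        /* The two 32-bit results from the example above: both fold to 0x5578. */
        printf("%#x %#x\n", csum_fold(2480936615u), csum_fold(2472089390u));
        return 0;
}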
It would be nice if we had a csum test suite, hint, hint ;)
Thanks !
On Wed, Nov 24, 2021 at 8:56 PM Eric Dumazet <[email protected]> wrote:
>
> [snip - earlier discussion and csum_fold() explanation quoted above]
>
> I bet that csum_fold(2480936615) == csum_fold(2472089390)
>
Correct :)
The outputs seem to match if `buff` is 64-bit aligned. I still see a
difference in `csum_fold(csum_partial())` if `buff` is not 64-bit aligned.
The comment at the top says it's "best" to have `buff` 64-bit aligned, but
the code logic seems meant to support the misaligned case, so I'm not
sure whether it's an issue.
Example:
csum_fold(csum_partial) new : 0x3764
csum_fold(csum_partial) HEAD: 0x3a61
buff : [ 11, ea, 75, 76, e9, ab, 86, 48 ]
buff addr : ffff88eaf5fb0001
len : 8
sum_in : 25
> It would be nice if we had a csum test suite, hint, hint ;)
Where in the kernel would that belong?
>
> Thanks !
On Wed, Nov 24, 2021 at 7:41 PM Noah Goldstein <[email protected]> wrote:
>
> > [snip - earlier discussion]
>
> The outputs seem to match if `buff` is 64-bit aligned. I still see a
> difference in `csum_fold(csum_partial())` if `buff` is not 64-bit aligned.
>
> The comment at the top says it's "best" to have `buff` 64-bit aligned, but
> the code logic seems meant to support the misaligned case, so I'm not
> sure whether it's an issue.
>
It is an issue in general, though not in the standard cases, because network
headers are aligned.
I think it crept in when I folded csum_partial() and do_csum(): I forgot
to ror() the seed.
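The identity at play, as a quick userspace sketch: the ones' complement sum is
byte-order symmetric, so byte-swapping every 16-bit addend byte-swaps the
folded result, and starting one byte off effectively puts every byte in the
opposite half of its word. A seed accumulated in the normal phase therefore
has to be byte-rotated before it is mixed into the swapped-phase accumulation:

#include <stdio.h>
#include <stdint.h>

static uint16_t fold(uint64_t acc)
{
        while (acc >> 16)
                acc = (acc & 0xffff) + (acc >> 16);
        return (uint16_t)acc;
}

static uint16_t swab16(uint16_t x)
{
        return (uint16_t)((x >> 8) | (x << 8));
}

int main(void)
{
        uint16_t words[] = { 0x1234, 0xbeef, 0x00aa, 0xffee };
        uint64_t a = 0, b = 0;
        unsigned int i;

        for (i = 0; i < 4; i++) {
                a += words[i];          /* normal phase */
                b += swab16(words[i]);  /* odd-address (swapped) phase */
        }
        /* Byte-swapping every input word byte-swaps the folded sum. */
        printf("%#x %#x\n", fold(a), swab16(fold(b)));
        return 0;
}

Both values printed are equal, which is why rotating the incoming seed keeps
the two phases consistent.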
I suspect the following would help:
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index 1eb8f2d11f7c785be624eba315fe9ca7989fd56d..ee7b0e7a6055bcbef42d22f7e1d8f52ddbd6be6d
100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -41,6 +41,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
if (unlikely(odd)) {
if (unlikely(len == 0))
return sum;
+ temp64 = ror32((__force u64)sum, 8);
temp64 += (*(unsigned char *)buff << 8);
len--;
buff++;
> Example:
>
> csum_fold(csum_partial) new : 0x3764
> csum_fold(csum_partial) HEAD: 0x3a61
>
> buff : [ 11, ea, 75, 76, e9, ab, 86, 48 ]
> buff addr : ffff88eaf5fb0001
> len : 8
> sum_in : 25
>
> > It would be nice if we had a csum test suite, hint, hint ;)
>
> Where in the kernel would that belong?
This could be a module, like lib/test_csum.c
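A rough sketch of what such a module could look like (hypothetical code and
file name: it compares the folded result of csum_partial() against a trivial
byte-at-a-time reference over random lengths and misalignments):

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical lib/test_csum.c sketch. */
#include <linux/module.h>
#include <linux/random.h>
#include <net/checksum.h>

/* Reference: ones' complement sum of little-endian 16-bit words. */
static u32 ref_csum(const u8 *p, int len)
{
        u64 acc = 0;
        int i;

        for (i = 0; i + 1 < len; i += 2)
                acc += p[i] | (p[i + 1] << 8);
        if (len & 1)
                acc += p[len - 1];
        while (acc >> 32)
                acc = (acc & 0xffffffff) + (acc >> 32);
        return (u32)acc;
}

static int __init test_csum_init(void)
{
        u8 buf[256];
        int i;

        for (i = 0; i < 10000; i++) {
                int off = get_random_u32() % 8;
                int len = get_random_u32() % (sizeof(buf) - off);

                get_random_bytes(buf, sizeof(buf));
                if (csum_fold(csum_partial(buf + off, len, 0)) !=
                    csum_fold((__force __wsum)ref_csum(buf + off, len)))
                        pr_err("csum mismatch: off=%d len=%d\n", off, len);
        }
        return 0;
}
module_init(test_csum_init);

static void __exit test_csum_exit(void)
{
}
module_exit(test_csum_exit);

MODULE_LICENSE("GPL");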
On Wed, Nov 24, 2021 at 8:00 PM Eric Dumazet <[email protected]> wrote:
>
> [snip - ror32() seed fix quoted above]
>
It is a bit late here, I will test the following later this week.
We probably can remove one conditional jump at the end of the function
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index 1eb8f2d11f7c785be624eba315fe9ca7989fd56d..15986ad42ed5ccb8241ff467a34188cf901ec98e
100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -41,9 +41,11 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
if (unlikely(odd)) {
if (unlikely(len == 0))
return sum;
+ temp64 = ror32((__force u64)sum, 8);
temp64 += (*(unsigned char *)buff << 8);
len--;
buff++;
+ odd = 8;
}
while (unlikely(len >= 64)) {
@@ -129,10 +131,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
#endif
}
result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
- if (unlikely(odd)) {
- result = from32to16(result);
- result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
- }
+ ror32(result, odd);
return (__force __wsum)result;
}
EXPORT_SYMBOL(csum_partial);
On Wed, Nov 24, 2021 at 8:08 PM Eric Dumazet <[email protected]> wrote:
>
> [snip - earlier discussion]
>
> result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
> - if (unlikely(odd)) {
> - result = from32to16(result);
> - result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
> - }
> + ror32(result, odd);
this would be
result = ror32(result, odd);
definitely time to stop working today for me.
> return (__force __wsum)result;
> }
> EXPORT_SYMBOL(csum_partial);
On Wed, Nov 24, 2021 at 10:20 PM Eric Dumazet <[email protected]> wrote:
> [snip - earlier discussion]
>
> this would be
> result = ror32(result, odd);
All my tests pass with that change :)
On Wed, Nov 24, 2021 at 10:56 PM Noah Goldstein <[email protected]> wrote:
> [snip - earlier discussion]
>
> All my tests pass with that change :)
Although I see slightly worse performance with an aligned `buff` in
the branch-free approach. I imagine that if a non-aligned `buff` is that
uncommon, it might be better to speculate past the work of the `ror`.
On Wed, Nov 24, 2021 at 9:09 PM Noah Goldstein <[email protected]> wrote:
>
>
>
> Although I see slightly worse performance with an aligned `buff` in
> the branch-free approach. I imagine that if a non-aligned `buff` is that
> uncommon, it might be better to speculate past the work of the `ror`.
Yes, there is no clear win here from removing the conditional (same cost really),
although using ror32() removes the from32to16() helper and gets
rid of one folding step.
I will formally submit this change, thanks !
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index 1eb8f2d11f7c785be624eba315fe9ca7989fd56d..cf4bd3ef66e56c681b3435d43011ece78438376d
100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -11,16 +11,6 @@
#include <asm/checksum.h>
#include <asm/word-at-a-time.h>
-static inline unsigned short from32to16(unsigned a)
-{
- unsigned short b = a >> 16;
- asm("addw %w2,%w0\n\t"
- "adcw $0,%w0\n"
- : "=r" (b)
- : "0" (b), "r" (a));
- return b;
-}
-
/*
* Do a checksum on an arbitrary memory area.
* Returns a 32bit checksum.
@@ -41,6 +31,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
if (unlikely(odd)) {
if (unlikely(len == 0))
return sum;
+ temp64 = ror32((__force u32)sum, 8);
temp64 += (*(unsigned char *)buff << 8);
len--;
buff++;
@@ -129,10 +120,8 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
#endif
}
result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
- if (unlikely(odd)) {
- result = from32to16(result);
- result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
- }
+ if (unlikely(odd))
+ result = ror32(result, 8);
return (__force __wsum)result;
}
EXPORT_SYMBOL(csum_partial);
On Wed, Nov 24, 2021 at 10:32 PM Eric Dumazet <[email protected]> wrote:
> - }
> + if (unlikely(odd))
> + result = ror32(result, 8);
> return (__force __wsum)result;
Oh well, gcc at least removes the conditional and generates a ror and a cmov
mov %edx,%eax
ror $0x8,%eax
test %r8,%r8
cmove %edx,%eax
ret
clang keeps the cond jmp
test $0x1,%dil
je 93
rol $0x18,%eax
93: ret
On Thu, Nov 25, 2021 at 12:32 AM Eric Dumazet <[email protected]> wrote:
>
> On Wed, Nov 24, 2021 at 9:09 PM Noah Goldstein <[email protected]> wrote:
> > [snip - earlier discussion]
>
> Yes, there is no clear win here from removing the conditional (same cost really),
> although using ror32() removes the from32to16() helper and gets
> rid of one folding step.
>
> I will formally submit this change, thanks !
Great :)
Can you put me on the cc list? I have a patch for the function that I'll post
once yours gets through.
>
> [snip - final ror32() patch quoted above]
On Thu, Nov 25, 2021 at 12:46 AM Eric Dumazet <[email protected]> wrote:
>
> On Wed, Nov 24, 2021 at 10:32 PM Eric Dumazet <[email protected]> wrote:
>
> > - }
> > + if (unlikely(odd))
> > + result = ror32(result, 8);
> > return (__force __wsum)result;
>
> Oh well, gcc at least removes the conditional and generates a ror and a cmov
Seems like a missed optimization for `unlikely`, where dependency breaking
is pretty common.
Although it still saves a uop because of the `imm8` operand.
>
> mov %edx,%eax
> ror $0x8,%eax
> test %r8,%r8
> cmove %edx,%eax
> ret
>
> clang keeps the cond jmp
> test $0x1,%dil
> je 93
> rol $0x18,%eax
> 93: ret
From: Eric Dumazet
> Sent: 25 November 2021 04:01
...
> > [snip - earlier discussion]
>
>
> diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
> index 1eb8f2d11f7c785be624eba315fe9ca7989fd56d..ee7b0e7a6055bcbef42d22f7e1d8f52ddbd6be6d
> 100644
> --- a/arch/x86/lib/csum-partial_64.c
> +++ b/arch/x86/lib/csum-partial_64.c
> @@ -41,6 +41,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
> if (unlikely(odd)) {
> if (unlikely(len == 0))
> return sum;
> + temp64 = ror32((__force u64)sum, 8);
> temp64 += (*(unsigned char *)buff << 8);
> len--;
> buff++;
You can save an instruction (as if this path matters) by:
temp64 = sum + *(unsigned char *)buff;
temp64 <<= 8;
Although that probably falls foul of 64bit shifts being slow.
So maybe just:
sum += *(unsigned char *)buff;
temp64 = bswap32(sum);
AFAICT (from a pdf) bswap32() and ror(x, 8) are likely to be
the same speed but may use different execution units.
Intel seem to have managed to slow down ror(x, %cl) to 3 clocks
in sandy bridge - and still haven't fixed it.
Although the compiler might be making a pigs-breakfast of the
register allocation when you tried setting 'odd = 8'.
Weeks can be spent fiddling with this code :-(
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
On Fri, Nov 26, 2021 at 9:18 AM David Laight <[email protected]> wrote:
>
> [snip - earlier discussion]
>
> You can save an instruction (as if this path matters) by:
> temp64 = sum + *(unsigned char *)buff;
This might overflow; sum is 32bit.
Given that we have temp64 = (__force u64)sum; already done at this
point, we can instead
temp64 += *(u8 *)buff;
> temp64 <<= 8;
> Although that probably falls foul of 64bit shifts being slow.
Are they slower than the ror32(sum, 8) ?
> So maybe just:
> sum += *(unsigned char *)buff;
we might miss a carry/overflow here
> temp64 = bswap32(sum);
> AFAICT (from a pdf) bswap32() and ror(x, 8) are likely to be
> the same speed but may use different execution units.
>
> Intel seem to have managed to slow down ror(x, %cl) to 3 clocks
> in sandy bridge - and still haven't fixed it.
> Although the compiler might be making a pigs-breakfast of the
> register allocation when you tried setting 'odd = 8'.
>
> Weeks can be spent fiddling with this code :-(
Yes, and in the end, it won't be able to compete with a
specialized/inlined ipv6_csum_partial()
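For instance, a dedicated 40-byte (IPv6 header) helper could collapse to
something like this sketch (illustrative only, in the style of the existing
inline asm; it assumes a 40-byte buffer and leaves the 32->16 fold to the
caller):

/* Hypothetical sketch: ones' complement sum of exactly five 64-bit words,
 * carries folded back in, seeded with 'sum'. */
static inline u64 csum_ipv6hdr_40(const void *buff, u64 sum)
{
        asm("addq 0*8(%[src]),%[res]\n\t"
            "adcq 1*8(%[src]),%[res]\n\t"
            "adcq 2*8(%[src]),%[res]\n\t"
            "adcq 3*8(%[src]),%[res]\n\t"
            "adcq 4*8(%[src]),%[res]\n\t"
            "adcq $0,%[res]"
            : [res] "+r" (sum)
            : [src] "r" (buff)
            : "memory");
        return sum;
}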
From: Eric Dumazet
> Sent: 26 November 2021 18:10
...
> > AFAICT (from a pdf) bswap32() and ror(x, 8) are likely to be
> > the same speed but may use different execution units.
The 64bit shifts/rotates are also only one clock.
It is the bswap64 that can be two.
> > Intel seem to have managed to slow down ror(x, %cl) to 3 clocks
> > in sandy bridge - and still haven't fixed it.
> > Although the compiler might be making a pigs-breakfast of the
> > register allocation when you tried setting 'odd = 8'.
> >
> > Weeks can be spent fiddling with this code :-(
>
> Yes, and in the end, it won't be able to compete with a
> specialized/inlined ipv6_csum_partial()
I bet most of the gain comes from knowing there is a non-zero
whole number of 32bit words.
The pesky edge conditions cost.
And even then you need to get it right!
The one for summing the 5-word IPv4 header is actually horrid
on Intel cpus prior to Haswell because 'adc' has a latency of 2.
On Sandy bridge the carry output is valid on the next clock,
so adding to alternate registers doubles throughput.
(That could easily be done in the current function and will
make a big difference on those cpus.)
But basically the current generic code has the loop unrolled
further than is necessary for modern (non-atom) cpus.
That just adds more code outside the loop.
I did manage to get 12 bytes/clock using adcx/adox with only
32 bytes each iteration.
That will require aligned buffers.
Alignment won't matter for 'adc' loops because there are two
'memory read' units - but there is the elephant:
Sandy bridge Cache bank conflicts
Each consecutive 128 bytes, or two cache lines, in the data cache is divided
into 8 banks of 16 bytes each. It is not possible to do two memory reads in
the same clock cycle if the two memory addresses have the same bank number,
i.e. if bits 4 - 6 in the two addresses are the same.
; Example 9.5. Sandy bridge cache bank conflict
mov eax, [rsi] ; Use bank 0, assuming rsi is divisible by 40H
mov ebx, [rsi+100H] ; Use bank 0. Cache bank conflict
mov ecx, [rsi+110H] ; Use bank 1. No cache bank conflict
That isn't a problem on Haswell, but it is probably worth ordering
the 'adc' in the loop to reduce the number of conflicts.
I didn't try to look for that though.
I only remember testing aligned buffers on Sandy/Ivy bridge.
Adding to alternate registers helped no end.
David
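A minimal sketch of that alternate-accumulator idea (illustrative only: 32
bytes per iteration split across two independent carry chains that an
out-of-order core can overlap, merged at the end; assumes len is a multiple
of 32):

/* Hypothetical sketch: two independent adc chains per iteration. */
static u64 csum_two_chains(const void *buff, int len)
{
        u64 s0 = 0, s1 = 0;

        while (len >= 32) {
                asm("addq 0*8(%[src]),%[r0]\n\t"
                    "adcq 1*8(%[src]),%[r0]\n\t"
                    "adcq $0,%[r0]\n\t"
                    "addq 2*8(%[src]),%[r1]\n\t"   /* addq starts a fresh carry chain */
                    "adcq 3*8(%[src]),%[r1]\n\t"
                    "adcq $0,%[r1]"
                    : [r0] "+r" (s0), [r1] "+r" (s1)
                    : [src] "r" (buff)
                    : "memory");
                buff += 32;
                len -= 32;
        }
        /* Merge the two ones' complement accumulators. */
        asm("addq %[r1],%[r0]\n\t"
            "adcq $0,%[r0]"
            : [r0] "+r" (s0)
            : [r1] "r" (s1));
        return s0;
}

On cores where adc has a 2-cycle latency the two chains can overlap; on
Haswell and later there is much less to gain.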
On Fri, Nov 26, 2021 at 4:41 PM David Laight <[email protected]> wrote:
>
> [snip - earlier discussion]
>
> That isn't a problem on Haswell, but it is probably worth ordering
> the 'adc' in the loop to reduce the number of conflicts.
> I didn't try to look for that though.
> I only remember testing aligned buffers on Sandy/Ivy bridge.
> Adding to alternate registers helped no end.
Can't that just be solved by having the two independent adcx/adox chains work
from regions that are 16+ bytes apart? For the 40-byte ipv6 header it would be simple.
From: Noah Goldstein
> Sent: 26 November 2021 23:04
>
> > [snip - earlier discussion]
>
> Can't that just be solved by having the two independent adcx/adox chains work
> from regions that are 16+ bytes apart? For the 40-byte ipv6 header it would be simple.
Not relevant; adcx/adox are only supported from haswell/broadwell onwards,
which don't have the 'cache bank conflict' issue.
In any case using adcx/adox for only 40 bytes isn't worth the effort.
The other issue with adcx/adox is that some cpus that support them
have very slow decode times - so unless you've got them in a loop
it will be horrid.
Trying to 'loop carry' both the 'carry' and 'overflow' flags is also
fraught. The 'loop' instruction would do it - but that is horribly
slow on Intel cpus (I think it is ok on AMD ones).
You can use jcxz at the top of the loop and an unconditional jump at the bottom.
There might be an obscure method of doing a 64bit->32bit move into %ecx
and then a jrcxz at the loop bottom!
For Ivy/Sandy bridge it is noted:
There is hardly any penalty for misaligned memory access with operand sizes
of 64 bits or less, except for the effect of using multiple cache banks.
That might mean that you can do a misaligned read every clock,
with the only issue arising when trying to do 2 reads/clock.
Given the checksum code needs to do 'adc', the carry flag constrains
you to 1 read/clock - so there may actually be no real penalty for
a misaligned buffer at all.
No one (except me) has actually noticed that the adc chain takes two
clocks per adc on sandy bridge, so if the misaligned memory reads
take two clocks it makes no difference.
(info from pdf's from http://www.agner.org/optimize)
I've not got the test systems and program I used back in May 2020
to hand any more.
I certainly found that efficiently handling the 'odd 7 bytes'
was actually more difficult than it might seem.
David
On Thu, Nov 18, 2021 at 05:00:58PM +0100, Peter Zijlstra wrote:
> Dear um folks, is this indeed the best solution? It's a bit sad to have
> to add this to x86_64, but if that's the way it is...
Something like this, perhaps? (absolutely untested)
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index e5a7b552bb384..a58b67ec8119d 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -23,7 +23,6 @@ generic-y += softirq_stack.h
generic-y += switch_to.h
generic-y += topology.h
generic-y += trace_clock.h
-generic-y += word-at-a-time.h
generic-y += kprobes.h
generic-y += mm_hooks.h
generic-y += vga.h
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index 95d26a69088b1..a812cc35092c6 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -9,6 +9,7 @@ endmenu
config UML_X86
def_bool y
select GENERIC_FIND_FIRST_BIT
+ select DCACHE_WORD_ACCESS
config 64BIT
bool "64-bit kernel" if "$(SUBARCH)" = "x86"
On Wed, Dec 29, 2021 at 06:00:56AM +0000, Al Viro wrote:
> On Thu, Nov 18, 2021 at 05:00:58PM +0100, Peter Zijlstra wrote:
>
> > Dear um folks, is this indeed the best solution? It's a bit sad to have
> > to add this to x86_64, but if that's the way it is...
>
> Something like this, perhaps? (absolutely untested)
[snip]
AFAICS, this (on top of current mainline) does the right thing.
commit 6692531df62d812de5d22c8bca0d90edc163aa84
Author: Al Viro <[email protected]>
Date: Sun Jan 30 21:25:53 2022 -0500
uml/x86: use x86 load_unaligned_zeropad()
allows, among other things, to drop !DCACHE_WORD_ACCESS mess in
x86 csum-partial_64.c
Signed-off-by: Al Viro <[email protected]>
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index e5a7b552bb384..a58b67ec8119d 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -23,7 +23,6 @@ generic-y += softirq_stack.h
generic-y += switch_to.h
generic-y += topology.h
generic-y += trace_clock.h
-generic-y += word-at-a-time.h
generic-y += kprobes.h
generic-y += mm_hooks.h
generic-y += vga.h
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index 1f8a8f8951738..50734a23034c4 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -93,7 +93,6 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
buff += 8;
}
if (len & 7) {
-#ifdef CONFIG_DCACHE_WORD_ACCESS
unsigned int shift = (8 - (len & 7)) * 8;
unsigned long trail;
@@ -103,31 +102,6 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
"adcq $0,%[res]"
: [res] "+r" (temp64)
: [trail] "r" (trail));
-#else
- if (len & 4) {
- asm("addq %[val],%[res]\n\t"
- "adcq $0,%[res]"
- : [res] "+r" (temp64)
- : [val] "r" ((u64)*(u32 *)buff)
- : "memory");
- buff += 4;
- }
- if (len & 2) {
- asm("addq %[val],%[res]\n\t"
- "adcq $0,%[res]"
- : [res] "+r" (temp64)
- : [val] "r" ((u64)*(u16 *)buff)
- : "memory");
- buff += 2;
- }
- if (len & 1) {
- asm("addq %[val],%[res]\n\t"
- "adcq $0,%[res]"
- : [res] "+r" (temp64)
- : [val] "r" ((u64)*(u8 *)buff)
- : "memory");
- }
-#endif
}
result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
if (unlikely(odd)) {
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index 40d6a06e41c81..4eb47d3ba6250 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -8,6 +8,7 @@ endmenu
config UML_X86
def_bool y
+ select DCACHE_WORD_ACCESS
config 64BIT
bool "64-bit kernel" if "$(SUBARCH)" = "x86"