TCP Segmentation Offloading (TSO) is enabled[1] in 2.5.33, along with a
TSO-enabled e1000 driver. Other capable devices can be enabled the same way
as e1000; the driver interface (NETIF_F_TSO) is very simple.
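As an illustration only (this is not the actual e1000 patch; the mydev_*
names are invented, and treating skb_shinfo(skb)->tso_size as the per-skb
MSS is an assumption about the 2.5-era API), a capable driver might hook in
roughly like this:

/* Illustrative sketch only -- not the real e1000 code.  A TSO-capable
 * driver advertises NETIF_F_TSO next to the usual scatter-gather and
 * checksum flags, and its transmit routine reads the per-skb MSS
 * (skb_shinfo(skb)->tso_size in this kernel) to set up the hardware's
 * segmentation context. */
#include <linux/netdevice.h>
#include <linux/skbuff.h>

static void mydev_set_features(struct net_device *dev)
{
	dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM;
	dev->features |= NETIF_F_TSO;	/* accept 64KB TCP "super-packets" */
}

static int mydev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	unsigned int mss = skb_shinfo(skb)->tso_size;

	if (mss) {
		/* Hand the whole skb plus the MSS to the hardware; it replays
		 * the IP/TCP header template once per MSS-sized segment. */
	}
	/* ... normal descriptor setup ... */
	return 0;
}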
So, fire up your favorite networking performance tool and compare the
performance of 2.5.32 and 2.5.33 using e1000. I ran a quick test
on a dual P4 workstation system using the commercial tool Chariot:
Tx/Rx TCP file send long (bi-directional Rx/Tx)
w/o TSO: 1500Mbps, 82% CPU
w/ TSO: 1633Mbps, 75% CPU
Tx TCP file send long (Tx only)
w/o TSO: 940Mbps, 40% CPU
w/ TSO: 940Mbps, 19% CPU
A good bump in throughput for the bi-directional test. The Tx-only test was
already at wire speed, so the gains are pure CPU savings.
I'd like to see SPECWeb results w/ and w/o TSO, and any other relevant
testing. UDP fragmentation is not offloaded, so keep testing to TCP.
-scott
[1] Kudos to Alexey Kuznetsov for enabling the stack with TSO support, to
Chris Leech for providing the e1000 bits and a prototype stack, and to David
Miller for consultation.
Hello!
> [1] Kudos to
Hmm... wait a while before celebrating; the implementation in TCP is still
at the level of a toy. Well, and it happens to crash; the patch is enclosed.
Alexey
--- linux/net/ipv4/tcp_output.c.orig Sat Aug 31 17:43:36 2002
+++ linux/net/ipv4/tcp_output.c Mon Sep 2 22:48:16 2002
@@ -477,6 +477,56 @@
return 0;
}
+/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
+ * eventually). The difference is that the pulled data is not copied, but
+ * immediately discarded.
+ */
+unsigned char * __pskb_trim_head(struct sk_buff *skb, int len)
+{
+ int i, k, eat;
+
+ eat = len;
+ k = 0;
+ for (i=0; i<skb_shinfo(skb)->nr_frags; i++) {
+ if (skb_shinfo(skb)->frags[i].size <= eat) {
+ put_page(skb_shinfo(skb)->frags[i].page);
+ eat -= skb_shinfo(skb)->frags[i].size;
+ } else {
+ skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
+ if (eat) {
+ skb_shinfo(skb)->frags[k].page_offset += eat;
+ skb_shinfo(skb)->frags[k].size -= eat;
+ eat = 0;
+ }
+ k++;
+ }
+ }
+ skb_shinfo(skb)->nr_frags = k;
+
+ skb->tail = skb->data;
+ skb->data_len -= len;
+ skb->len = skb->data_len;
+ return skb->tail;
+}
+
+static int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
+{
+ if (skb_cloned(skb) &&
+ pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ return -ENOMEM;
+
+ if (len <= skb_headlen(skb)) {
+ __skb_pull(skb, len);
+ } else {
+ if (__pskb_trim_head(skb, len-skb_headlen(skb)) == NULL)
+ return -ENOMEM;
+ }
+
+ TCP_SKB_CB(skb)->seq += len;
+ skb->ip_summed = CHECKSUM_HW;
+ return 0;
+}
+
/* This function synchronize snd mss to current pmtu/exthdr set.
tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
@@ -836,8 +886,6 @@
return -EAGAIN;
if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
- struct sk_buff *skb2;
-
if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
BUG();
@@ -847,13 +895,8 @@
tp->mss_cache = tp->mss_cache_std;
}
- if(tcp_fragment(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
+ if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
return -ENOMEM;
-
- skb2 = skb->next;
- __skb_unlink(skb, skb->list);
- tcp_free_skb(sk, skb);
- skb = skb2;
}
/* If receiver has shrunk his window, and skb is out of
Feldman, Scott wrote:
> TCP Segmentation Offloading (TSO) is enabled[1] in 2.5.33, along with an
> enabled e1000 driver. Other capable devices can be enabled ala e1000; the
> driver interface (NETIF_F_TSO) is very simple.
>
> So, fire up you favorite networking performance tool and compare the
> performance gains between 2.5.32 and 2.5.33 using e1000. I ran a quick test
> on a dual P4 workstation system using the commercial tool Chariot:
>
> Tx/Rx TCP file send long (bi-directional Rx/Tx)
> w/o TSO: 1500Mbps, 82% CPU
> w/ TSO: 1633Mbps, 75% CPU
>
> Tx TCP file send long (Tx only)
> w/o TSO: 940Mbps, 40% CPU
> w/ TSO: 940Mbps, 19% CPU
>
> A good bump in throughput for the bi-directional test. The Tx-only test was
> already at wire speed, so the gains are pure CPU savings.
>
> I'd like to see SPECWeb results w/ and w/o TSO, and any other relevant
> testing. UDP framentation is not offloaded, so keep testing to TCP.
Are there docs or other drivers about?
The 8139C+ chip can do TSO, so I would like to implement support.
Jeff
I would like to praise Intel for working so closely with us on
this. They gave us immediately, in one email, all the information we
needed to implement and test e1000 support for TSO under Linux.
With some other companies, doing this is like pulling teeth.
One question regarding the throughput numbers:
what was the size of the packets built at the TCP layer (mss)?
I assume the MTU is Ethernet's 1500 bytes, right? And the mss should be
something much bigger than the MTU, which is what gives the performance
improvement shown in the numbers.
thanks,
jordi
From: "Jordi Ros" <[email protected]>
Date: Mon, 2 Sep 2002 21:58:32 -0700
i assume the mtu is ethernet 1500 Bytes, right? and that mss should be
something much bigger than mtu, which gives the performance improvement
shown in the numbers.
The performance improvement comes from the fact that the card
is given huge 64K packets, then the card (using the given ip/tcp
headers as a template) spits out 1500 byte mtu sized packets.
Less data DMA'd to the device per normal-mtu packet and less
per-packet data structure work by the cpu is where the improvement
comes from.
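To put rough numbers on that (illustrative arithmetic only, not from the
original posting; it assumes a 1460-byte MSS and 54 bytes of Ethernet+IP+TCP
header per frame):

/* Illustrative arithmetic only, not kernel code: for one 64KB TSO
 * "super-packet", how many MTU-sized frames the NIC emits and how many
 * header bytes the host no longer builds and DMAs across the bus.
 * Assumes a 1460-byte MSS (1500 MTU, no TCP options) and a
 * 14+20+20 = 54 byte Ethernet+IP+TCP header per frame. */
#include <stdio.h>

int main(void)
{
	const int payload = 64 * 1024;
	const int mss     = 1460;
	const int hdr     = 14 + 20 + 20;
	const int frames  = (payload + mss - 1) / mss;

	printf("frames on the wire per 64KB:    %d\n", frames);
	printf("header bytes DMA'd without TSO: %d\n", frames * hdr);
	printf("header bytes DMA'd with TSO:    %d (one template)\n", hdr);
	printf("skbs queued by the host:        %d vs 1\n", frames);
	return 0;
}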
What I am wondering is why we only get a few percent improvement in
throughput. Theoretically, since 64KB/1.5KB ~= 40, we should get a
throughput improvement of 40 times. That would be the case for UDP
transmitting in one direction. In the case of TCP transmitting in one
direction (which is the one you have implemented), since on average we have
(at most) 1 ACK every 2 data packets, we should theoretically obtain a
throughput improvement of (40+20)/(1+20) ~= 3 (this comes from: without TSO
we send 40 packets and receive 20 ACKs, that is, the CPU processes 60
packets; whereas with TSO we send 1 packet and receive 20 ACKs, that is,
the CPU processes 21 packets).
However, the numbers obtained show neither a 300% increase in throughput
nor a decrease in CPU utilization of that magnitude. Is there any other
bottleneck in the system that prevents us from seeing the 300% improvement?
(I am assuming the card can do TSO at wire speed.)
thank you,
jordi
From: "Jordi Ros" <[email protected]>
Date: Tue, 3 Sep 2002 00:26:13 -0700
What i am wondering is how come we only get a few percentage
improvement in throughput
Because he's maxing out the physical medium already.
All the headers for each 1500-byte packet still have to hit the
physical wire; that isn't what is being eliminated. It's just
what is going over the PCI bus to the card that is being made
smaller.
Hello!
kuznet> > [1] Kudos to
kuznet>
kuznet> Hmm... wait awhile with celebrating, the implementation in tcp is still
kuznet> at level of a toy. Well, and it happens to crash, the patch is enclosed.
I guess it may also depend on bad implementations of csum_partial().
It's wrong that some architectures assume all data in an skbuff is
aligned on a 2-byte boundary, such that they may access the byte after
the last byte, where no page might be mapped.
And remember that the sendfile system call can also pass pages with any
offset and any length, while csum_partial() may be called from anywhere
in the kernel, including device drivers.
It's time to fix csum_partial().
P.S.
Using "bswap" is little bit tricky.
Regards,
Hirokazu Takahashi
VP Engineering Dept.
VA Linux Systems Japan
--- linux/arch/i386/lib/checksum.S.BUG Sun Sep 1 17:00:59 2030
+++ linux/arch/i386/lib/checksum.S Mon Sep 2 13:09:09 2030
@@ -126,8 +126,8 @@ csum_partial:
movl 16(%esp),%ecx # Function arg: int len
movl 12(%esp),%esi # Function arg: const unsigned char *buf
- testl $2, %esi
- jnz 30f
+ testl $3, %esi
+ jnz 25f
10:
movl %ecx, %edx
movl %ecx, %ebx
@@ -145,6 +145,20 @@ csum_partial:
lea 2(%esi), %esi
adcl $0, %eax
jmp 10b
+25:
+ testl $1, %esi
+ jz 30f
+ # buf is odd
+ dec %ecx
+ jl 90f
+ bswap %eax
+ movzbl (%esi), %ebx
+ shll $8, %ebx
+ addl %ebx, %eax
+ adcl $0, %eax
+ inc %esi
+ testl $2, %esi
+ jz 10b
30: subl $2, %ecx
ja 20b
@@ -211,6 +225,10 @@ csum_partial:
addl %ebx,%eax
adcl $0,%eax
80:
+ testl $1, 12(%esp)
+ jz 90f
+ bswap %eax
+90:
popl %ebx
popl %esi
ret
From: Hirokazu Takahashi <[email protected]>
Date: Tue, 03 Sep 2002 16:42:43 +0900 (JST)
I guess it may also depend on bad implementations of csum_partial().
It's wrong that some architecture assume every data in a skbuff are
aligned on a 2byte boundary so that it would access a byte next to
the the last byte where no pages might be there.
It is a real requirement; x86 works because unaligned
access is handled transparently by the CPU.
But in every non-x86 csum_partial I have examined, a data start with worse
than 2-byte alignment is simply not handled. It is not difficult
to figure out why this is the case: everyone has copied by example. :-)
So we must make a decision: either make every csum_partial
implementation eat 1-byte alignment or enforce 2-byte
alignment at the call sites.
I think for 2.4.x it is unreasonable to force everyone to change their
csum_partial, especially since handling a byte-aligned buffer requires
holding onto state across the whole checksum, from the beginning to the
last fold. Many RISC implementations use registers to the max
and may not easily be able to obtain a temporary.
I dealt with a bug in this area recently: pppoe can cause ppp_input to
send byte-aligned data in packets to TCP input, which crashes just
about every non-x86 system, so my "fix" was to copy byte-aligned SKBs
in ppp_input.
"David S. Miller" <[email protected]> writes:
> From: Hirokazu Takahashi <[email protected]>
> Date: Tue, 03 Sep 2002 16:42:43 +0900 (JST)
>
> I guess it may also depend on bad implementations of csum_partial().
> It's wrong that some architecture assume every data in a skbuff are
> aligned on a 2byte boundary so that it would access a byte next to
> the the last byte where no pages might be there.
>
> It is real requirement, x86 works because unaligned
> access is handled transparently by cpu.
>
> But on every non-x86 csum_partial I have examined, worse than
> 2-byte aligned data start is totally not handled. It is not difficult
> to figure out why this is the case, everyone has copied by example. :-)
x86-64 handles it (also in csum-copy). I think at least Alpha does it
too (that is where I stole the C csum-partial base from), but it's ugly.
See the odd hack.
-Andi
From: Andi Kleen <[email protected]>
Date: 03 Sep 2002 11:05:30 +0200
x86-64 handles it (also in csum-copy). I think at least Alpha does it
too (that is where I stole the C csum-partial base from) But it's ugly.
See the odd hack.
OK, I think we really need to fix this then in the arches
where it is broken. Let's do an audit. :-)
I question whether x86 is broken at all. It checks odd lengths,
and x86 handles odd memory accesses transparently. Please,
some x86 guru, make some comments here :-)
It looks like sparc64 is the only platform where an oddly aligned buffer
can truly cause problems, and I can fix that easily enough.
On Tue, Sep 03, 2002 at 03:00:25AM -0700, David S. Miller wrote:
> From: Andi Kleen <[email protected]>
> Date: 03 Sep 2002 11:05:30 +0200
>
> x86-64 handles it (also in csum-copy). I think at least Alpha does it
> too (that is where I stole the C csum-partial base from) But it's ugly.
> See the odd hack.
>
> Ok I think we really need to fix this then in the arches
> where broken. Let's do an audit. :-)
Yes, it's needed because users can pass unaligned addresses in from
userspace to sendmsg.
>
> I question if x86 is broken at all. It checks odd lengths
> and x86 handles odd memory accesses transparently. Please,
> some x86 guru make some comments here :-)
x86 is just slower for this case because all accesses will eat the
unaligned penalty, but should work.
I could have done it this way on x86-64 too, but chose to handle it.
> It looks like sparc64 is the only platform where oddly aligned buffer
> can truly cause problems and I can fix that easily enough.
It could allow everybody to generate packets with bogus addresses on
the network.
I suspect on sparc64 it will all just be handled by the unalignment handler
in the kernel? If so, it will be incredibly slow, but it should work.
-Andi
From: Andi Kleen <[email protected]>
Date: Tue, 3 Sep 2002 12:10:41 +0200
On Tue, Sep 03, 2002 at 03:00:25AM -0700, David S. Miller wrote:
> Ok I think we really need to fix this then in the arches
> where broken. Let's do an audit. :-)
Yes, it's needed because users can pass unaligned addresses in from
userspace to sendmsg
I know, that is old news. The new case is csum_partial, since
sys_sendfile can use byte-aligned page offsets.
Traditionally this never really happened because
skb->data was always nicely aligned.
> It looks like sparc64 is the only platform where oddly aligned buffer
> can truly cause problems and I can fix that easily enough.
It could allow everybody to generate packets with bogus addresses on
the network.
No, the sparc64 case generates an OOPS for csum_partial only.
csum_partial_copy_*() is perfectly fine for byte aligned buffers.
I suspect on sparc64 it will just be all handled by the unalignment handler
in the kernel ? If yes it will be incredibly slow, but should work.
That's what happens now as long as the VIS instructions don't
get used. The one case where an oddly aligned buffer address is
not checked (to avoid using VIS) is csum_partial.
The unaligned trap handler doesn't handle the 64-byte VIS loads into
the fpu regs from kernel mode, and this is by choice. I'd rather back
down to an integer algorithm than actually handle this odd case.
David S. Miller writes:
> It is real requirement, x86 works because unaligned
> access is handled transparently by cpu.
>
> But on every non-x86 csum_partial I have examined, worse than
> 2-byte aligned data start is totally not handled. It is not difficult
> to figure out why this is the case, everyone has copied by example. :-)
PPC and PPC64 are OK, I believe, since the CPU handles (almost) all
unaligned accesses in hardware. (Some PowerPC implementations trap if
the access crosses a page boundary but the newer ones even handle that
case in hardware, and if we do get the trap we fix it up.)
I notice though that if the length is odd, we (PPC) put the last byte
in the left-hand (most significant) byte of a 16-bit halfword, with
zero in the other byte, and add it in, whereas i386 puts the last byte
in the least-significant position. Hmmm... should be OK though since
I presume the result will be reduced and then converted to network
byte order before being put in the packet. And since there is an
end-around carry we should end up with the same bytes that i386 does.
Paul.
From: Paul Mackerras <[email protected]>
Date: Tue, 3 Sep 2002 21:27:59 +1000 (EST)
I notice though that if the length is odd, we (PPC) put the last byte
in the left-hand (most significant) byte of a 16-bit halfword, with
zero in the other byte, and add it in, whereas i386 puts the last byte
in the least-significant position.
What PPC does is correct for big endian.
Hello!
> I guess it may also depend on bad implementations of csum_partial().
> It's wrong that some architecture assume every data in a skbuff are
> aligned on a 2byte boundary so that it would access a byte next to
> the the last byte where no pages might be there.
Access beyond the end of an skb is officially allowed, within 16 bytes
in <= 2.2 and within 64 bytes in >= 2.4. Moreover, it is not only allowed
but highly recommended, when this can ease coding.
> It's time to fix csum_partial().
Well, regardless of the wrong emphasis you put on it, the change is not useless.
Alexey
PS Gentlemen, it is not such a bad idea to change the subject and to trim
the cc list. The thread has gone into an area completely orthogonal to TSO;
csum_partial is not used with TSO at all. :-)
Hello!
> > I guess it may also depend on bad implementations of csum_partial().
> > It's wrong that some architecture assume every data in a skbuff are
> > aligned on a 2byte boundary so that it would access a byte next to
> > the the last byte where no pages might be there.
>
> Access beyond end of skb is officially allowed, within 16 bytes
> in <= 2.2, withing 64 bytes in >=2.4. Moreover, it is not only allowed
> but highly recommended, when this can ease coding.
An skb may have some pages in its skb_shared_info as frags, and each
page may have no extra space after it, while csum_partial() is used
to compute the checksum of each page.
We can see that skb_checksum() calls csum_partial() against each page in
the skb. Nobody knows whether the next page exists, as the fragment may be
mapped in kmap space.
Hmmm...
Only the implementation for x86 may have the problem that csum_partial()
can access beyond the end of the page.
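As a much-simplified sketch of the pattern being described (this is not the
real net/core/skbuff.c code, and the combining of per-fragment partial sums
is glossed over):

/* Simplified sketch, not the real skb_checksum(): each paged fragment is
 * kmap()ed and handed to csum_partial() on its own.  A fragment may end on
 * the very last byte of its page, so a csum_partial() that reads even one
 * byte past 'size' can touch an unmapped highmem page and oops.
 * (The real code also has to combine partial sums carefully when a
 * fragment length is odd; that is omitted here.) */
#include <linux/skbuff.h>
#include <linux/highmem.h>
#include <net/checksum.h>

static unsigned int frag_csum_sketch(struct sk_buff *skb, unsigned int csum)
{
	int i;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
		unsigned char *vaddr = kmap(frag->page);

		/* must not read past vaddr + page_offset + size */
		csum = csum_partial(vaddr + frag->page_offset,
				    frag->size, csum);
		kunmap(frag->page);
	}
	return csum;
}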
Hello!
> We can see skb_checksum() calls csum_partial() againt each page in skb.
Good point...
Dave, look, he says we will oops when sendfiling the last byte of a page
and skb_checksum() has to be called.
Alexey
Hello!
> Skb may have some pages in its skb_shared_info as frags, but each
> page may not have extra space in it while csum_partial() is used
> to compute checksum against each page.
>
> We can see skb_checksum() calls csum_partial() againt each page in skb.
> No one knows whether the next page exits or not as it may be mapped
> in kmap space.
Sadly, it happened on my machine.
And the Oops said csum_partial() tried to access the next page, which was
not kmapped yet.
Jordi Ros wrote:
> What i am wondering is how come we only get a few percentage
> improvement in throughput. Theoretically, since 64KB/1.5KB ~=
> 40, we should get a throughput improvement of 40 times.
You're confusing number of packets with throughput. Look at the wire, and you
can't tell the difference with or without TSO. It's the same amount of data
on the wire. As David pointed out, the savings come from how much data is
DMA'ed across the bus and how much the CPU is unburdened by the segmentation
task. A 64K TSO send would be one pseudo-header and the rest payload. Without
TSO you would add ~40 more headers. That's the savings across the bus.
> Is there any other bottleneck in the system that prevents
> us to see the 300% improvement? (i am assuming the card can
> do tso at wire speed)
My numbers are against PCI 64/66MHz, so that's limiting. You're not going
to get much more than 940Mbps unidirectional at 1GbE. That's why all of the
savings for unidirectional Tx are in CPU reduction.
-scott
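As a sanity check on the 940Mbps figure (illustrative arithmetic only,
assuming a 1448-byte payload per frame, i.e. a 1500-byte MTU with TCP
timestamps), per-frame overhead on gigabit Ethernet accounts for it almost
exactly:

/* Back-of-the-envelope check that ~940Mbps is TCP wire speed on 1GbE:
 * 12 bytes of TCP options + 40 bytes of TCP/IP headers + 18 bytes of
 * Ethernet header/FCS + 20 bytes of preamble and interframe gap
 * accompany every 1448-byte payload. */
#include <stdio.h>

int main(void)
{
	const double line_rate = 1e9;			/* bits per second */
	const double payload   = 1448.0;
	const double overhead  = 12 + 40 + 18 + 20;	/* 90 bytes per frame */
	const double goodput   = line_rate * payload / (payload + overhead);

	printf("max TCP goodput: %.0f Mbps\n", goodput / 1e6);	/* ~941 */
	return 0;
}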
Hirokazu Takahashi wrote:
> P.S.
> Using "bswap" is little bit tricky.
>
bswap was added with the 80486; the 80386 does not have that instruction,
and perhaps it's missing in some embedded-system CPUs, too. Is it possible
to avoid it?
--
Manfred
From: [email protected]
Date: Tue, 3 Sep 2002 17:22:37 +0400 (MSD)
Dave, look, he says we will oops when sendfiling the last byte of a page,
and will have to call skb_checksum().
It is true. But his patch must be rewritten; bswap is forbidden
on older processors.
A better fix is to verify len >= 2 before the half-word alignment
test at the beginning of csum_partial. I am not enough of
an x86 coder to hack this up reliably. :-)
From: "David S. Miller" <[email protected]>
Date: Tue, 03 Sep 2002 14:05:55 -0700 (PDT)
Better fix is to verify len >=2 before half-word alignment
test at the beginning of csum_partial. I am not enough of
an x86 coder to hack this up reliably. :-)
Further inspection shows that the PII/PPro csum_partial variant requires
even more surgery and is even further outside my realm of x86 asm
capability :-)
Hello,
> Hirokazu Takahashi wrote:
> > P.S.
> > Using "bswap" is little bit tricky.
> >
>
> bswap was added with the 80486 - 80386 do not have that instruction,
> perhaps it's missing in some embedded system cpus, too. Is is possible
> to avoid it?
There are two versions of csum_partial() for x86.
I added bswap only to the PII/PPro csum_partial().
Followup to: <[email protected]>
By author: Hirokazu Takahashi <[email protected]>
In newsgroup: linux.dev.kernel
>
> P.S.
> Using "bswap" is little bit tricky.
>
It needs to be protected by CONFIG_I486 and alternate code implemented
for i386 (xchg %al,%ah; rol $16,%eax, xchg %al,%ah for example.)
-hpa
--
<[email protected]> at work, <[email protected]> in private!
"Unix gives you enough rope to shoot yourself in the foot."
http://www.zytor.com/~hpa/puzzle.txt <[email protected]>
From: "H. Peter Anvin" <[email protected]>
Date: 3 Sep 2002 18:02:17 -0700
Followup to: <[email protected]>
By author: Hirokazu Takahashi <[email protected]>
In newsgroup: linux.dev.kernel
>
> P.S.
> Using "bswap" is little bit tricky.
>
It needs to be protected by CONFIG_I486 and alternate code implemented
for i386 (xchg %al,%ah; rol $16,%eax, xchg %al,%ah for example.)
He only used bswap in the P-II/PPRO csum_partial, which is
ifdef protected.
On 3 Sep 2002, H. Peter Anvin wrote:
[Sorry HPA, I forgot to cc to linux-kernel the first time.]
> > P.S.
> > Using "bswap" is little bit tricky.
> >
>
> It needs to be protected by CONFIG_I486 and alternate code implemented
> for i386 (xchg %al,%ah; rol $16,%eax, xchg %al,%ah for example.)
While it would work, this sequence is overkill. Unless I'm mistaken, the
only property of bswap which is used in this case is that it swaps even
and odd bytes, which can be done by a simple "roll $8,%eax" (or rorl).
I believe that bswap is one byte shorter than roll. In any case, using a
rotate might be the right thing to do on other architectures.
Gabriel.
Gabriel Paubert wrote:
> On 3 Sep 2002, H. Peter Anvin wrote:
>
> [Sorry HPA, I forgot to cc to linux-kernel the first time.]
>
>
>>>P.S.
>>> Using "bswap" is little bit tricky.
>>>
>>
>>It needs to be protected by CONFIG_I486 and alternate code implemented
>>for i386 (xchg %al,%ah; rol $16,%eax, xchg %al,%ah for example.)
>
>
> While it would work, this sequence is overkill. Unless I'm mistaken, the
> only property of bswap which is used in this case is that it swaps even
> and odd bytes, which can be done by a simple "roll $8,%eax" (or rorl).
>
> I believe that bswap is one byte shorter than roll. In any case, using a
> rotate might be the right thing to do on other architectures.
>
And again, I think you'll find the rotate faster on at least some x86 cores.
-hpa
On Wed, 2002-09-04 at 23:39, Gabriel Paubert wrote:
> While it would work, this sequence is overkill. Unless I'm mistaken, the
> only property of bswap which is used in this case is that it swaps even
> and odd bytes, which can be done by a simple "roll $8,%eax" (or rorl).
bswap is a 32bit swap.
Alan Cox wrote:
> On Wed, 2002-09-04 at 23:39, Gabriel Paubert wrote:
> > While it would work, this sequence is overkill. Unless I'm mistaken, the
> > only property of bswap which is used in this case is that it swaps even
> > and odd bytes, which can be done by a simple "roll $8,%eax" (or rorl).
>
> bswap is a 32bit swap.
Yes it's different from the roll $8, but if all you need is to swap odd
and even bytes for the IP checksum, either instruction is fine.
-- Jamie
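The property being relied on here is that, for the 16-bit one's-complement
sum, byte-swapping every addend is the same as byte-swapping the result, so
an odd starting address can be handled by accounting for the first byte
separately and swapping the partial sum back at the end. A small user-space
demonstration (illustrative only, not the kernel's csum_partial; it uses
byte loads throughout, so it shows the arithmetic identity rather than the
unaligned-access hazard):

/* Both functions below print the same value for any buffer contents. */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

static uint16_t fold(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

static uint16_t swap16(uint16_t x)	/* what roll $8 / bswap achieve */
{
	return (uint16_t)((x << 8) | (x >> 8));
}

/* Reference: pair bytes from offset 0, as in RFC 1071. */
static uint16_t csum_ref(const uint8_t *p, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += ((uint32_t)p[i] << 8) | p[i + 1];
	if (len & 1)
		sum += (uint32_t)p[len - 1] << 8;
	return fold(sum);
}

/* The odd-start trick: the first byte contributes in the high half, the
 * rest is summed from the now-even offset and byte-swapped at the end. */
static uint16_t csum_odd_start(const uint8_t *p, size_t len)
{
	uint32_t first = (uint32_t)p[0] << 8;

	return fold(first + swap16(csum_ref(p + 1, len - 1)));
}

int main(void)
{
	uint8_t buf[31];
	size_t i;

	for (i = 0; i < sizeof(buf); i++)
		buf[i] = (uint8_t)(i * 37 + 5);

	printf("reference: %04x  odd-start trick: %04x\n",
	       csum_ref(buf, sizeof(buf)),
	       csum_odd_start(buf, sizeof(buf)));
	return 0;
}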
From: Hirokazu Takahashi <[email protected]>
Date: Thu, 05 Sep 2002 11:13:26 +0900 (JST)
> Better fix is to verify len >=2 before half-word alignment
> test at the beginning of csum_partial. I am not enough of
> an x86 coder to hack this up reliably. :-)
Don't care about the order of checking len and half-word alignment
as both of them have to be checked after all.
I speak of non-PII/PPRO csum_partial.
Franks a lot,
David S. Miller
[email protected]
Hello,
> > While it would work, this sequence is overkill. Unless I'm mistaken, the
> > only property of bswap which is used in this case is that it swaps even
> > and odd bytes, which can be done by a simple "roll $8,%eax" (or rorl).
> >
> > I believe that bswap is one byte shorter than roll. In any case, using a
> > rotate might be the right thing to do on other architectures.
> >
>
> And again, I think you'll find the rotate faster on at least some x86 cores.
Yeah, I replaced "bswap %eax" with "roll $8,%eax", which would be more
familiar to us.
> Better fix is to verify len >=2 before half-word alignment
> test at the beginning of csum_partial. I am not enough of
> an x86 coder to hack this up reliably. :-)
The order of checking len and the half-word alignment doesn't matter,
as both of them have to be checked anyway.
Thank you,
Hirokazu Takahashi.
--- linux/arch/i386/lib/checksum.S.BUG Sun Sep 1 17:00:59 2030
+++ linux/arch/i386/lib/checksum.S Thu Sep 5 10:33:31 2030
@@ -126,8 +126,8 @@ csum_partial:
movl 16(%esp),%ecx # Function arg: int len
movl 12(%esp),%esi # Function arg: const unsigned char *buf
- testl $2, %esi
- jnz 30f
+ testl $3, %esi
+ jnz 25f
10:
movl %ecx, %edx
movl %ecx, %ebx
@@ -145,6 +145,20 @@ csum_partial:
lea 2(%esi), %esi
adcl $0, %eax
jmp 10b
+25:
+ testl $1, %esi
+ jz 30f
+ # buf is odd
+ dec %ecx
+ jl 90f
+ roll $8, %eax
+ movzbl (%esi), %ebx
+ shll $8, %ebx
+ addl %ebx, %eax
+ adcl $0, %eax
+ inc %esi
+ testl $2, %esi
+ jz 10b
30: subl $2, %ecx
ja 20b
@@ -211,6 +225,10 @@ csum_partial:
addl %ebx,%eax
adcl $0,%eax
80:
+ testl $1, 12(%esp)
+ jz 90f
+ roll $8, %eax
+90:
popl %ebx
popl %esi
ret
On Thu, 5 Sep 2002, Hirokazu Takahashi wrote:
> > And again, I think you'll find the rotate faster on at least some x86 cores.
>
> Yeah, I replaced "bswap %eax" with "roll $8,%eax" which would be more
> familier to us.
That's up to you. Since the bswap or roll is only in the conditional,
hopefully infrequently used, path for an odd buffer address, I don't
believe that selecting one or the other has any measurable impact.
> +25:
> + testl $1, %esi
> + jz 30f
> + # buf is odd
> + dec %ecx
> + jl 90f
> + roll $8, %eax
> + movzbl (%esi), %ebx
> + shll $8, %ebx
> + addl %ebx, %eax
> + adcl $0, %eax
> + inc %esi
> + testl $2, %esi
> + jz 10b
Now that is grossly inefficient ;-) since you can save one instruction by
moving roll after adcl (hand edited partial patch hunk, won't apply):
+25:
+ testl $1, %esi
+ jz 30f
+ # buf is odd
+ dec %ecx
+ jl 90f
+ movzbl (%esi), %ebx
+ addl %ebx, %eax
+ adcl $0, %eax
+ inc %esi
+ roll $8, %eax
+ testl $2, %esi
+ jz 10b
Gabriel.
Gabriel Paubert wrote:
> Now that is grossly inefficient ;-) since you can save one instruction by
> moving roll after adcl (hand edited partial patch hunk, won't apply):
Yes but is it _faster_? :-)
I've been doing some PPro assembly lately, and I'm reminded that
sometimes inserting instructions can reduce the timing by up to 8 cycles
or so.
-- Jamie
From: Gabriel Paubert <[email protected]>
Date: Thu, 5 Sep 2002 15:21:01 +0200 (CEST)
it is in the out of mainline code path for odd buffer addresses
This happens to occur on every packet for pppoe users, BTW.
On Thu, 5 Sep 2002, Jamie Lokier wrote:
> Gabriel Paubert wrote:
> > Now that is grossly inefficient ;-) since you can save one instruction by
> > moving roll after adcl (hand edited partial patch hunk, won't apply):
>
> Yes but is it _faster_? :-)
Hard to tell, with OOO engine and decoder constraints. But once again it
is in the out-of-mainline code path for odd buffer addresses, not in the
loop, so its performance is not critical. Actually, code size may have more
impact if it ends up spanning one more cache line (or even a 16-byte block
used as the fetch unit by P6 cores).
>
> I've been doing some PPro assembly lately, and I'm reminded that
> sometimes inserting instructions can reduce the timing by up to 8 cycles
> or so.
The one instruction that can still be moved around easily is the
pointer increment. But I would never try to improve code paths that I
consider non-critical.
Gabriel.
Hirokazu Takahashi wrote:
>
> I wish recent x86 processor can reorder instructions in it.
>
They can.
-hpa
Hello,
I updated csum_partial() for x86.
The csum_partial() for standard x86 can now also handle odd buffers.
> > > Now that is grossly inefficient ;-) since you can save one instruction by
> > > moving roll after adcl (hand edited partial patch hunk, won't apply):
I applied it, but the performance difference will be trivial on most packets.
> > I've been doing some PPro assembly lately, and I'm reminded that
> > sometimes inserting instructions can reduce the timing by up to 8 cycles
> > or so.
>
> The one instruction that you can still be moved around easily is the
> pointer increment. But I would never try to improve code paths that I
> consider non critical.
I wish recent x86 processors could reorder the instructions internally.
--- linux/arch/i386/lib/checksum.S.BUG Sun Sep 1 17:00:59 2030
+++ linux/arch/i386/lib/checksum.S Fri Sep 6 16:19:27 2030
@@ -55,8 +55,21 @@ csum_partial:
movl 20(%esp),%eax # Function arg: unsigned int sum
movl 16(%esp),%ecx # Function arg: int len
movl 12(%esp),%esi # Function arg: unsigned char *buff
- testl $2, %esi # Check alignment.
+ testl $3, %esi # Check alignment.
jz 2f # Jump if alignment is ok.
+ testl $1, %esi # Check alignment.
+ jz 10f # Jump if alignment is boundary of 2bytes.
+
+ # buf is odd
+ dec %ecx
+ jl 8f
+ movzbl (%esi), %ebx
+ adcl %ebx, %eax
+ roll $8, %eax
+ inc %esi
+ testl $2, %esi
+ jz 2f
+10:
subl $2, %ecx # Alignment uses up two bytes.
jae 1f # Jump if we had at least two bytes.
addl $2, %ecx # ecx was < 2. Deal with it.
@@ -111,6 +124,10 @@ csum_partial:
6: addl %ecx,%eax
adcl $0, %eax
7:
+ testl $1, 12(%esp)
+ jz 8f
+ roll $8, %eax
+8:
popl %ebx
popl %esi
ret
@@ -126,8 +143,8 @@ csum_partial:
movl 16(%esp),%ecx # Function arg: int len
movl 12(%esp),%esi # Function arg: const unsigned char *buf
- testl $2, %esi
- jnz 30f
+ testl $3, %esi
+ jnz 25f
10:
movl %ecx, %edx
movl %ecx, %ebx
@@ -145,6 +162,19 @@ csum_partial:
lea 2(%esi), %esi
adcl $0, %eax
jmp 10b
+25:
+ testl $1, %esi
+ jz 30f
+ # buf is odd
+ dec %ecx
+ jl 90f
+ movzbl (%esi), %ebx
+ addl %ebx, %eax
+ adcl $0, %eax
+ roll $8, %eax
+ inc %esi
+ testl $2, %esi
+ jz 10b
30: subl $2, %ecx
ja 20b
@@ -211,6 +241,10 @@ csum_partial:
addl %ebx,%eax
adcl $0,%eax
80:
+ testl $1, 12(%esp)
+ jz 90f
+ roll $8, %eax
+90:
popl %ebx
popl %esi
ret