Hi Scott,
Peculiarities in the PCI bridges on Power4 based ppc64 machines mean
that unaligned DMAs are horribly slow. This hits us hard on gigabit
transfers, since the packets (starting from the MAC header) are
usually only 2-byte aligned.
The patch below addresses this by copying and realigning packets into
nicely 2k aligned buffers. As well as fixing the alignment that
minimises the number of TCE lookups, and because we allocate the
buffers pci_alloc_consistent(), we avoid setting up and tearing down
the TCE mappings for each packet.
It's definitely a ppc64 specific hack, but I've tried to minimise the
patch's adverse impact on the rest of the code: It should cause no
change in behaviour or performance when the realignment is disabled,
which is true by default on everything except ppc64. I've also
attempted to minimise the impact on the readability of the rest of the
code.
It would be very helpful if you could include this patch in the
mainline driver.
diff -urN /scratch/anton/export/drivers/net/e1000/e1000.h linux-congo/drivers/net/e1000/e1000.h
--- /scratch/anton/export/drivers/net/e1000/e1000.h 2003-05-30 01:28:06.000000000 +1000
+++ linux-congo/drivers/net/e1000/e1000.h 2003-06-12 12:39:10.000000000 +1000
@@ -163,6 +163,43 @@
#define E1000_TX_DESC(R, i) E1000_GET_DESC(R, i, e1000_tx_desc)
#define E1000_CONTEXT_DESC(R, i) E1000_GET_DESC(R, i, e1000_context_desc)
+#ifdef CONFIG_PPC64
+/* We have a POWER4 specific performance hack to deal with the
+ * slowness of transferring unaligned frames over the PCI bus */
+#define E1000_REALIGN_DATA_HACK 1
+#else
+#define E1000_REALIGN_DATA_HACK 0
+#endif
+
+#define E1000_REALIGN_BUFFER_SIZE 2048 /* importantly, >1514 */
+#define E1000_REALIGN_TARGET_ALIGNMENT E1000_REALIGN_BUFFER_SIZE
+
+#if E1000_REALIGN_DATA_HACK
+/* We want each buffer to lie within one page, to minimise TCE
+ * lookups */
+#if (PAGE_SIZE % E1000_REALIGN_BUFFER_SIZE)
+#warning E1000_REALIGN_BUFFER_SIZE is not a factor of page size.
+#endif
+
+struct e1000_realign_ring {
+ unsigned char *vaddr;
+ dma_addr_t dma_handle;
+ size_t size;
+};
+
+#define E1000_REALIGN_BUFFER_OFFSET(i) ((i)*E1000_REALIGN_BUFFER_SIZE)
+#define E1000_REALIGN_BUFFER(a,i) ((a)->realign_ring.vaddr ? \
+ (a)->realign_ring.vaddr + E1000_REALIGN_BUFFER_OFFSET(i) : \
+ NULL)
+#define E1000_REALIGN_BUFFER_DMA(a,i) ((a)->realign_ring.dma_handle \
+ + E1000_REALIGN_BUFFER_OFFSET(i))
+
+#else /* ! E1000_REALIGN_DATA_HACK */
+#define E1000_REALIGN_BUFFER(a,i) NULL
+#define E1000_REALIGN_BUFFER_DMA(a,i) 0
+#endif /* ! E1000_REALIGN_DATA_HACK */
+
+
/* board specific private data structure */
struct e1000_adapter {
@@ -186,6 +223,9 @@
/* TX */
struct e1000_desc_ring tx_ring;
+#if E1000_REALIGN_DATA_HACK
+ struct e1000_realign_ring realign_ring;
+#endif /* E1000_REALIGN_DATA_HACK */
uint32_t txd_cmd;
uint32_t tx_int_delay;
uint32_t tx_abs_int_delay;
diff -urN /scratch/anton/export/drivers/net/e1000/e1000_main.c linux-congo/drivers/net/e1000/e1000_main.c
--- /scratch/anton/export/drivers/net/e1000/e1000_main.c 2003-05-30 01:23:39.000000000 +1000
+++ linux-congo/drivers/net/e1000/e1000_main.c 2003-06-12 12:52:58.000000000 +1000
@@ -702,6 +702,23 @@
return 0;
}
+static inline void
+e1000_setup_realign_ring(struct e1000_adapter *adapter)
+{
+#if E1000_REALIGN_DATA_HACK
+ struct e1000_desc_ring *txdr = &adapter->tx_ring;
+ struct e1000_realign_ring *rr = &adapter->realign_ring;
+
+ rr->size = txdr->count * E1000_REALIGN_BUFFER_SIZE;
+ rr->vaddr = pci_alloc_consistent(adapter->pdev, rr->size,
+ &rr->dma_handle);
+
+ if (! rr->vaddr)
+ printk(KERN_WARNING "%s: could not allocate realignment buffers. Expect poor performance on Power4 hardware\n",
+ adapter->netdev->name);
+#endif /* E1000_REALIGN_DATA_HACK */
+}
+
/**
* e1000_setup_tx_resources - allocate Tx resources (Descriptors)
* @adapter: board private structure
@@ -735,6 +752,8 @@
}
memset(txdr->desc, 0, txdr->size);
+ e1000_setup_realign_ring(adapter);
+
txdr->next_to_use = 0;
txdr->next_to_clean = 0;
@@ -951,6 +970,19 @@
E1000_WRITE_REG(&adapter->hw, RCTL, rctl);
}
+static inline void
+e1000_free_realign_ring(struct e1000_adapter *adapter)
+{
+#if E1000_REALIGN_DATA_HACK
+ struct e1000_realign_ring *rr = &adapter->realign_ring;
+
+ if (rr->vaddr)
+ pci_free_consistent(adapter->pdev, rr->size,
+ rr->vaddr, rr->dma_handle);
+ rr->vaddr = NULL;
+#endif /* E1000_REALIGN_DATA_HACK */
+}
+
/**
* e1000_free_tx_resources - Free Tx Resources
* @adapter: board private structure
@@ -972,6 +1004,8 @@
adapter->tx_ring.desc, adapter->tx_ring.dma);
adapter->tx_ring.desc = NULL;
+
+ e1000_free_realign_ring(adapter);
}
/**
@@ -990,11 +1024,11 @@
for(i = 0; i < adapter->tx_ring.count; i++) {
if(adapter->tx_ring.buffer_info[i].skb) {
-
- pci_unmap_page(pdev,
- adapter->tx_ring.buffer_info[i].dma,
- adapter->tx_ring.buffer_info[i].length,
- PCI_DMA_TODEVICE);
+ if(adapter->tx_ring.buffer_info[i].dma)
+ pci_unmap_page(pdev,
+ adapter->tx_ring.buffer_info[i].dma,
+ adapter->tx_ring.buffer_info[i].length,
+ PCI_DMA_TODEVICE);
dev_kfree_skb(adapter->tx_ring.buffer_info[i].skb);
@@ -1491,6 +1525,21 @@
#define E1000_MAX_DATA_PER_TXD (1<<E1000_MAX_TXD_PWR)
static inline int
+e1000_realign_data(struct e1000_adapter *adapter, unsigned char *data,
+ int size, int i)
+{
+ unsigned char *buf = E1000_REALIGN_BUFFER(adapter, i);
+
+ if(buf && (size <= E1000_REALIGN_BUFFER_SIZE)
+ && ((unsigned long)(data) % E1000_REALIGN_TARGET_ALIGNMENT)) {
+
+ memcpy(buf, data, size);
+ return 1;
+ }
+ return 0;
+}
+
+static inline int
e1000_tx_map(struct e1000_adapter *adapter, struct sk_buff *skb,
unsigned int first)
{
@@ -1515,11 +1564,13 @@
size -= 4;
#endif
tx_ring->buffer_info[i].length = size;
- tx_ring->buffer_info[i].dma =
- pci_map_single(adapter->pdev,
- skb->data + offset,
- size,
- PCI_DMA_TODEVICE);
+ if (e1000_realign_data(adapter, skb->data+offset, size, i))
+ tx_ring->buffer_info[i].dma = 0;
+ else
+ tx_ring->buffer_info[i].dma =
+ pci_map_single(adapter->pdev,
+ skb->data + offset, size,
+ PCI_DMA_TODEVICE);
tx_ring->buffer_info[i].time_stamp = jiffies;
len -= size;
@@ -1593,7 +1644,13 @@
while(count--) {
tx_desc = E1000_TX_DESC(*tx_ring, i);
- tx_desc->buffer_addr = cpu_to_le64(tx_ring->buffer_info[i].dma);
+ if (E1000_REALIGN_DATA_HACK
+ && !tx_ring->buffer_info[i].dma)
+ tx_desc->buffer_addr =
+ cpu_to_le64(E1000_REALIGN_BUFFER_DMA(adapter, i));
+ else
+ tx_desc->buffer_addr =
+ cpu_to_le64(tx_ring->buffer_info[i].dma);
tx_desc->lower.data =
cpu_to_le32(txd_lower | tx_ring->buffer_info[i].length);
tx_desc->upper.data = cpu_to_le32(txd_upper);
--
David Gibson | For every complex problem there is a
[email protected] | solution which is simple, neat and
| wrong.
http://www.ozlabs.org/people/dgibson
David, arch-specific code in the driver concerns me. I would really
like to avoid any arch-specific code in the driver if at all possible.
Can we find a way to move this work to the arch-dependent areas? This
doesn't seem to be an issue unique to e1000, so moving this up one level
should benefit other devices as well. More questions below.
> Peculiarities in the PCI bridges on Power4 based ppc64 machines mean
> that unaligned DMAs are horribly slow. This hits us hard on gigabit
> transfers, since the packets (starting from the MAC header) are
> usually only 2-byte aligned.
2-byte alignment is bad for ppc64, so what is minimum alignment that is
good? (I hope it's not 2K!) What could you do at a higher level to
present properly aligned buffers to the driver?
> The patch below addresses this by copying and realigning packets into
> nicely 2k aligned buffers. As well as fixing the alignment that
> minimises the number of TCE lookups, and because we allocate the
> buffers pci_alloc_consistent(), we avoid setting up and tearing down
> the TCE mappings for each packet.
If I'm understanding the patch correctly, you're saying unaligned DMA +
TCE lookup is more expensive than a data copy? If we copy the data, we
loss some of the benefits of TSO and Zerocopy and h/w checksum
offloading! What is more expensive, unaligned DMA or TCE?
-scott
Scott, we don't copy large blocks like large send.
The code is just suppose to copy any packets 2K or less.
Actually, David jumped the gun here a bit as we were going to write up a
more proposal for this.
I expect that this would help other acrh. as well.
The issue is the alignment and problems that causes on the PCI bus.
In a nutshell, it takes several more DMA's to get the transfer done. Not
sure if this would hold true on all platforms (IE is a function of the
adapter and or PCI bus) or if it is due to the bridge chips (PHB's) on
pSeries (PPC).
I think the only issue here that is PPC specific is that we have several
bridge chips in the IO path so LATENCY is longer that if you have a direct
attached bus. However, many platforms that need more PCI slots have to add
bridge chips into the path (required by the PCI architecture due to loads
on the bus, etc). Thus, the extra DMA's required due to miss-alignment
lead to longer bus times if bridge chips are in the IO path.
I copied below a note from 3/16/2003 that shows the issues. Note that the
main issue here is on PCI bus (not PCI-X).
The reason is that PCI-X with MMRBC =2048 reads larger chunks and this
helps. However, we have PIC-X bus traces and this alignment also helps
PCI-X as well, just to a lesser extend with respect to performance.
I don't think the TCE is an issue in this one case because the driver was
already DMA'ing a single "linear" buffer to the adapter.
It was just aligned in such a way that it takes more DMA's to finish.
In AIX, we did the copy to get to a single DMA buffer (which Linux already
does) as it has the TCP/IP headers in one buffer and data in another buffer
(one or two in some cases for MTU 1500). Thus we took two TCE lookups in
the AIX case so the copy saved us more.
Because we only copy 2K or less, the cost is minimal as the data is already
in the cache (typically) from being copied into the kernel (non-zero copy
case).
So in summary I think this is mainly an alignment issue and it is likely to
help other platforms as well so should be considered for the driver.
Here is the older note on the PCI bus trace. The receive alignment was
addressed in a different defect and I think has been taken care of.
To: Ricardo C Gonzalez/Austin/IBM@ibmus, Nancy J
Milliner/Austin/IBM@IBMUS, Anton Blanchard/Australia/IBM@IBMAU, Michael
Perez/Austin/IBM@IBMUS, Pat Buckland/Austin/IBM@IBMUS
cc: Brian Twichell/Austin/IBM@IBMUS, Bill Buros/Austin/IBM@IBMUS, Binh
Hua/Austin/IBM@IBMUS, Jim Cunningham/Austin/IBM@IBMUS, Brian M
Rzycki/Austin/IBM@IBMUS
From: Herman Dierks/Austin/IBM@IBMUS
Subject: Goliad performance issues on linux, results of PCI bus traces
This note is partially regarding bug1589. Will have to add some of this
info into that defect.
Any time you go looking for problems, you often find more than you went
looking for. Such is probably the case here so we may end up with a few
other defects if we want to track them seperately. We just looked at Linux
2.4, so I assume the same issues are in 2.5 but would need to be verified.
I have talked with Anton about issues 1 and 2 below so he will see about
fixing them so we can prototype the change and measure it.
Binh Hua came over and we ran a number of PCI bus traces using Brian &
Jim's bus Analizer over several days. (Thanks guys).
As this is just a "PCI" analizer (and not a PCI-X analizer), we could only
trace on the p660 (M2+) system.
After first day of tracing, we saw all 256 byte transfers on the bus for
Memory Read Multiple commands (MMR).
I thought those should be 512 and investigated the system FW level.
I found the FW on the machine was quite old (from 2001) and predated some
Goliad perf changes.
So I installed a 020910 level FW on the M2+.
The performance on AIX then jumped up 200 Mbits/sec. The PCI bus analizer
trace then showed we are doing 512 byte transfers as we should be.
On AIX, this took the PCI commands from 10 down to 3 to send one packet.
These are not CPU cycles but DMA bus cycles.
Thus you can see the large affect that saving a few commands can have on
performance.
This above paragraph is just a side note, and I will not put that into the
defect.
The point of this paragraph is that the FW sets up the bridge chips based
on adapter and its very important to have the right level of FW on the
system when looking at any performance issues.
Now that the M2+ had the right FW, we did a few more sniff tests and took
more bus traces on both AIX and then the following day on Linux.
Cutting to the chase, based on what we see in the traces, we see a couple
of performance problems.
1) Linux is not aligning the transmit buffer on a cache line.
M2+ to LER performance for LINUX is 777 Mbits while for AIX it is
859 Mbits. We have large send (TSO) disabled on AIX to make this apples to
apples.
So there is a 82 Mbit difference (10%) here that we feel is due to
the cache alignment. See details below.
With large send enabled, AIX tets 943 Mbits with single session.
The solution here is to align the transmit buffer onto a 128 byte
cache line. This is probably being controled up in the TCP stack above the
driver.
If we have to fix this in the driver, it would mean another
copy.
2) Linux is not aligning the receive buffer on a cache line.
On a number of PCI commands viewpoint, Linux alignment is fewer PCI
commands.
However, the bad new of the way it does the alignment is that the
memory controller will have to do "read-modify-write" operations to RAM to
update partial cache lines. That is very costly on the memory subsystem
and will hurt is as we scale up the number of adapters and the memory
bandwidth becomes taxed.
This may need to be a new defect on just the e1000 driver (at least for PPC
and PPC64).
3) We need to do a second look at the number of TX and RX descriptors
fetched by the adapter on Linux. These values may be set up way too large.
Binh thought on AIX we just move a cache line's worth at a time.
This is a secondary issue (generates bus traffic) and we need to
investigate a bit more before we open a defect on this.
DETAILS:
I will show the PCI commands for sending (reading from memory) for AIX
first and then for Linux 2.4. This is with correct level of FW on M2+.
PCI Commands are : MRM is Memory Read Multiple (IE multiple cache
lines)
MRL is Memory Read Line (IE one cache line of 128 bytes)
MR is Memory Read (IE reading fewer bytes than one cache
line)
AIX sending one 1514 byte packet (859 Mbits sec rate)
Address PCI comand Bytes xfered
4800 MRM 512
4A00 MRM 512
4C00 MRM 490
-------
1514 bytes
Linux sending one 1514 byte packet (777 Mbits sec rate)
Address PCI command Bytes xfered
420A8 MRM 344
42200 MRM 512
42400 MRM 512
42600 MRL 128
42680 MR 18
-----
1514 bytes , total time of 14.108 micro
seconds
So, notice what happended. On AIX we move the data in 3 PCI commands.
On Linux, due to the alignment, it took 5 operations.
The bus latency on these RIO based IO drawer machines is very long. Each of
these commands takes at least 700 nanosec to issue the command.
It can take longer to issue the command than to transfer the bytes.
So, we want to change Linux to align the buffer on a 128 byte cache
boundry. That should get us parity with AIX.
Now lets look at receive side (writes to memory)
AIX receiving one 1514 byte packet
Address PCI command Bytes xfered
EE800 MWI 512
EA00 MWI 512
EC00 MWI 384
ED80 MW 106 (partial cache line)
-----
1514 bytes,
LINUX receiving one 1514 byte packet
Address PCI Command Bytes xfered
2012 MWI 494 bytes (partial cache line)
2200 MWI 512
2400 MWI 508 (partial cache line)
----------
1514 bytes, total time 6.496 micro sec
Linux does the xfer in 3 PCI commands. However, it appears that due to the
partial cache lines (one at start and one at end) that this will cause one
more read-modify-write of a cache line at the memory controller to update
the line.
We need to verify this with Pat or Mike (so I am copying them on this note)
If so, then we should probably align the RX buffer to start on a cache
line and take the hit on one more PCI command. Just depends on what we
want to optimize for, esp. on a busy server.
"Feldman, Scott" <[email protected]> on 06/12/2003 08:16:11 PM
To: "David Gibson" <[email protected]>
cc: <[email protected]>, "Anton Blanchard" <[email protected]>,
Nancy J Milliner/Austin/IBM@IBMUS, Herman Dierks/Austin/IBM@IBMUS,
Ricardo C Gonzalez/Austin/IBM@ibmus
Subject: RE: e1000 performance hack for ppc64 (Power4)
David, arch-specific code in the driver concerns me. I would really
like to avoid any arch-specific code in the driver if at all possible.
Can we find a way to move this work to the arch-dependent areas? This
doesn't seem to be an issue unique to e1000, so moving this up one level
should benefit other devices as well. More questions below.
> Peculiarities in the PCI bridges on Power4 based ppc64 machines mean
> that unaligned DMAs are horribly slow. This hits us hard on gigabit
> transfers, since the packets (starting from the MAC header) are
> usually only 2-byte aligned.
2-byte alignment is bad for ppc64, so what is minimum alignment that is
good? (I hope it's not 2K!) What could you do at a higher level to
present properly aligned buffers to the driver?
> The patch below addresses this by copying and realigning packets into
> nicely 2k aligned buffers. As well as fixing the alignment that
> minimises the number of TCE lookups, and because we allocate the
> buffers pci_alloc_consistent(), we avoid setting up and tearing down
> the TCE mappings for each packet.
If I'm understanding the patch correctly, you're saying unaligned DMA +
TCE lookup is more expensive than a data copy? If we copy the data, we
loss some of the benefits of TSO and Zerocopy and h/w checksum
offloading! What is more expensive, unaligned DMA or TCE?
-scott
Too long to quote:
http://marc.theaimsgroup.com/?t=105538879600001&r=1&w=2
Wouldn't you get most of the benefit from copying that stuff around in
the driver if you allocated the skb->data aligned in the first place?
There's already code to align them on CPU cache boundaries:
#define SKB_DATA_ALIGN(X) (((X) + (SMP_CACHE_BYTES - 1)) & \
~(SMP_CACHE_BYTES - 1))
So, do something like this:
#ifdef ARCH_ALIGN_SKB_BYTES
#define SKB_ALIGN_BYTES ARCH_ALIGN_SKB_BYTES
#else
#define SKB_ALIGN_BYTES SMP_CACHE_BYTES
#endif
#define SKB_DATA_ALIGN(X) (((X) + (ARCH_ALIGN_SKB - 1)) & \
~(SKB_ALIGN_BYTES - 1))
You could easily make this adaptive to no align on th arch size when the
request is bigger than that, just like in the e1000 patch you posted.
--
Dave Hansen
[email protected]
I will let Anton respond to this. I think he may have tried this some
time back in his early prototypes to fix this.
I think the problem was not where the buffer started but where the packet
ended up within the buffer.
Due to varying sizes of TCP and IP headers the packet ended up at some
non-cache aligned address.
What we need for the DMA to work well is to have the final packet (with
datalink headers) starting on a cache line as its the final packet that
must be DMA'd. In fact it may need to to be aligned to a higher level than
that (not sure).
[email protected] on 06/13/2003 11:21:03 AM
To: Herman Dierks/Austin/IBM@IBMUS
cc: "Feldman, Scott" <[email protected]>, David Gibson
<[email protected]>, Linux Kernel Mailing List
<[email protected]>, Anton Blanchard <[email protected]>,
Nancy J Milliner/Austin/IBM@IBMUS, Ricardo C
Gonzalez/Austin/IBM@ibmus, Brian Twichell/Austin/IBM@IBMUS,
[email protected]
Subject: RE: e1000 performance hack for ppc64 (Power4)
Too long to quote:
http://marc.theaimsgroup.com/?t=105538879600001&r=1&w=2
Wouldn't you get most of the benefit from copying that stuff around in
the driver if you allocated the skb->data aligned in the first place?
There's already code to align them on CPU cache boundaries:
#define SKB_DATA_ALIGN(X) (((X) + (SMP_CACHE_BYTES - 1)) & \
~(SMP_CACHE_BYTES - 1))
So, do something like this:
#ifdef ARCH_ALIGN_SKB_BYTES
#define SKB_ALIGN_BYTES ARCH_ALIGN_SKB_BYTES
#else
#define SKB_ALIGN_BYTES SMP_CACHE_BYTES
#endif
#define SKB_DATA_ALIGN(X) (((X) + (ARCH_ALIGN_SKB - 1)) & \
~(SKB_ALIGN_BYTES - 1))
You could easily make this adaptive to no align on th arch size when the
request is bigger than that, just like in the e1000 patch you posted.
--
Dave Hansen
[email protected]
> So in summary I think this is mainly an alignment issue and
> it is likely to help other platforms as well so should be
> considered for the driver.
But we're applying the alignment constraint in the wrong direction. The
e1000 h/w doesn't have an alignment constraint, it's your arch. Dave
Hansen's suggestion is in the right direction; let's see what Anton's
response is.
-scott
> Wouldn't you get most of the benefit from copying that stuff around in
> the driver if you allocated the skb->data aligned in the first place?
Nice try, but my understanding is that on the transmit path we reserve
the maximum sized TCP header, copy the data in then form our TCP header
backwards from that point. Since the TCP header size changes with
various options, its not an easy task.
One thing I thought of doing was to cache the current TCP header size
and align the next packet based on it, with an extra cacheline at the
start for it to spill into if the TCP header grew.
This is only worth it if most packets will have the same sized header.
Networking guys: is this a valid assumption?
Anton
From: Anton Blanchard <[email protected]>
Date: Sat, 14 Jun 2003 08:38:41 +1000
This is only worth it if most packets will have the same sized header.
Networking guys: is this a valid assumption?
Not really... one retransmit and the TCP header size grows
due to the SACK options.
I find it truly bletcherous what you're trying to do here.
Why not instead find out if it's possible to have the e1000
fetch the entire cache line where the first byte of the packet
resides? Even ancient designes like SunHME do that.
> 2-byte alignment is bad for ppc64, so what is minimum alignment that is
> good? (I hope it's not 2K!) What could you do at a higher level to
> present properly aligned buffers to the driver?
Best case 128 bytes - ppc64 cacheline size. Worst case 512 bytes - our
pci-pci bridge likes to prefetch in 512 byte increments. From Herman's
data you can see this in action:
Linux sending one 1514 byte packet (777 Mbits sec rate)
Address PCI command Bytes xfered
420A8 MRM 344
42200 MRM 512
42400 MRM 512
42600 MRL 128
42680 MR 18
With the buffer 512 byte aligned this is completed in 3 transactions.
Yes the hardware could be more intelligent about these unaligned
transactions but we cant do much about that now.
We might be able to do the alignment at a higher level but its not
straightforward (see my previous mail).
> If I'm understanding the patch correctly, you're saying unaligned DMA +
> TCE lookup is more expensive than a data copy? If we copy the data, we
> loss some of the benefits of TSO and Zerocopy and h/w checksum
> offloading! What is more expensive, unaligned DMA or TCE?
Lets ignore TCE lookup overhead for the moment. As Herman pointed all
these DMAs should occur on the same page.
Anton
> Not really... one retransmit and the TCP header size grows
> due to the SACK options.
OK scratch that idea.
> I find it truly bletcherous what you're trying to do here.
I think so too, but its hard to ignore ~100Mbit/sec in performance.
> Why not instead find out if it's possible to have the e1000
> fetch the entire cache line where the first byte of the packet
> resides? Even ancient designes like SunHME do that.
Rusty and I were wondering why the e1000 didnt do that exact thing.
Scott: is it possible to enable such a thing?
Anton
> > Why not instead find out if it's possible to have the e1000
> > fetch the entire cache line where the first byte of the
> > packet resides? Even ancient designes like SunHME do that.
>
> Rusty and I were wondering why the e1000 didnt do that exact thing.
>
> Scott: is it possible to enable such a thing?
I thought the answer was no, so I double checked with a couple of
hardware guys, and the answer is still no.
-scott
From: "Feldman, Scott" <[email protected]>
Date: Fri, 13 Jun 2003 16:52:18 -0700
> > Why not instead find out if it's possible to have the e1000
> > fetch the entire cache line where the first byte of the
> > packet resides? Even ancient designes like SunHME do that.
>
> Rusty and I were wondering why the e1000 didnt do that exact thing.
>
> Scott: is it possible to enable such a thing?
I thought the answer was no, so I double checked with a couple of
hardware guys, and the answer is still no.
Sigh...
So Anton, when the PCI controller gets a set of sub-cacheline word
reads from the device, it reads the value from memory once for every
one of those words? ROFL, if so... I can't believe they wouldn't
put caches on the PCI controller for this, at least a one-behind that
snoops the bus :(
> I thought the answer was no, so I double checked with a couple of
> hardware guys, and the answer is still no.
Hi Scott,
Thats a pity, the e100 docs on sourceforge show it can do what we want,
it would be nice if e1000 had this feature too :)
4.2.2 Read Align
The Read Align feature is aimed to enhance performance in cache line
oriented systems. Starting a PCI transaction in these systems on a
non-cache line aligned address may result in low performance. To solve
this performance problem, the controller can be configured to terminate
Transmit DMA cycles on a cache line boundary, and start the next
transaction on a cache line aligned address. This feature is enabled
when the Read Align Enable bit is set in device Configure command
(Section 6.4.2.3, "Configure (010b)").
If this bit is set, the device operates as follows:
* When the device is close to running out of resources on the Transmit
* DMA (in other words, the Transmit FIFO is almost full), it attempts to
* terminate the read transaction on the nearest cache line boundary when
* possible.
* When the arbitration counters feature is enabled (maximum Transmit DMA
* byte count value is set in configuration space), the device switches
* to other pending DMAs on cache line boundary only.
> So Anton, when the PCI controller gets a set of sub-cacheline word
> reads from the device, it reads the value from memory once for every
> one of those words? ROFL, if so... I can't believe they wouldn't
> put caches on the PCI controller for this, at least a one-behind that
> snoops the bus :(
There is a cache in the host bridge and the PCI-PCI bridge. I dont
think we go back to memory for sub cacheline reads.
What I think is happening is that we arent tripping the prefetch logic.
We should take a latency hit for only the first cacheline at which point
the host bridge decides to start prefetching for us. If not then we take
take the latency hit on each transaction.
Anton
From: Anton Blanchard <[email protected]>
Date: Sat, 14 Jun 2003 10:55:34 +1000
What I think is happening is that we arent tripping the prefetch
logic. We should take a latency hit for only the first cacheline
at which point the host bridge decides to start prefetching for
us. If not then we take take the latency hit on each transaction.
It sounds like what happens is that the sub-cacheline word reads
don't trigger the prefetch, but the first PCI read multiple
transaction does.
David S. Miller wrote:
> From: Anton Blanchard <[email protected]>
> Date: Sat, 14 Jun 2003 08:38:41 +1000
>
> This is only worth it if most packets will have the same sized header.
> Networking guys: is this a valid assumption?
>
> Not really... one retransmit and the TCP header size grows
> due to the SACK options.
Yep, but it really doesn't have too many options (sic pun ;))..
i.e. The max the options can add are 40 bytes, speaking
strictly TCP, not IP. This really should fit into one extra
cacheline for most architectures, at most, right?
[The TCP options have to end and the data start on a 32
bit boundary. For established connections, we're
principally talking SACK options and v. likely timestamp.
(Ignoring those egregious benchmark guys who turn everything
useful off ;)). SYNs wont have data in any case.
So its going to grow by (SACK = 8*n + 2)+ (TS = 10) bytes,
with n = number of sack options, with a max of n = 3
if timestamps are enabled. Adding that to the standard
length of 20 bytes, the total len of a TCP header is thus
very likely one of:
20 + [ 0 | 20 |32 | 36] bytes
= 20 | 40 | 52 | 56 bytes.
If cachelines were 64 bytes, we wouldnt be wasting a
whole lot of space if we aligned data start or some
other scheme as was suggested. Even given the larger
cachelines, it might be worth it, or is this totally
not an option (cough,sic ;))?
> I find it truly bletcherous what you're trying to do here.
yep
thanks,
Nivedita
From: Nivedita Singhvi <[email protected]>
Date: Fri, 13 Jun 2003 22:16:22 -0700
Yep, but it really doesn't have too many options (sic pun ;))..
i.e. The max the options can add are 40 bytes, speaking
strictly TCP, not IP. This really should fit into one extra
cacheline for most architectures, at most, right?
It's what the bottom of the header is aligned to, but
we build the packet top to bottom not the other way around.
At 09:18 AM 14/06/2003 +1000, Anton Blanchard wrote:
> > Not really... one retransmit and the TCP header size grows
> > due to the SACK options.
>
>OK scratch that idea.
why not have a performance option that is a tradeoff between optimum
payload size versus efficiency.
unless i misunderstand the problem, you can certainly pad the TCP options
with NOPs ...
> > I find it truly bletcherous what you're trying to do here.
>
>I think so too, but its hard to ignore ~100Mbit/sec in performance.
another option is for the write() path is for instantant-send TCP sockets
to delay the copy_from_user() until the IP+TCP header size is known.
i wouldn't expect the net folks to like that, however ..
cheers,
lincoln.
From: Lincoln Dale <[email protected]>
Date: Sat, 14 Jun 2003 11:52:53 +1000
unless i misunderstand the problem, you can certainly pad the TCP
options with NOPs ...
You may not mangle packet if it is not your's alone.
And every TCP packet is shared with TCP retransmit
queue and therefore would need to be copied before
being mangled.
At 10:41 PM 13/06/2003 -0700, David S. Miller wrote:
> From: Lincoln Dale <[email protected]>
> Date: Sat, 14 Jun 2003 11:52:53 +1000
>
> unless i misunderstand the problem, you can certainly pad the TCP
> options with NOPs ...
>
>You may not mangle packet if it is not your's alone.
>
>And every TCP packet is shared with TCP retransmit
>queue and therefore would need to be copied before
>being mangled.
ok, so lets take this a step further.
can we have the TCP retransmit side take a performance hit if it needs to
realign buffers?
once again, for a "high performance app" requiring gigabit-type speeds, its
probably fair to say that this is mostly in the realm of applications on a
LAN rather than across a WAN or internet.
on a switched LAN, i'd expect TCP retransmissions to be far fewer ...
cheers,
lincoln.
From: Lincoln Dale <[email protected]>
Date: Sat, 14 Jun 2003 15:52:35 +1000
can we have the TCP retransmit side take a performance hit if it needs to
realign buffers?
You don't understand, the person who mangles the packet
must make the copy, not the person not doing the packet
modifications.
for a "high performance app" requiring gigabit-type speeds,
...we probably won't be using ppc64 and e1000 cards, yes, I agree
:-)
Anton, go to the local computer store and pick up some tg3
cards or a bunch of Taiwan specials :-)
Folks, can we remove whatever member of this CC: list creates
bounces that say:
Your message to Linux_news awaits moderator approval
Ok? I can't guess which one it is because these all look
like normal people's email addresses (except possibly the
[email protected] thing, maybe that's the one)
On Fri, Jun 13, 2003 at 11:14:18PM -0700, David S. Miller wrote:
> Folks, can we remove whatever member of this CC: list creates
> bounces that say:
> Your message to Linux_news awaits moderator approval
> Ok? I can't guess which one it is because these all look
> like normal people's email addresses (except possibly the
> [email protected] thing, maybe that's the one)
That's legitimate, but I'm still trying to convince the BKL brigade
that he should have chosen something more eponymous e.g. dhansen. =)
-- wli
On Fri, Jun 13, 2003 at 11:14:18PM -0700, David S. Miller wrote:
>
> Folks, can we remove whatever member of this CC: list creates
> bounces that say:
>
> Your message to Linux_news awaits moderator approval
It's someone subscribed to [email protected] that causes
this.
I've complained to the admin of that mail-news gateway that is barfing
on too many CC: members in the past to not do this, but it doesn't seem
like they are listening...
greg k-h
Hi Greg.
>> Folks, can we remove whatever member of this CC: list creates
>> bounces that say:
>>
>> Your message to Linux_news awaits moderator approval
> It's someone subscribed to [email protected] that
> causes this.
>
> I've complained to the admin of that mail-news gateway that is
> barfing on too many CC: members in the past to not do this, but
> it doesn't seem like they are listening...
If true, why not just unsub the said gateway from L-K and refuse
to resub them until they clean up their act. That's what would
happen with anybody else, after all...
Best wishes from Riley.
---
* Nothing as pretty as a smile, nothing as ugly as a frown.
---
Outgoing mail is certified Virus Free.
Checked by AVG anti-virus system (http://www.grisoft.com).
Version: 6.0.489 / Virus Database: 288 - Release Date: 10-Jun-2003
On Sat, Jun 14, 2003 at 10:08:49AM -0700, Greg KH wrote:
> On Fri, Jun 13, 2003 at 11:14:18PM -0700, David S. Miller wrote:
> >
> > Folks, can we remove whatever member of this CC: list creates
> > bounces that say:
> >
> > Your message to Linux_news awaits moderator approval
>
> It's someone subscribed to [email protected] that causes
> this.
And the offender is: [email protected]
Have fun...
greg k-h
From: Greg KH <[email protected]>
Date: Sat, 14 Jun 2003 10:08:49 -0700
It's someone subscribed to [email protected] that causes
this.
Thanks, I've nuked [email protected], if they resubscribe
without contacting me, I'll block future subscription attempts from
them.
Anton, I think the option described below is intended to cause the adapter
to "get off on a cache line boundary" so when it restarts the DMA it will
be aligned. This is for cases when the adapter has to get off, for exampe
due to FIFO full, etc.
Some adapters would get off on any boundary and that then causes perf
issues when the DMA is restarted.
This is a good option, but I don't think it addresses what we need here as
the host needs to ensure a DMA starts on a cache line.
Different adapter anyway, but I am just pointing out that even if e1000
had this it would not be the solution.
Anton Blanchard <[email protected]> on 06/13/2003 07:03:42 PM
To: "Feldman, Scott" <[email protected]>
cc: "David S. Miller" <[email protected]>,
[email protected], Herman Dierks/Austin/IBM@IBMUS,
[email protected], [email protected], Nancy J
Milliner/Austin/IBM@IBMUS, Ricardo C Gonzalez/Austin/IBM@ibmus,
Brian Twichell/Austin/IBM@IBMUS, [email protected]
Subject: Re: e1000 performance hack for ppc64 (Power4)
> I thought the answer was no, so I double checked with a couple of
> hardware guys, and the answer is still no.
Hi Scott,
Thats a pity, the e100 docs on sourceforge show it can do what we want,
it would be nice if e1000 had this feature too :)
4.2.2 Read Align
The Read Align feature is aimed to enhance performance in cache line
oriented systems. Starting a PCI transaction in these systems on a
non-cache line aligned address may result in low performance. To solve
this performance problem, the controller can be configured to terminate
Transmit DMA cycles on a cache line boundary, and start the next
transaction on a cache line aligned address. This feature is enabled
when the Read Align Enable bit is set in device Configure command
(Section 6.4.2.3, "Configure (010b)").
If this bit is set, the device operates as follows:
* When the device is close to running out of resources on the Transmit
* DMA (in other words, the Transmit FIFO is almost full), it attempts to
* terminate the read transaction on the nearest cache line boundary when
* possible.
* When the arbitration counters feature is enabled (maximum Transmit DMA
* byte count value is set in configuration space), the device switches
* to other pending DMAs on cache line boundary only.
Look folks, we run 40 to 48 GigE adapters in a p690 32 way on AIX and
they basically all run at full speed so let me se you try that on most of
these other boxes you are talking about. Same adapter, same hardware
logic.
I have also seen what many of these other boxes you talk about do when data
or structures are not aligned on 64 bit boundaries.
The PPC HW does not have those 64bit alignment issues. So, each machine
has some warts. Have yet to see a perfect one.
If you want a lot of PCI adapters on a box, it takes a number of bridge
chips and other IO links to do that.
Memory controllers like to deal with cache lines.
For larger packets, like jumbo frames or large send (TSO), the few added
DMA's is not an issue as the packets are so large the DMA soon get aligned
and are not an issue. With TSO being the default, the small packet case
becomes less important anyway. Its more an issue on 2.4 where TSO is not
provided. We also want this to run well if someone does not want to use
TSO.
Its only the MTU 1500 case with non-TSO that we are discussing here so
copying a few bytes is really not a big deal as the data is already in
cache from copying into kernel. If it lets the adapter run at speed, thats
what customers want and what we need.
Granted, if the HW could deal with this we would not have to, but thats not
the case today so I want to spend a few CPU cycles to get best performance.
Again, if this is not done on other platforms, I don't understand why you
care.
If we have to do this for PPC port, fine. I have not seen any of you
suggest a better solution that works and will not be a worse hack to TCP or
other code. Anton tried various other ideas before we fell back to doing
it the same way we did this in AIX. This code is very localized and is
only used by platforms that need it. Thus I don't see the big issue here.
Herman
"David S. Miller" <[email protected]> on 06/14/2003 01:08:50 AM
To: [email protected]
cc: [email protected], [email protected], Herman
Dierks/Austin/IBM@IBMUS, [email protected], [email protected],
[email protected], Nancy J Milliner/Austin/IBM@IBMUS,
Ricardo C Gonzalez/Austin/IBM@ibmus, Brian
Twichell/Austin/IBM@IBMUS, [email protected]
Subject: Re: e1000 performance hack for ppc64 (Power4)
From: Lincoln Dale <[email protected]>
Date: Sat, 14 Jun 2003 15:52:35 +1000
can we have the TCP retransmit side take a performance hit if it needs
to
realign buffers?
You don't understand, the person who mangles the packet
must make the copy, not the person not doing the packet
modifications.
for a "high performance app" requiring gigabit-type speeds,
...we probably won't be using ppc64 and e1000 cards, yes, I agree
:-)
Anton, go to the local computer store and pick up some tg3
cards or a bunch of Taiwan specials :-)
From: "Herman Dierks" <[email protected]>
Date: Sun, 15 Jun 2003 09:40:41 -0500
With TSO being the default, the small packet case
becomes less important anyway.
This is a very narrow and unrealistic view of the situation.
Every third packet your system will process for any connection will be
an ACK, a small packet. Most database and web and database
transactions happen using small packets for the transaction request.
Look, if you're gonna sit here and just rant justifying this bogus
behavior of your hardware, it is likely to go in one ear and out the
other. Nobody wants to hear excuses. :)
The fact is, this system handles sub-cacheline reads inefficiently
even if a sequences of transactions are consequetive and to the same
cache line and no coherency transactions occur to that cache line.
That is dumb, and there is no arguing around this. You would be
sensible to realize this, and accept it whilst others try to help you
find a solution for your problem.
Herman Dierks wrote:
> Look folks, we run 40 to 48 GigE adapters in a p690 32 way on AIX and
> they basically all run at full speed so let me se you try that on most of
> these other boxes you are talking about. Same adapter, same hardware
> logic.
FWIW, I think that's pretty cool. Not easy to do. :)
> For larger packets, like jumbo frames or large send (TSO), the few added
> DMA's is not an issue as the packets are so large the DMA soon get aligned
> and are not an issue. With TSO being the default, the small packet case
> becomes less important anyway. Its more an issue on 2.4 where TSO is not
> provided. We also want this to run well if someone does not want to use
> TSO.
Slightly off-topic, but TSO being enabled and TSO being used are
two different things, right? Ditto jumbo frames..How often is this
the actual env in real world situations? I'm concerned that this
is more typical of development testing/performance testing
environments.
> Its only the MTU 1500 case with non-TSO that we are discussing here so
Which still is the pretty important case, I think..
> copying a few bytes is really not a big deal as the data is already in
> cache from copying into kernel. If it lets the adapter run at speed, thats
> what customers want and what we need.
Yep.
> Granted, if the HW could deal with this we would not have to, but thats not
> the case today so I want to spend a few CPU cycles to get best performance.
> Again, if this is not done on other platforms, I don't understand why you
> care.
Still would be nice to put in the best solution possible, i.e.
address it for the broadest set of affected pieces and minimizing
the impact..
> If we have to do this for PPC port, fine. I have not seen any of you
Hope it doesn't have to come to that..It would be nice to
see it in the mainline kernel. Regardless of platform, distro, etc..
these users are still people who are taking the time, the effort to
adopt Linux and sometimes in environments and situations that are
pretty critical. Change and innovation are difficult activities
to engage in some places, and anything we can do to make this
a no-brainer solution for them, and their decisions shine, thats
gotta be worth something to go the extra mile for :)
thanks,
Nivedita
Herman wrote:
> Its only the MTU 1500 case with non-TSO that we are
> discussing here so copying a few bytes is really not a big
> deal as the data is already in cache from copying into
> kernel. If it lets the adapter run at speed, thats what
> customers want and what we need. Granted, if the HW could
> deal with this we would not have to, but thats not the case
> today so I want to spend a few CPU cycles to get best
> performance. Again, if this is not done on other platforms, I
> don't understand why you care.
I care because adding the arch-specific hack creates a maintenance issue
for me.
-scott
On Mon, 2003-06-16 at 11:21, Feldman, Scott wrote:
> Herman wrote:
> > Its only the MTU 1500 case with non-TSO that we are
> > discussing here so copying a few bytes is really not a big
> > deal as the data is already in cache from copying into
> > kernel. If it lets the adapter run at speed, thats what
> > customers want and what we need. Granted, if the HW could
> > deal with this we would not have to, but thats not the case
> > today so I want to spend a few CPU cycles to get best
> > performance. Again, if this is not done on other platforms, I
> > don't understand why you care.
>
> I care because adding the arch-specific hack creates a maintenance issue
> for
Scott, would you be pleased if something was implemented out of the
driver, in generic net code? Something that all the drivers could use,
even if nothing but e1000 used it for now.
--
Dave Hansen
[email protected]
> Scott, would you be pleased if something was implemented out
> of the driver, in generic net code? Something that all the
> drivers could use, even if nothing but e1000 used it for now.
I suppose the driver could unconditionally call something like
skb_realign_for_broken_hw, which is a nop on non-broken archs, but would
it make more sense to not have the driver mess with the skb at all?
-scott