2006-08-11 17:03:44

by linas

[permalink] [raw]
Subject: [PATCH 0/4]: powerpc/cell spidernet ethernet driver fixes


The following series of patches implement some fixes and performance
improvements for the Spedernet ethernet device driver. The high point
of the patch series is some code to implement a low-watermark interrupt
on the transmit queue. The bundle of patches raises transmit performance
from some embarassingly low value to a reasonable 730 Megabits per
second for 1500 byte packets.

Please note that the spider is an ethernet controller that is
integrated into the south bridge, and is thus available only on
Cell-based platforms.

These have been well-tested over the last few weeks. Please apply.

--linas


2006-08-11 17:06:33

by linas

[permalink] [raw]
Subject: [PATCH 1/4]: powerpc/cell spidernet burst alignment patch.



This patch increases the Burst Address alignment from 64 to 1024 in the
Spidernet driver. This improves transmit performance for large packets
from about 100Mbps to 300-400Mbps.

Signed-off-by: James K Lewis <[email protected]>
Signed-off-by: Linas Vepstas <[email protected]>
Cc: Utz Bacher <[email protected]>
Cc: Jens Osterkamp <[email protected]>
Cc: Arnd Bergmann <[email protected]>

----
drivers/net/spider_net.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux-2.6.18-rc3-mm2/drivers/net/spider_net.h
===================================================================
--- linux-2.6.18-rc3-mm2.orig/drivers/net/spider_net.h 2006-08-07 14:37:10.000000000 -0500
+++ linux-2.6.18-rc3-mm2/drivers/net/spider_net.h 2006-08-11 11:09:57.000000000 -0500
@@ -209,7 +209,7 @@ extern char spider_net_driver_name[];
#define SPIDER_NET_DMA_RX_FEND_VALUE 0x00030003
/* to set TX_DMA_EN */
#define SPIDER_NET_TX_DMA_EN 0x80000000
-#define SPIDER_NET_GDTDCEIDIS 0x00000002
+#define SPIDER_NET_GDTDCEIDIS 0x00000302
#define SPIDER_NET_DMA_TX_VALUE SPIDER_NET_TX_DMA_EN | \
SPIDER_NET_GDTDCEIDIS
#define SPIDER_NET_DMA_TX_FEND_VALUE 0x00030003

2006-08-11 17:08:26

by linas

[permalink] [raw]
Subject: [PATCH 2/4]: powerpc/cell spidernet low watermark patch.



Implement basic low-watermark support for the transmit queue.

The basic idea of a low-watermark interrupt is as follows.
The device driver queues up a bunch of packets for the hardware
to transmit, and then kicks he hardware to get it started.
As the hardware drains the queue of pending, untransmitted
packets, the device driver will want to know when the queue
is almost empty, so that it can queue some more packets.

This is accomplished by setting the DESCR_TXDESFLG flag in
one of the packets. When the hardware sees this flag, it will
interrupt the device driver. Because this flag is on a fixed
packet, rather than at fixed location in the queue, the
code below needs to move the flag as more packets are
queued up. This implementation attempts to keep te flag
at about 3/4's of the way into the queue.

This patch boosts driver performance from about
300-400Mbps for 1500 byte packets, to about 710-740Mbps.


Signed-off-by: Linas Vepstas <[email protected]>
Signed-off-by: James K Lewis <[email protected]>
Cc: Utz Bacher <[email protected]>
Cc: Jens Osterkamp <[email protected]>
Cc: Arnd Bergmann <[email protected]>

----
drivers/net/spider_net.c | 56 ++++++++++++++++++++++++++++++++++++++++++-----
drivers/net/spider_net.h | 6 +++--
2 files changed, 55 insertions(+), 7 deletions(-)

Index: linux-2.6.18-rc3-mm2/drivers/net/spider_net.c
===================================================================
--- linux-2.6.18-rc3-mm2.orig/drivers/net/spider_net.c 2006-08-07 14:39:38.000000000 -0500
+++ linux-2.6.18-rc3-mm2/drivers/net/spider_net.c 2006-08-11 11:23:24.000000000 -0500
@@ -700,6 +700,39 @@ spider_net_release_tx_descr(struct spide
dev_kfree_skb_any(skb);
}

+static void
+spider_net_set_low_watermark(struct spider_net_card *card)
+{
+ int status;
+ int cnt=0;
+ int i;
+ struct spider_net_descr *descr = card->tx_chain.tail;
+
+ /* Measure the length of the queue. */
+ while (descr != card->tx_chain.head) {
+ status = descr->dmac_cmd_status & SPIDER_NET_DESCR_NOT_IN_USE;
+ if (status == SPIDER_NET_DESCR_NOT_IN_USE)
+ break;
+ descr = descr->next;
+ cnt++;
+ }
+ if (cnt == 0)
+ return;
+
+ /* Set low-watermark 3/4th's of the way into the queue. */
+ descr = card->tx_chain.tail;
+ cnt = (cnt*3)/4;
+ for (i=0;i<cnt; i++)
+ descr = descr->next;
+
+ /* Set the new watermark, clear the old wtermark */
+ descr->dmac_cmd_status |= SPIDER_NET_DESCR_TXDESFLG;
+ if (card->low_watermark && card->low_watermark != descr)
+ card->low_watermark->dmac_cmd_status =
+ card->low_watermark->dmac_cmd_status & ~SPIDER_NET_DESCR_TXDESFLG;
+ card->low_watermark = descr;
+}
+
/**
* spider_net_release_tx_chain - processes sent tx descriptors
* @card: adapter structure
@@ -717,6 +750,7 @@ spider_net_release_tx_chain(struct spide
{
struct spider_net_descr_chain *chain = &card->tx_chain;
int status;
+ int rc=0;

spider_net_read_reg(card, SPIDER_NET_GDTDMACCNTR);

@@ -729,8 +763,10 @@ spider_net_release_tx_chain(struct spide
break;

case SPIDER_NET_DESCR_CARDOWNED:
- if (!brutal)
- return 1;
+ if (!brutal) {
+ rc = 1;
+ goto done;
+ }
/* fallthrough, if we release the descriptors
* brutally (then we don't care about
* SPIDER_NET_DESCR_CARDOWNED) */
@@ -747,12 +783,15 @@ spider_net_release_tx_chain(struct spide

default:
card->netdev_stats.tx_dropped++;
- return 1;
+ rc = 1;
+ goto done;
}
spider_net_release_tx_descr(card);
}
-
- return 0;
+done:
+ if (rc == 1)
+ spider_net_set_low_watermark(card);
+ return rc;
}

/**
@@ -1453,6 +1492,10 @@ spider_net_interrupt(int irq, void *ptr,
spider_net_rx_irq_off(card);
netif_rx_schedule(netdev);
}
+ if (status_reg & SPIDER_NET_TXINT ) {
+ spider_net_cleanup_tx_ring(card);
+ netif_wake_queue(netdev);
+ }

if (status_reg & SPIDER_NET_ERRINT )
spider_net_handle_error_irq(card, status_reg);
@@ -1615,6 +1658,9 @@ spider_net_open(struct net_device *netde
card->descr,
PCI_DMA_TODEVICE, tx_descriptors))
goto alloc_tx_failed;
+
+ card->low_watermark = NULL;
+
if (spider_net_init_chain(card, &card->rx_chain,
card->descr + tx_descriptors,
PCI_DMA_FROMDEVICE, rx_descriptors))
Index: linux-2.6.18-rc3-mm2/drivers/net/spider_net.h
===================================================================
--- linux-2.6.18-rc3-mm2.orig/drivers/net/spider_net.h 2006-08-11 11:09:57.000000000 -0500
+++ linux-2.6.18-rc3-mm2/drivers/net/spider_net.h 2006-08-11 11:19:47.000000000 -0500
@@ -47,7 +47,7 @@ extern char spider_net_driver_name[];
#define SPIDER_NET_TX_DESCRIPTORS_MIN 16
#define SPIDER_NET_TX_DESCRIPTORS_MAX 512

-#define SPIDER_NET_TX_TIMER 20
+#define SPIDER_NET_TX_TIMER (HZ/5)

#define SPIDER_NET_RX_CSUM_DEFAULT 1

@@ -209,7 +209,7 @@ extern char spider_net_driver_name[];
#define SPIDER_NET_DMA_RX_FEND_VALUE 0x00030003
/* to set TX_DMA_EN */
#define SPIDER_NET_TX_DMA_EN 0x80000000
-#define SPIDER_NET_GDTDCEIDIS 0x00000302
+#define SPIDER_NET_GDTDCEIDIS 0x00000300
#define SPIDER_NET_DMA_TX_VALUE SPIDER_NET_TX_DMA_EN | \
SPIDER_NET_GDTDCEIDIS
#define SPIDER_NET_DMA_TX_FEND_VALUE 0x00030003
@@ -349,6 +349,7 @@ enum spider_net_int2_status {
#define SPIDER_NET_DESCR_FORCE_END 0x50000000 /* used in rx and tx */
#define SPIDER_NET_DESCR_CARDOWNED 0xA0000000 /* used in rx and tx */
#define SPIDER_NET_DESCR_NOT_IN_USE 0xF0000000
+#define SPIDER_NET_DESCR_TXDESFLG 0x00800000

struct spider_net_descr {
/* as defined by the hardware */
@@ -424,6 +425,7 @@ struct spider_net_card {

struct spider_net_descr_chain tx_chain;
struct spider_net_descr_chain rx_chain;
+ struct spider_net_descr *low_watermark;

struct net_device_stats netdev_stats;

2006-08-11 17:11:21

by linas

[permalink] [raw]
Subject: [PATCH 4/4]: powerpc/cell spidernet ethtool -i version number info.



This patch adds version information as reported by
ethtool -i to the Spidernet driver.

Signed-off-by: James K Lewis <[email protected]>
Signed-off-by: Linas Vepstas <[email protected]>
Cc: Utz Bacher <[email protected]>
Cc: Jens Osterkamp <[email protected]>

----
drivers/net/spider_net.c | 3 +++
drivers/net/spider_net.h | 2 ++
drivers/net/spider_net_ethtool.c | 2 +-
3 files changed, 6 insertions(+), 1 deletion(-)

Index: linux-2.6.18-rc3-mm2/drivers/net/spider_net.c
===================================================================
--- linux-2.6.18-rc3-mm2.orig/drivers/net/spider_net.c 2006-08-11 11:34:16.000000000 -0500
+++ linux-2.6.18-rc3-mm2/drivers/net/spider_net.c 2006-08-11 11:38:48.000000000 -0500
@@ -55,6 +55,7 @@ MODULE_AUTHOR("Utz Bacher <utz.bacher@de
"<[email protected]>");
MODULE_DESCRIPTION("Spider Southbridge Gigabit Ethernet driver");
MODULE_LICENSE("GPL");
+MODULE_VERSION(VERSION);

static int rx_descriptors = SPIDER_NET_RX_DESCRIPTORS_DEFAULT;
static int tx_descriptors = SPIDER_NET_TX_DESCRIPTORS_DEFAULT;
@@ -2293,6 +2294,8 @@ static struct pci_driver spider_net_driv
*/
static int __init spider_net_init(void)
{
+ printk("spidernet Version %s.\n",VERSION);
+
if (rx_descriptors < SPIDER_NET_RX_DESCRIPTORS_MIN) {
rx_descriptors = SPIDER_NET_RX_DESCRIPTORS_MIN;
pr_info("adjusting rx descriptors to %i.\n", rx_descriptors);
Index: linux-2.6.18-rc3-mm2/drivers/net/spider_net.h
===================================================================
--- linux-2.6.18-rc3-mm2.orig/drivers/net/spider_net.h 2006-08-11 11:19:47.000000000 -0500
+++ linux-2.6.18-rc3-mm2/drivers/net/spider_net.h 2006-08-11 11:38:48.000000000 -0500
@@ -24,6 +24,8 @@
#ifndef _SPIDER_NET_H
#define _SPIDER_NET_H

+#define VERSION "1.1 A"
+
#include "sungem_phy.h"

extern int spider_net_stop(struct net_device *netdev);
Index: linux-2.6.18-rc3-mm2/drivers/net/spider_net_ethtool.c
===================================================================
--- linux-2.6.18-rc3-mm2.orig/drivers/net/spider_net_ethtool.c 2006-06-17 20:49:35.000000000 -0500
+++ linux-2.6.18-rc3-mm2/drivers/net/spider_net_ethtool.c 2006-08-11 11:38:48.000000000 -0500
@@ -55,7 +55,7 @@ spider_net_ethtool_get_drvinfo(struct ne
/* clear and fill out info */
memset(drvinfo, 0, sizeof(struct ethtool_drvinfo));
strncpy(drvinfo->driver, spider_net_driver_name, 32);
- strncpy(drvinfo->version, "0.1", 32);
+ strncpy(drvinfo->version, VERSION, 32);
strcpy(drvinfo->fw_version, "no information");
strncpy(drvinfo->bus_info, pci_name(card->pdev), 32);
}

2006-08-11 17:10:49

by linas

[permalink] [raw]
Subject: [PATCH 3/4]: powerpc/cell spidernet stop error printing patch.


Turn off mis-interpretation of the queue-empty interrupt
status bit as an error. This bit is set as a part of
the previous low-watermark patch.

Signed-off-by: Linas Vepstas <[email protected]>
Signed-off-by: James K Lewis <[email protected]>
Cc: Utz Bacher <[email protected]>
Cc: Jens Osterkamp <[email protected]>
Cc: Arnd Bergmann <[email protected]>

----
drivers/net/spider_net.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)

Index: linux-2.6.18-rc3-mm2/drivers/net/spider_net.c
===================================================================
--- linux-2.6.18-rc3-mm2.orig/drivers/net/spider_net.c 2006-08-11 11:23:24.000000000 -0500
+++ linux-2.6.18-rc3-mm2/drivers/net/spider_net.c 2006-08-11 11:34:16.000000000 -0500
@@ -1275,12 +1275,15 @@ spider_net_handle_error_irq(struct spide
case SPIDER_NET_PHYINT:
case SPIDER_NET_GMAC2INT:
case SPIDER_NET_GMAC1INT:
- case SPIDER_NET_GIPSINT:
case SPIDER_NET_GFIFOINT:
case SPIDER_NET_DMACINT:
case SPIDER_NET_GSYSINT:
break; */

+ case SPIDER_NET_GIPSINT:
+ show_error = 0;
+ break;
+
case SPIDER_NET_GPWOPCMPINT:
/* PHY write operation completed */
show_error = 0;
@@ -1339,9 +1342,10 @@ spider_net_handle_error_irq(struct spide
case SPIDER_NET_GDTDCEINT:
/* chain end. If a descriptor should be sent, kick off
* tx dma
- if (card->tx_chain.tail == card->tx_chain.head)
+ if (card->tx_chain.tail != card->tx_chain.head)
spider_net_kick_tx_dma(card);
- show_error = 0; */
+ */
+ show_error = 0;
break;

/* case SPIDER_NET_G1TMCNTINT: not used. print a message */
@@ -1455,8 +1459,9 @@ spider_net_handle_error_irq(struct spide
}

if ((show_error) && (netif_msg_intr(card)))
- pr_err("Got error interrupt, GHIINT0STS = 0x%08x, "
+ pr_err("Got error interrupt on %s, GHIINT0STS = 0x%08x, "
"GHIINT1STS = 0x%08x, GHIINT2STS = 0x%08x\n",
+ card->netdev->name,
status_reg, error_reg1, error_reg2);

/* clear interrupt sources */

2006-08-11 17:42:59

by Joel Schopp

[permalink] [raw]
Subject: Re: [PATCH 0/4]: powerpc/cell spidernet ethernet driver fixes

Linas Vepstas wrote:
> The following series of patches implement some fixes and performance
> improvements for the Spedernet ethernet device driver. The high point
> of the patch series is some code to implement a low-watermark interrupt
> on the transmit queue. The bundle of patches raises transmit performance
> from some embarassingly low value to a reasonable 730 Megabits per
> second for 1500 byte packets.
>
> Please note that the spider is an ethernet controller that is
> integrated into the south bridge, and is thus available only on
> Cell-based platforms.
>
> These have been well-tested over the last few weeks. Please apply.

With these patches the spidernet driver performance goes from bad to usable. They are all
good changes. I'd expect some more bottlnecks to be identified now that these are
cleared. Maybe Jim and Linas can get the driver performance up from usable to good next.

Acked-by: Joel Schopp <[email protected]>

2006-08-11 17:45:13

by Sam Ravnborg

[permalink] [raw]
Subject: Re: [PATCH 0/4]: powerpc/cell spidernet ethernet driver fixes

On Fri, Aug 11, 2006 at 12:03:37PM -0500, Linas Vepstas wrote:
>
> The following series of patches implement some fixes and performance
> improvements for the Spedernet ethernet device driver. The high point
> of the patch series is some code to implement a low-watermark interrupt
> on the transmit queue. The bundle of patches raises transmit performance
> from some embarassingly low value to a reasonable 730 Megabits per
> second for 1500 byte packets.
>
> Please note that the spider is an ethernet controller that is
> integrated into the south bridge, and is thus available only on
> Cell-based platforms.
>
> These have been well-tested over the last few weeks. Please apply.
Hi Linas.
Just noticed a nit-pick detail.
The general rule is to add your Signed-off-by: at the bottom of the
patch, so the top-most Signed-of-by: is also the original author whereas
the last Signed-of-by: is the one that added this patch to the kernel.
Likewise you add Cc: before your Signed-off-by: line.

See patches from for example Andrew Morton for examples.

Sam

2006-08-11 18:04:13

by Olof Johansson

[permalink] [raw]
Subject: Re: [PATCH 4/4]: powerpc/cell spidernet ethtool -i version number info.

On Fri, Aug 11, 2006 at 12:11:17PM -0500, Linas Vepstas wrote:

> This patch adds version information as reported by
> ethtool -i to the Spidernet driver.

Why does a driver that's in the mainline kernel need to have a version
number besides the kernel version?

I can understand it for drivers like e1000 that Intel maintain outside
of the kernel as well. But spidernet is a fully mainline maintained
driver, right?


-Olof

2006-08-11 19:31:08

by linas

[permalink] [raw]
Subject: Re: [PATCH 0/4]: powerpc/cell spidernet ethernet driver fixes

On Fri, Aug 11, 2006 at 07:44:39PM +0200, Sam Ravnborg wrote:
> >
> > These have been well-tested over the last few weeks. Please apply.
> Hi Linas.
> Just noticed a nit-pick detail.
> The general rule is to add your Signed-off-by: at the bottom of the
> patch, so the top-most Signed-of-by: is also the original author whereas
> the last Signed-of-by: is the one that added this patch to the kernel.

I put my name at the top when I was the primary author.
I put Jim's name at the top when he was the primary author.

Both names are there because I sat in Jim's office and used
his keyboard. I got him to compile and run the tests on
his hardware, and we'd then debate the results.

> Likewise you add Cc: before your Signed-off-by: line.

The patches I ave from akpm have the CC's after the
signed-off by line, not before.

--linas

2006-08-11 19:46:54

by linas

[permalink] [raw]
Subject: Re: [PATCH 4/4]: powerpc/cell spidernet ethtool -i version number info.


Hi Olof,

Olof Johansson <[email protected]> writes:
> On Fri, Aug 11, 2006 at 12:11:17PM -0500, Linas Vepstas wrote:
>
> > This patch adds version information as reported by
> > ethtool -i to the Spidernet driver.
>
> Why does a driver that's in the mainline kernel need to have a version
> number besides the kernel version?

I'll let Jim be the primary defender. From what I can tell, "that's the
way its done". For example:

linux-2.6.18-rc3-mm2 $ grep MODULE_VERSION */*/*.c |wc
164 245 9081

> I can understand it for drivers like e1000 that Intel maintain outside
> of the kernel as well. But spidernet is a fully mainline maintained
> driver, right?

Yes, the spidernet is a Linux-kernel only driver.

--linas

p.s. very strange, but I did not see your original email;
only saw Jim's reply.


2006-08-11 20:28:12

by Arnd Bergmann

[permalink] [raw]
Subject: Re: [PATCH 0/4]: powerpc/cell spidernet ethernet driver fixes

On Friday 11 August 2006 21:31, Linas Vepstas wrote:
> I put my name at the top when I was the primary author.
> I put Jim's name at the top when he was the primary author.
>
> Both names are there because I sat in Jim's office and used
> his keyboard. I got him to compile and run the tests on
> his hardware, and we'd then debate the results.

When the patch gets added to a git repository, they end up
having your name on it, because the author is determined
from the person who sent the patch.

For the patches where Jim is the main author, you should put a

From: James K Lewis <[email protected]>

into the first line of the email body. That will make the
scripts do the right thing. The order of the Signed-off-by:
lines is used is independant from authorship and should
list the name of the submitter last.

Arnd <><

2006-08-15 19:10:21

by Olof Johansson

[permalink] [raw]
Subject: Re: [PATCH 4/4]: powerpc/cell spidernet ethtool -i version number info.

On Fri, Aug 11, 2006 at 01:50:19PM -0500, James K Lewis wrote:
> Hi Olof,
>
> There are several reasons why an Ethernet driver should have an up to
> date version number:
>
> 1. Customers like to see they are really getting a new version.
>
> 2. It makes it easier for support personnel (me in this case) to see which
> driver they have. Sure, sometimes I can talk them thru doing a "sum" on
> the .ko and all that, but why not just use the version number? That's what
> it is for. And no, you can't just assume they have the version that came
> with the kernel they are running. It doesn't work that way.
>
> 3. It makes bug reporting easier.
>
> 4. I have already run into too many problems and wasted too much time
> working with drivers when the number was NOT getting updated.

Thanks for the info, Jim.

Sounds like it's most useful if a customer (or distro) takes the driver
out of the tree and run it with a different kernel, i.e. when kernel
and driver versions no longer go together. Makes sense.


-Olof

2006-08-16 00:29:24

by Michael Ellerman

[permalink] [raw]
Subject: Re: [PATCH 4/4]: powerpc/cell spidernet ethtool -i version number info.

On Tue, 2006-08-15 at 14:05 -0500, Olof Johansson wrote:
> On Fri, Aug 11, 2006 at 01:50:19PM -0500, James K Lewis wrote:
> > Hi Olof,
> >
> > There are several reasons why an Ethernet driver should have an up to
> > date version number:
> >
> > 1. Customers like to see they are really getting a new version.
> >
> > 2. It makes it easier for support personnel (me in this case) to see which
> > driver they have. Sure, sometimes I can talk them thru doing a "sum" on
> > the .ko and all that, but why not just use the version number? That's what
> > it is for. And no, you can't just assume they have the version that came
> > with the kernel they are running. It doesn't work that way.
> >
> > 3. It makes bug reporting easier.
> >
> > 4. I have already run into too many problems and wasted too much time
> > working with drivers when the number was NOT getting updated.
>
> Thanks for the info, Jim.
>
> Sounds like it's most useful if a customer (or distro) takes the driver
> out of the tree and run it with a different kernel, i.e. when kernel
> and driver versions no longer go together. Makes sense.

It only makes sense in addition to the kernel version number (which in
some cases can be meaningless), plus any distro and/or local patches,
plus the kernel config.

Without all that information you don't really know what you're talking
about, because any one of the many interfaces between the driver and the
core kernel may have changed.

So in practice I find it's much simpler to just get the exact source
that they're running, rather than trying to guess based on version
numbers. But that's just me :)

cheers

--
Michael Ellerman
IBM OzLabs

wwweb: http://michael.ellerman.id.au
phone: +61 2 6212 1183 (tie line 70 21183)

We do not inherit the earth from our ancestors,
we borrow it from our children. - S.M.A.R.T Person


Attachments:
signature.asc (191.00 B)
This is a digitally signed message part

2006-08-16 16:19:04

by linas

[permalink] [raw]
Subject: [PATCH 1/2]: powerpc/cell spidernet bottom half


Please apply and forward upstream. This patch requires the previous
sequence of 4 spidernet patches to be applied.

--linas


The recent set of low-waterark patches for the spider result in a
significant amount of computing being done in an interrupt context.
This patch moves this to a "bottom half" aka work queue, so that
the code runs in a normal kernel context. Curiously, this seems to
result in a performance boost of about 5% for large packets.

Signed-off-by: Linas Vepstas <[email protected]>
Cc: Utz Bacher <[email protected]>
Cc: Jens Osterkamp <[email protected]>

----
drivers/net/spider_net.c | 21 +++++++++++++++------
drivers/net/spider_net.h | 4 +++-
2 files changed, 18 insertions(+), 7 deletions(-)

Index: linux-2.6.18-rc3-mm2/drivers/net/spider_net.c
===================================================================
--- linux-2.6.18-rc3-mm2.orig/drivers/net/spider_net.c 2006-08-15 14:25:50.000000000 -0500
+++ linux-2.6.18-rc3-mm2/drivers/net/spider_net.c 2006-08-15 14:28:56.000000000 -0500
@@ -883,9 +883,10 @@ out:
* spider_net_cleanup_tx_ring - cleans up the TX ring
* @card: card structure
*
- * spider_net_cleanup_tx_ring is called by the tx_timer (as we don't use
- * interrupts to cleanup our TX ring) and returns sent packets to the stack
- * by freeing them
+ * spider_net_cleanup_tx_ring is called by either the tx_timer
+ * or from a work-queue scheduled by the tx-empty interrupt.
+ * This routine releases resources associted with transmitted
+ * packets, including updating the queue tail pointer.
*/
static void
spider_net_cleanup_tx_ring(struct spider_net_card *card)
@@ -895,12 +896,20 @@ spider_net_cleanup_tx_ring(struct spider
spin_lock_irqsave(&card->tx_chain.lock, flags);

if ((spider_net_release_tx_chain(card, 0) != 0) &&
- (card->netdev->flags & IFF_UP))
+ (card->netdev->flags & IFF_UP)) {
spider_net_kick_tx_dma(card);
+ netif_wake_queue(card->netdev);
+ }

spin_unlock_irqrestore(&card->tx_chain.lock, flags);
}

+static void
+spider_net_tx_cleanup_task(void * data)
+{
+ spider_net_cleanup_tx_ring((struct spider_net_card *) data);
+}
+
/**
* spider_net_do_ioctl - called for device ioctls
* @netdev: interface device structure
@@ -1499,8 +1508,7 @@ spider_net_interrupt(int irq, void *ptr,
netif_rx_schedule(netdev);
}
if (status_reg & SPIDER_NET_TXINT ) {
- spider_net_cleanup_tx_ring(card);
- netif_wake_queue(netdev);
+ schedule_work(&card->tx_cleanup_task);
}

if (status_reg & SPIDER_NET_ERRINT )
@@ -2117,6 +2125,7 @@ spider_net_alloc_card(void)
card->netdev = netdev;
card->msg_enable = SPIDER_NET_DEFAULT_MSG;
INIT_WORK(&card->tx_timeout_task, spider_net_tx_timeout_task, netdev);
+ INIT_WORK(&card->tx_cleanup_task, spider_net_tx_cleanup_task, card);
init_waitqueue_head(&card->waitq);
atomic_set(&card->tx_timeout_task_counter, 0);

Index: linux-2.6.18-rc3-mm2/drivers/net/spider_net.h
===================================================================
--- linux-2.6.18-rc3-mm2.orig/drivers/net/spider_net.h 2006-08-15 14:25:50.000000000 -0500
+++ linux-2.6.18-rc3-mm2/drivers/net/spider_net.h 2006-08-15 14:28:56.000000000 -0500
@@ -24,7 +24,7 @@
#ifndef _SPIDER_NET_H
#define _SPIDER_NET_H

-#define VERSION "1.1 A"
+#define VERSION "1.1 B"

#include "sungem_phy.h"

@@ -441,6 +441,8 @@ struct spider_net_card {
atomic_t tx_timeout_task_counter;
wait_queue_head_t waitq;

+ struct work_struct tx_cleanup_task;
+
/* for ethtool */
int msg_enable;

2006-08-16 16:30:53

by Jeff Garzik

[permalink] [raw]
Subject: Re: [PATCH 1/2]: powerpc/cell spidernet bottom half

Linas Vepstas wrote:
> Please apply and forward upstream. This patch requires the previous
> sequence of 4 spidernet patches to be applied.
>
> --linas
>
>
> The recent set of low-waterark patches for the spider result in a
> significant amount of computing being done in an interrupt context.
> This patch moves this to a "bottom half" aka work queue, so that
> the code runs in a normal kernel context. Curiously, this seems to
> result in a performance boost of about 5% for large packets.
>
> Signed-off-by: Linas Vepstas <[email protected]>
> Cc: Utz Bacher <[email protected]>
> Cc: Jens Osterkamp <[email protected]>

Let's not reinvented NAPI, shall we...

Jeff



2006-08-16 20:31:11

by linas

[permalink] [raw]
Subject: Re: [PATCH 1/2]: powerpc/cell spidernet bottom half

On Wed, Aug 16, 2006 at 12:30:29PM -0400, Jeff Garzik wrote:
> Linas Vepstas wrote:
> >
> >The recent set of low-waterark patches for the spider result in a
>
> Let's not reinvented NAPI, shall we...

??

I was under the impression that NAPI was for the receive side only.
This round of patches were for the transmit queue.

Let me describe the technical problem; perhaps there's some other
solution for it?

The default socket size seems to be 128KB; (cat
/proc/sys/net/core/wmem_default) if a user application
writes more than 128 KB to a socket, the app is blocked by the
kernel till there's room in the socket for more. At gigabit speeds,
a network card can drain 128KB in about a millisecond, or about
four times a jiffy (assuming HZ=250). If the network card isn't
generaing interrupts, (and there are no other interrupts flying
around) then the tcp stack only wakes up once a jiffy, and so
the user app is scheduled only once a jiffy. Thus, the max
bandwidth that the app can see is (HZ * wmem_default) bytes per
second, or about 250 Mbits/sec for my system. Disappointing
for a gigabit adapter.

There's three ways out of this:

(1) tell the sysadmin to
"echo 1234567 > /proc/sys/net/core/wmem_default" which
violates all the rules.

(2) Poll more frequently than once-a-jiffy. Arnd Bergmann and I
got this working, using hrtimers. It worked pretty well,
but seemed like a hack to me.

(3) Generate transmit queue low-watermark interrupts,
which is an admitedly olde-fashioned but common
engineering practice. This round of patches implement
this.


--linas


2006-08-16 20:34:39

by Jeff Garzik

[permalink] [raw]
Subject: Re: [PATCH 1/2]: powerpc/cell spidernet bottom half

Linas Vepstas wrote:
> I was under the impression that NAPI was for the receive side only.

That depends on the driver implementation.

Jeff


2006-08-16 20:46:41

by David Miller

[permalink] [raw]
Subject: Re: [PATCH 1/2]: powerpc/cell spidernet bottom half

From: Jeff Garzik <[email protected]>
Date: Wed, 16 Aug 2006 16:34:31 -0400

> Linas Vepstas wrote:
> > I was under the impression that NAPI was for the receive side only.
>
> That depends on the driver implementation.

What Jeff is trying to say is that TX reclaim can occur in
the NAPI poll routine, and in fact this is what the vast
majority of NAPI drivers do.

It also makes the locking simpler.

In practice, the best thing seems to be to put both RX and TX
work into ->poll() and have a very mild hw interrupt mitigation
setting programmed into the chip.

I'm not familiar with the spidernet TX side interrupt capabilities
so I can't say whether that is something that can be directly
implied. In fact, I get the impression that spidernet is limited
in some way and that's where all the strange approaches are coming
from :)

2006-08-16 21:27:33

by Arnd Bergmann

[permalink] [raw]
Subject: Re: [PATCH 1/2]: powerpc/cell spidernet bottom half

Am Wednesday 16 August 2006 22:46 schrieb David Miller:
> I'm not familiar with the spidernet TX side interrupt capabilities
> so I can't say whether that is something that can be directly
> implied. ?In fact, I get the impression that spidernet is limited
> in some way and that's where all the strange approaches are coming
> from :)

Actually, the capabilities of the chip are quite powerful, it only
seems to be hard to make it go fast using any of them. That may
be the fault of strange locking rules and other bugs we had in
the driver before, so maybe you can recommend which one to use.

Cleaning up the TX queue only from ->poll() like all the others
sounds like the right approach to simplify the code.

The spider hardware offers at least these options:

- end of TX queue interrupt
- set a per-descriptor bit to fire an interrupt at a specific frame
- an interrupt for each frame (may be multiple descriptors)
- an interrupt for each descriptor
- timers implemented in the spidernet hardware

We first had an interrupt per descriptor, then got rid of all TX
interrupts and replaced them by timers to reduce the interrupt load,
but reducing throughput in the case where user space sleeps on a full
socket buffer.

The last patches that were suggested introduce marking a single descriptor
(not the last one, but somewhere near the end of the queue) so we fire
an interrupt just before the TX queue gets empty.

Arnd <><

2006-08-16 21:32:48

by David Miller

[permalink] [raw]
Subject: Re: [PATCH 1/2]: powerpc/cell spidernet bottom half

From: Arnd Bergmann <[email protected]>
Date: Wed, 16 Aug 2006 23:24:46 +0200

> We first had an interrupt per descriptor, then got rid of all TX
> interrupts and replaced them by timers to reduce the interrupt load,
> but reducing throughput in the case where user space sleeps on a full
> socket buffer.

The best schemes seem to be to interrupt mitigate using a combination
of time and number of TX entries pending to be purged. This is what
most gigabit chips seem to offer.

On Tigon3, for example, we tell the chip to interrupt if either 53
frames or 150usecs have passed since the first TX packet has become
available for reclaim.

That bounds the latency as well as force the interrupt if a lot of TX
work becomes available.

Can spidernet be told these kinds of parameters? "N packets or
X usecs"?

This is all controllable via ethtool btw (via ETHTOOL_{S,G}COALESCE),
so you can experiment if you want.

2006-08-16 21:58:42

by linas

[permalink] [raw]
Subject: Re: [PATCH 1/2]: powerpc/cell spidernet bottom half

On Wed, Aug 16, 2006 at 01:46:40PM -0700, David Miller wrote:
> From: Jeff Garzik <[email protected]>
> Date: Wed, 16 Aug 2006 16:34:31 -0400
>
> > Linas Vepstas wrote:
> > > I was under the impression that NAPI was for the receive side only.
> >
> > That depends on the driver implementation.
>
> What Jeff is trying to say is that TX reclaim can occur in
> the NAPI poll routine, and in fact this is what the vast
> majority of NAPI drivers do.

I'll experiment with this. When doing, say, an ftp, there are
enough TCP ack packets coming back to have NAPI netdev->poll
be called frequently enough?

> implied. In fact, I get the impression that spidernet is limited
> in some way and that's where all the strange approaches are coming
> from :)

Hmm. Or maybe I'm just getting old. Once upon a time, low watermarks
were considered the "best" way of doing anything; never occurred to me
it would be considered "strange". Based on my probably obsolete idea
of what constitutes "slick hardware", I was actually impressed by what
the spidernet could do.

Aside from cleaning up the transmit ring in the receive poll loop,
what would be the not-so-strange way of doing things?

--linas


2006-08-16 22:17:07

by Arnd Bergmann

[permalink] [raw]
Subject: Re: [PATCH 1/2]: powerpc/cell spidernet bottom half

Am Wednesday 16 August 2006 23:32 schrieb David Miller:
> Can spidernet be told these kinds of parameters? ?"N packets or
> X usecs"?

It can not do exactly this but probably we can get close to it by

1.) Setting a one-time interrupt to fire x*10?s after putting a
packet into the TX queue.

and

2.a) Enabling end of TX queue interrupts whenever more than y frames
have been queued for TX.

or

2.b) Marking frame number y in the TX to fire an interrupt when it
has been transmitted, and move the mark whenever we clean up
TX descriptors.

or

2.c) Marking frame number y to generate an interrupt, but counting
y from the end of the queue, and update the mark whenever we
add frames to the queue tail.

I'm not sure which version of 2. would give us the best approximation.

Arnd <><

2006-08-16 22:30:05

by David Miller

[permalink] [raw]
Subject: Re: [PATCH 1/2]: powerpc/cell spidernet bottom half

From: Arnd Bergmann <[email protected]>
Date: Thu, 17 Aug 2006 00:16:46 +0200

> Am Wednesday 16 August 2006 23:32 schrieb David Miller:
> > Can spidernet be told these kinds of parameters? ?"N packets or
> > X usecs"?
>
> It can not do exactly this but probably we can get close to it by

Oh, you can only control TX packet counts using bits in the TX ring
entries :(

Tigon3 can even be told to use different interrupt mitigation
parameters when the cpu is actively servicing an interrupt for
the chip.

Didn't you say spidernet's facilities were sophisticated? :)
This Tigon3 stuff is like 5+ year old technology.

2006-08-16 22:47:49

by Arnd Bergmann

[permalink] [raw]
Subject: Re: [PATCH 1/2]: powerpc/cell spidernet bottom half

Am Thursday 17 August 2006 00:29 schrieb David Miller:
> Didn't you say spidernet's facilities were sophisticated? :)
> This Tigon3 stuff is like 5+ year old technology.

I was rather overwhelmed by the 34 different interrupts that
the chip can create, that does not mean they chose the right
events for generating them.
Interestingly, spidernet has five different counters you can
set up to generate interrupts after a number of received frames,
but none for transmit...

Arnd <><

2006-08-16 22:56:04

by linas

[permalink] [raw]
Subject: Re: [PATCH 1/2]: powerpc/cell spidernet bottom half

On Wed, Aug 16, 2006 at 11:24:46PM +0200, Arnd Bergmann wrote:
>
> it only
> seems to be hard to make it go fast using any of them.

Last round of measurements seemed linear for packet sizes between
60 and 600 bytes, suggesting that the hardware can handle a
maximum of 120K descriptors/second, independent of packet size.
I don't know why this is.

> That may
> be the fault of strange locking rules

My fault; a few months ago, we were in panic mode trying to get
the thing functioning reliably, and I put locks around anything
and everything. This last patch removes those locks, and protects
only a few pointers (the incrementing of the head and the tail
pointers, and the location ofthe low watermark) that actually
needed protection. They need protection because the code can
get called in various different ways.

> Cleaning up the TX queue only from ->poll() like all the others

I'll try this ...

> sounds like the right approach to simplify the code.

Its not a big a driver. 'wc' says its 2.3 loc, which
is 1/3 or 1/5 the size of tg3.c or the e1000*c files.

--linas

2006-08-16 23:03:25

by Arnd Bergmann

[permalink] [raw]
Subject: Re: [PATCH 1/2]: powerpc/cell spidernet bottom half

Am Thursday 17 August 2006 00:55 schrieb Linas Vepstas:
> > it only
> > seems to be hard to make it go fast using any of them.
>
> Last round of measurements seemed linear for packet sizes between
> 60 and 600 bytes, suggesting that the hardware can handle a
> maximum of 120K descriptors/second, independent of packet size.
> I don't know why this is.

Could well be related to latencies when going to the remote
node for descriptor DMAs. Have you tried if the hch's NUMA
patch or using numactl makes a difference here?

> > sounds like the right approach to simplify the code.
>
> Its not a big a driver. 'wc' says its 2.3 loc, which
> is 1/3 or 1/5 the size of tg3.c or the e1000*c files.

Right, I was thinking of removing a lock or another, not
throwing out half of the driver ;-)

Arnd <><

2006-08-16 23:08:37

by Rick Jones

[permalink] [raw]
Subject: Re: [PATCH 1/2]: powerpc/cell spidernet bottom half

Linas Vepstas wrote:
> On Wed, Aug 16, 2006 at 11:24:46PM +0200, Arnd Bergmann wrote:
>
>>it only
>>seems to be hard to make it go fast using any of them.
>
>
> Last round of measurements seemed linear for packet sizes between
> 60 and 600 bytes, suggesting that the hardware can handle a
> maximum of 120K descriptors/second, independent of packet size.
> I don't know why this is.

DMA overhead perhaps? If it takes so many micro/nanoseconds to get a
DMA going.... That used to be a reason the Tigon2 had such low PPS
rates and issues with multiple buffer packets and a 1500 byte MTU - it
had rather high DMA setup latency, and then if you put it into a system
with highish DMA read/write latency... well that didn't make it any
better :)

rick jones

2006-08-16 23:24:26

by linas

[permalink] [raw]
Subject: Re: [PATCH 1/2]: powerpc/cell spidernet bottom half

On Wed, Aug 16, 2006 at 02:32:03PM -0700, David Miller wrote:
>
> The best schemes seem to be to interrupt mitigate using a combination
> of time and number of TX entries pending to be purged. This is what
> most gigabit chips seem to offer.

I seem to be having a multi-hour delay for email delivery, so maybe
we've crossed emails.

A "low watermark interrupt" is an interrupt that is generated when
some queue is "almost empty". This last set of patches implement this
for the TX queue. The interrupt pops when 3/4ths of the packets
in the queue have been processed. Playing with ths setting
(3/4ths or some other number) seemed to make little difference.

> On Tigon3, for example, we tell the chip to interrupt if either 53
> frames or 150usecs have passed since the first TX packet has become
> available for reclaim.

The nature of a low-watermark interrupt is that it NEVER pops, as long
as the kernel keeps putting more stuff into the queue, so as to keep
the queue at least 1/4'th full. I don't know how to mitigate interrupts
more than that.

--linas

2006-08-16 23:30:53

by linas

[permalink] [raw]
Subject: Re: [PATCH 1/2]: powerpc/cell spidernet bottom half

On Thu, Aug 17, 2006 at 12:16:46AM +0200, Arnd Bergmann wrote:
> Am Wednesday 16 August 2006 23:32 schrieb David Miller:
> > Can spidernet be told these kinds of parameters? ?"N packets or
> > X usecs"?
>
> It can not do exactly this but probably we can get close to it by

Why would you want o do this? It seems like a cruddier strategy
than what we can already do (which is to never get an transmit
interrupt, as long as the kernel can shove data into the device fast
enough to keep the queue from going empty.) The whole *point* of a
low-watermark interrupt is to never have to actually get the interrupt,
if the rest of the system is on its toes and is supplying data fast
enough.

--linas

2006-08-16 23:32:53

by David Miller

[permalink] [raw]
Subject: Re: [PATCH 1/2]: powerpc/cell spidernet bottom half

From: [email protected] (Linas Vepstas)
Date: Wed, 16 Aug 2006 18:30:28 -0500

> Why would you want o do this? It seems like a cruddier strategy
> than what we can already do (which is to never get an transmit
> interrupt, as long as the kernel can shove data into the device fast
> enough to keep the queue from going empty.) The whole *point* of a
> low-watermark interrupt is to never have to actually get the interrupt,
> if the rest of the system is on its toes and is supplying data fast
> enough.

As long as TX packets get freed within a certain latency
boundary, this kind of scheme should be fine.

2006-08-16 23:48:43

by Benjamin Herrenschmidt

[permalink] [raw]
Subject: Re: [PATCH 2/4]: powerpc/cell spidernet low watermark patch.

On Fri, 2006-08-11 at 12:08 -0500, Linas Vepstas wrote:
>
> Implement basic low-watermark support for the transmit queue.
>
> The basic idea of a low-watermark interrupt is as follows.
> The device driver queues up a bunch of packets for the hardware
> to transmit, and then kicks he hardware to get it started.
> As the hardware drains the queue of pending, untransmitted
> packets, the device driver will want to know when the queue
> is almost empty, so that it can queue some more packets.
>
> This is accomplished by setting the DESCR_TXDESFLG flag in
> one of the packets. When the hardware sees this flag, it will
> interrupt the device driver. Because this flag is on a fixed
> packet, rather than at fixed location in the queue, the
> code below needs to move the flag as more packets are
> queued up. This implementation attempts to keep te flag
> at about 3/4's of the way into the queue.
>
> This patch boosts driver performance from about
> 300-400Mbps for 1500 byte packets, to about 710-740Mbps.

Sounds good (without actually looking at the code though :), that was a
long required improvement to that driver. Also, we should probably look
into using NAPI polling for tx completion queue as well, no ?

Cheers,
Ben.


2006-08-16 23:48:31

by linas

[permalink] [raw]
Subject: Re: [PATCH 1/2]: powerpc/cell spidernet bottom half

On Thu, Aug 17, 2006 at 01:03:20AM +0200, Arnd Bergmann wrote:
>
> Could well be related to latencies when going to the remote
> node for descriptor DMAs. Have you tried if the hch's NUMA
> patch or using numactl makes a difference here?

No. I guess I should try.

> > > sounds like the right approach to simplify the code.
> >
> > Its not a big a driver. 'wc' says its 2.3 loc, which
> > is 1/3 or 1/5 the size of tg3.c or the e1000*c files.
>
> Right, I was thinking of removing a lock or another, not
> throwing out half of the driver ;-)

There's only four lock points grand total.
-- One on the receive side,
-- one to protect the transmit head pointer,
-- one to protect the transmit tail pointer,
-- one to protect the location of the transmit low watermark.

The last three share the same lock. I tried using distinct
locks, but this worsened things, probably due to cache-line
trashing. I tried removing the head pointer lock, but this
failed. I don't know why, and was surprised by this. I thought
hard_start_xmit() was serialized.

--linas

2006-08-17 00:23:13

by linas

[permalink] [raw]
Subject: Re: [PATCH 1/2]: powerpc/cell spidernet bottom half

On Wed, Aug 16, 2006 at 04:32:52PM -0700, David Miller wrote:
> From: [email protected] (Linas Vepstas)
> > Why would you want to do this? It seems like a cruddier strategy
> > than what we can already do (which is to never get an transmit
> > interrupt, as long as the kernel can shove data into the device fast
> > enough to keep the queue from going empty.) The whole *point* of a
> > low-watermark interrupt is to never have to actually get the interrupt,
> > if the rest of the system is on its toes and is supplying data fast
> > enough.
>
> As long as TX packets get freed within a certain latency
> boundary, this kind of scheme should be fine.

I just had some fun making sure I wasn't making a liar out of myself.
So far, I'm good.

Did

echo 768111 > /proc/sys/net/core/wmem_max
echo 768111 > /proc/sys/net/core/wmem_default

to make sure that the app never blocked on a full socket.

(If the socket is small, then the app blocks,
the transmit queue drains, and we do get an interupt --
about 1K/sec, which is what is expected).

Ran 'vmstat 10' to watch the interrupts in real time.
Ran netperf, got 904 Mbits/sec, and *no* interrupts.
Yahoo!

Ran oprofile to see where he time went:

CPU: ppc64 Cell Broadband Engine, speed 3200 MHz (estimated)
Counted CYCLES events (Processor Cycles) with a unit mask of 0x00 (No unit mask) count 100000
samples % image name app name symbol name
13748742 77.6620 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .cbe_idle
936172 5.2881 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .__copy_tofrom_user
569353 3.2161 spidernet.ko spidernet .spider_net_xmit
450826 2.5466 spidernet.ko spidernet .spider_net_release_tx_chain
220374 1.2448 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 ._spin_unlock_irqrestore
112432 0.6351 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 ._spin_lock
91328 0.5159 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .__qdisc_run
84804 0.4790 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .packet_sendmsg
76167 0.4302 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .kfree
74321 0.4198 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .sock_alloc_send_skb
65323 0.3690 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .kmem_cache_free
60334 0.3408 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 ._read_lock
60071 0.3393 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .dev_queue_xmit
56900 0.3214 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .kmem_cache_alloc_node
55281 0.3123 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .sock_wfree
51242 0.2894 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .dev_get_by_index
50438 0.2849 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .compat_sys_socketcall
49247 0.2782 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .fput
48589 0.2745 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .sync_buffer
46055 0.2601 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 system_call_common
40607 0.2294 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .local_bh_enable
40273 0.2275 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .__might_sleep
38757 0.2189 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .fget_light
38219 0.2159 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .__kfree_skb
36804 0.2079 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .sock_def_write_space
36443 0.2059 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .skb_release_data
32174 0.1817 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .sys_sendto
31828 0.1798 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .sock_sendmsg
30676 0.1733 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .pfifo_fast_dequeue
29607 0.1672 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .add_event_entry
25870 0.1461 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 syscall_exit
25329 0.1431 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .__alloc_skb
23885 0.1349 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .__do_softirq
22046 0.1245 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .kmem_find_general_cachep
21610 0.1221 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .__dev_get_by_index
21059 0.1190 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .dma_map_single
21044 0.1189 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 ._spin_lock_irqsave
20105 0.1136 vmlinux-2.6.18-rc2 vmlinux-2.6.18-rc2 .memset

.__copy_tofrom_user -- ouch spider does not currently do scatter-gather
.spider_net_xmit -- hmmm ?? why is it this large ??
.spider_net_release_tx_chain -- ?? a lot of time being spent cleaning up tx queues.
._spin_unlock_irqrestore -- hmm ? why so high? lock contention?

I presume the rest is normal.

2006-08-18 19:24:00

by linas

[permalink] [raw]
Subject: Re: [PATCH 2/4]: powerpc/cell spidernet low watermark patch.

On Thu, Aug 17, 2006 at 01:43:40AM +0200, Benjamin Herrenschmidt wrote:
>
> Sounds good (without actually looking at the code though :), that was a
> long required improvement to that driver. Also, we should probably look
> into using NAPI polling for tx completion queue as well, no ?

Just for a lark, I tried using NAPI polling, while disabling all TX
interrupts. Performance was a disaster: 8Mbits/sec, fom which I conclude
that the tcp ack packets do not flow back fast enough to allw reliance
on NAPI polling for transmit.

I was able to get as high as 960 Mbits/sec in unusal circumstances,
at 100% cpu usage. Oprofile indicates that the next major improvement
would be to add scatter/gather, which I'll take a shot at next week,
if I don't get interrupted. However, I'm getting interrupted a lot these
days.

--linas

2006-08-18 21:25:12

by David Miller

[permalink] [raw]
Subject: Re: [PATCH 2/4]: powerpc/cell spidernet low watermark patch.

From: [email protected] (Linas Vepstas)
Date: Fri, 18 Aug 2006 14:23:56 -0500

> On Thu, Aug 17, 2006 at 01:43:40AM +0200, Benjamin Herrenschmidt wrote:
> >
> > Sounds good (without actually looking at the code though :), that was a
> > long required improvement to that driver. Also, we should probably look
> > into using NAPI polling for tx completion queue as well, no ?
>
> Just for a lark, I tried using NAPI polling, while disabling all TX
> interrupts. Performance was a disaster: 8Mbits/sec, fom which I conclude
> that the tcp ack packets do not flow back fast enough to allw reliance
> on NAPI polling for transmit.

The idea is to use NAPI polling with TX interrupts disabled.

We're not saying to use the RX interrupt as the trigger for
RX and TX work. Rather, either of RX or TX interrupt will
schedule the NAPI poll.

2006-08-18 22:46:21

by linas

[permalink] [raw]
Subject: Re: [PATCH 2/4]: powerpc/cell spidernet low watermark patch.

On Fri, Aug 18, 2006 at 02:25:13PM -0700, David Miller wrote:
> From: [email protected] (Linas Vepstas)
> Date: Fri, 18 Aug 2006 14:23:56 -0500
>
> > On Thu, Aug 17, 2006 at 01:43:40AM +0200, Benjamin Herrenschmidt wrote:
> > >
> > > Sounds good (without actually looking at the code though :), that was a
> > > long required improvement to that driver. Also, we should probably look
> > > into using NAPI polling for tx completion queue as well, no ?
> >
> > Just for a lark, I tried using NAPI polling, while disabling all TX
> > interrupts. Performance was a disaster: 8Mbits/sec, fom which I conclude
> > that the tcp ack packets do not flow back fast enough to allw reliance
> > on NAPI polling for transmit.
>
> The idea is to use NAPI polling with TX interrupts disabled.

The idea of a low-watermark mark interrupt is that there are *zero*
interrupts, as long as the kernel keeps feeding packets to the device.

In my last email to you, I attached a real-life vmstat trace
showing exactly zero interrupts over a two minute period.

However, the zero-interrupt scenario only occurs if the kernel
is actually feeding packets to the driver. If the socket beffers
are small, then the app blocks, the kernel is idle, and there
are not enough tcp ack packets coming back the other way to
actually get the NAPI polling to keep the adapter fed.

> We're not saying to use the RX interrupt as the trigger for
> RX and TX work. Rather, either of RX or TX interrupt will
> schedule the NAPI poll.

And, for a lark, this is exactly what I did. Just to see.
Because there are so few ack packets, there are very few
RX interrupts -- not enough to get NAPI to actually keep
the device busy.

------
I'm somewhat disoriened from this conversation. Its presumably
clear that low-watermark mechanisms are superior to NAPI.
>From what I gather, NAPI was invented to deal with cheap
or low-function hardware; it adds nothing to this particular
situation. Why are we talking about this?

--linas

2006-08-18 22:51:16

by David Miller

[permalink] [raw]
Subject: Re: [PATCH 2/4]: powerpc/cell spidernet low watermark patch.

From: [email protected] (Linas Vepstas)
Date: Fri, 18 Aug 2006 17:46:18 -0500

> > We're not saying to use the RX interrupt as the trigger for
> > RX and TX work. Rather, either of RX or TX interrupt will
> > schedule the NAPI poll.
>
> And, for a lark, this is exactly what I did. Just to see.
> Because there are so few ack packets, there are very few
> RX interrupts -- not enough to get NAPI to actually keep
> the device busy.

You're misreading me. TX interrupts are intended to be "enabled" and
trigger NAPI polls. TX IRQ enabled, enabled :-)

If you want to eliminate them if the kernel keeps hopping into
the ->hard_start_xmit() via hw interrupt mitigation or whatever,
that's fine. But if you do need to do TX interrupt processing,
do it in NAPI ->poll().

> I'm somewhat disoriened from this conversation. Its presumably
> clear that low-watermark mechanisms are superior to NAPI.
> >From what I gather, NAPI was invented to deal with cheap
> or low-function hardware; it adds nothing to this particular
> situation. Why are we talking about this?

NAPI is meant to give fairness to all devices receiving packets
in the system, particularly in times of high load or overload.

And equally importantly, it allows you to run the majority of your
interrupt handler in software IRQ context. This allows not only your
locking to be simpler, but it also allows things like oprofile to
monitor almost your entire IRQ processing path even with just timer
interrupt based oprofile profiling.

I see you moving TX reclaim into tasklets and stuff. I've vehemently
against that because you wouldn't need it in order to move TX
processing into software interrupts if you did it all in NAPI
->poll().

2006-08-18 23:29:48

by linas

[permalink] [raw]
Subject: Re: [PATCH 2/4]: powerpc/cell spidernet low watermark patch.

On Fri, Aug 18, 2006 at 03:51:16PM -0700, David Miller wrote:
> I see you moving TX reclaim into tasklets and stuff. I've vehemently
> against that because you wouldn't need it in order to move TX
> processing into software interrupts if you did it all in NAPI
> ->poll().

I don't understand what you are saying. If I call the transmit
queue cleanup code from the poll() routine, nothing hapens,
because the kernel does not call the poll() routine often
enough. I've stated this several times.

--linas

2006-08-18 23:45:51

by linas

[permalink] [raw]
Subject: Re: [PATCH 2/4]: powerpc/cell spidernet low watermark patch.

On Fri, Aug 18, 2006 at 06:29:42PM -0500, linas wrote:
>
> I don't understand what you are saying. If I call the transmit
> queue cleanup code from the poll() routine, nothing hapens,
> because the kernel does not call the poll() routine often
> enough. I've stated this several times.

OK, Arnd gave me a clue stick. I need to call the (misnamed)
netif_rx_schedule() from the tx interrupt in order to get
this to work. That makes sense, and its easy, I'll send the
revised patch.. well, not tonight, but shortly.

--linas

2006-08-19 04:32:13

by Benjamin Herrenschmidt

[permalink] [raw]
Subject: Re: [PATCH 2/4]: powerpc/cell spidernet low watermark patch.

On Fri, 2006-08-18 at 15:51 -0700, David Miller wrote:
> From: [email protected] (Linas Vepstas)
> Date: Fri, 18 Aug 2006 17:46:18 -0500
>
> > > We're not saying to use the RX interrupt as the trigger for
> > > RX and TX work. Rather, either of RX or TX interrupt will
> > > schedule the NAPI poll.
> >
> > And, for a lark, this is exactly what I did. Just to see.
> > Because there are so few ack packets, there are very few
> > RX interrupts -- not enough to get NAPI to actually keep
> > the device busy.
>
> You're misreading me. TX interrupts are intended to be "enabled" and
> trigger NAPI polls. TX IRQ enabled, enabled :-)

Maybe be because you actually typed "disabled" in your previous
message ? :)

>> The idea is to use NAPI polling with TX interrupts disabled.

> If you want to eliminate them if the kernel keeps hopping into
> the ->hard_start_xmit() via hw interrupt mitigation or whatever,
> that's fine. But if you do need to do TX interrupt processing,
> do it in NAPI ->poll().

Well, we do need to harvest descriptors of course, though I suppose that
can be done in hard_xmit as well. I'm not sure if there is any real
benefit in batching those.

> > I'm somewhat disoriened from this conversation. Its presumably
> > clear that low-watermark mechanisms are superior to NAPI.
> > >From what I gather, NAPI was invented to deal with cheap
> > or low-function hardware; it adds nothing to this particular
> > situation. Why are we talking about this?
>
> NAPI is meant to give fairness to all devices receiving packets
> in the system, particularly in times of high load or overload.
>
> And equally importantly, it allows you to run the majority of your
> interrupt handler in software IRQ context.

That is the most important point imho for the specific case of spidernet
on cell.

> This allows not only your
> locking to be simpler, but it also allows things like oprofile to
> monitor almost your entire IRQ processing path even with just timer
> interrupt based oprofile profiling.
>
> I see you moving TX reclaim into tasklets and stuff. I've vehemently
> against that because you wouldn't need it in order to move TX
> processing into software interrupts if you did it all in NAPI
> ->poll().

Ben.

2006-08-19 04:34:15

by Benjamin Herrenschmidt

[permalink] [raw]
Subject: Re: [PATCH 2/4]: powerpc/cell spidernet low watermark patch.

On Fri, 2006-08-18 at 18:45 -0500, Linas Vepstas wrote:
> On Fri, Aug 18, 2006 at 06:29:42PM -0500, linas wrote:
> >
> > I don't understand what you are saying. If I call the transmit
> > queue cleanup code from the poll() routine, nothing hapens,
> > because the kernel does not call the poll() routine often
> > enough. I've stated this several times.
>
> OK, Arnd gave me a clue stick. I need to call the (misnamed)
> netif_rx_schedule() from the tx interrupt in order to get
> this to work. That makes sense, and its easy, I'll send the
> revised patch.. well, not tonight, but shortly.

You might not want to call it all the time though... You need some
interrupt mitigation and thus a timer that calls netif_rx_schedule()
might be of some use still...

Ben.


2006-08-22 00:13:15

by linas

[permalink] [raw]
Subject: Re: [PATCH 2/4]: powerpc/cell spidernet low watermark patch.

On Sat, Aug 19, 2006 at 02:33:42PM +1000, Benjamin Herrenschmidt wrote:
> On Fri, 2006-08-18 at 18:45 -0500, Linas Vepstas wrote:
> > On Fri, Aug 18, 2006 at 06:29:42PM -0500, linas wrote:
> > >
> > > I don't understand what you are saying. If I call the transmit
> > > queue cleanup code from the poll() routine, nothing hapens,
> > > because the kernel does not call the poll() routine often
> > > enough. I've stated this several times.
> >
> > OK, Arnd gave me a clue stick. I need to call the (misnamed)
> > netif_rx_schedule() from the tx interrupt in order to get
> > this to work. That makes sense, and its easy, I'll send the
> > revised patch.. well, not tonight, but shortly.
>
> You might not want to call it all the time though... You need some
> interrupt mitigation and thus a timer that calls netif_rx_schedule()
> might be of some use still...

Well, again, the whole point of a low-watermark interrupt is to
get zero of them when the system is working correctly; they're
self-mitigating by design.

-------------
Anyway, I tried the suggestion, but am getting less-than-ideal
results.

To recap: my original patch did this:

spider_interrupt_handler(struct whatever *) {
...
if (tx_interrupt)
schedule_work (tx_cleanup_handler)
}

which David Miller objected to. Once I understood the why
(sorry for not getting it right away), I then replaced the
above with the below, which is what I think everyone wanted:


spider_interrupt_handler(struct whatever *) {
...
if (tx_interrupt)
netif_rx_schedule(netdev);
}

spidernet_poll(stuct whatever *) {
tx_cleanup_handler(txring);
// rx_stuff too ...
}

I was expecting this to be a no-op from the performance
point of view. Instead, I get a fairly dramatic (11%) slowdown:
the first patch runs in the 785-805 Mbits/sec range, while
the second patch runs in the 705-715 Mbits/sec range.
I am surprised, ad don't understand why this would be so.

For the record, the alternate patch is below.

----

Index: linux-2.6.18-rc2/drivers/net/spider_net.c
===================================================================
--- linux-2.6.18-rc2.orig/drivers/net/spider_net.c 2006-08-21 16:59:33.000000000 -0500
+++ linux-2.6.18-rc2/drivers/net/spider_net.c 2006-08-21 17:15:28.000000000 -0500
@@ -1087,6 +1090,8 @@ spider_net_poll(struct net_device *netde
int packets_to_do, packets_done = 0;
int no_more_packets = 0;

+ spider_net_cleanup_tx_ring(card);
+
packets_to_do = min(*budget, netdev->quota);

while (packets_to_do) {
@@ -1495,16 +1500,16 @@ spider_net_interrupt(int irq, void *ptr,
if (!status_reg)
return IRQ_NONE;

- if (status_reg & SPIDER_NET_RXINT ) {
+ if (status_reg & SPIDER_NET_RXINT) {
spider_net_rx_irq_off(card);
netif_rx_schedule(netdev);
}
- if (status_reg & SPIDER_NET_TXINT ) {
- spider_net_cleanup_tx_ring(card);
- netif_wake_queue(netdev);
- }

- if (status_reg & SPIDER_NET_ERRINT )
+ /* Call rx_schedule from the tx interrupt, so that NAPI poll runs. */
+ if (status_reg & SPIDER_NET_TXINT)
+ netif_rx_schedule(netdev);
+
+ if (status_reg & SPIDER_NET_ERRINT)
spider_net_handle_error_irq(card, status_reg);

/* clear interrupt sources */




2006-08-22 00:30:04

by David Miller

[permalink] [raw]
Subject: Re: [PATCH 2/4]: powerpc/cell spidernet low watermark patch.

From: [email protected] (Linas Vepstas)
Date: Mon, 21 Aug 2006 19:13:11 -0500

> @@ -1495,16 +1500,16 @@ spider_net_interrupt(int irq, void *ptr,
> if (!status_reg)
> return IRQ_NONE;
>
> - if (status_reg & SPIDER_NET_RXINT ) {
> + if (status_reg & SPIDER_NET_RXINT) {
> spider_net_rx_irq_off(card);
> netif_rx_schedule(netdev);
> }
> - if (status_reg & SPIDER_NET_TXINT ) {
> - spider_net_cleanup_tx_ring(card);
> - netif_wake_queue(netdev);
> - }
>
> - if (status_reg & SPIDER_NET_ERRINT )
> + /* Call rx_schedule from the tx interrupt, so that NAPI poll runs. */
> + if (status_reg & SPIDER_NET_TXINT)
> + netif_rx_schedule(netdev);
> +
> + if (status_reg & SPIDER_NET_ERRINT)

This should be:

if ((status_reg & (SPIDET_NET_RXINT | SPIDET_NET_TXINT))) {
spider_net_rx_and_tx_irq_off(card);
netif_rx_schedule();
}