2005-10-26 16:46:53

by Santiago Leon

Subject: [PATCH 0/5] ibmveth resending various fixes

Resending various ibmveth fixes; the previous posting was word-wrapped by my email client.

--
Santiago A. Leon
Power Linux Development
IBM Linux Technology Center


2005-10-26 16:46:58

by Santiago Leon

Subject: [PATCH 1/5] ibmveth fix bonding

This patch updates dev->trans_start and dev->last_rx so that the ibmveth
driver can be used with the ARP monitor in the bonding driver.
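
For context, a rough idea of why the bonding ARP monitor needs these
fields kept current: it treats a slave as alive only if both timestamps
are recent. A simplified, hypothetical helper modeled on the 2.6-era
bonding logic (not the actual bond_main.c code):

	static int slave_looks_alive(const struct net_device *dev,
				     unsigned long delta_in_ticks)
	{
		/* both the last transmit and the last receive must have
		 * happened within the monitoring interval */
		return time_after_eq(dev->trans_start, jiffies - delta_in_ticks) &&
		       time_after_eq(dev->last_rx, jiffies - delta_in_ticks);
	}

Without the driver touching trans_start and last_rx, both comparisons
go stale and the ARP monitor takes the slave down even though traffic
is flowing.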

Signed-off-by: Santiago Leon <[email protected]>
---

drivers/net/ibmveth.c | 2 ++
1 files changed, 2 insertions(+)

--- a/drivers/net/ibmveth.c 2005-10-11 12:56:24.000000000 -0500
+++ b/drivers/net/ibmveth.c 2005-10-11 13:52:45.000000000 -0500
@@ -725,6 +725,7 @@
} else {
adapter->stats.tx_packets++;
adapter->stats.tx_bytes += skb->len;
+ netdev->trans_start = jiffies;
}

do {
@@ -776,6 +777,7 @@
adapter->stats.rx_packets++;
adapter->stats.rx_bytes += length;
frames_processed++;
+ netdev->last_rx = jiffies;
}
} else {
more_work = 0;

2005-10-26 16:47:29

by Santiago Leon

Subject: [PATCH 3/5] ibmveth fix buffer replenishing

This patch moves the allocation of RX skb buffers out of a workqueue
so that it is done directly at RX processing time. The change was
suggested by Dave Miller after the driver was found to starve its RX
buffers and deadlock under heavy traffic:


> Allocating RX SKBs via tasklet is, IMHO, the worst way to
> do it. It is no surprise that there are starvation cases.
>
> If tasklets or work queues get delayed in any way, you lose,
> and it's very easy for a card to catch up with the driver RX'ing
> packets very fast, no matter how aggressive you make the
> replenishing. By the time you detect that you need to be
> "more aggressive" it is already too late.
> The only pseudo-reliable way is to allocate at RX processing time.
>
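
In other words, the pools are now refilled in-line with RX processing.
A pseudocode-level sketch of the resulting shape of the RX path (helper
names other than ibmveth_replenish_task() are illustrative, not the
real code):

	while (more_work && frames_processed < max_frames_to_process) {
		skb = harvest_next_rx_buffer(adapter);	/* illustrative */
		netif_receive_skb(skb);
		frames_processed++;
	}

	/* refill in the same context, before returning to the poll
	 * loop - no tasklet/workqueue latency to lose against */
	ibmveth_replenish_task(adapter);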

Signed-off-by: Santiago Leon <[email protected]>
---

drivers/net/ibmveth.c | 48 ++++++++----------------------------------------
drivers/net/ibmveth.h | 4 ----
2 files changed, 8 insertions(+), 44 deletions(-)

--- a/drivers/net/ibmveth.c 2005-10-17 12:04:58.000000000 -0500
+++ b/drivers/net/ibmveth.c 2005-10-17 12:23:22.000000000 -0500
@@ -96,7 +96,6 @@
static void ibmveth_proc_register_adapter(struct ibmveth_adapter *adapter);
static void ibmveth_proc_unregister_adapter(struct ibmveth_adapter *adapter);
static irqreturn_t ibmveth_interrupt(int irq, void *dev_instance, struct pt_regs *regs);
-static inline void ibmveth_schedule_replenishing(struct ibmveth_adapter*);
static inline void ibmveth_rxq_harvest_buffer(struct ibmveth_adapter *adapter);

#ifdef CONFIG_PROC_FS
@@ -257,29 +256,7 @@
atomic_add(buffers_added, &(pool->available));
}

-/* check if replenishing is needed. */
-static inline int ibmveth_is_replenishing_needed(struct ibmveth_adapter *adapter)
-{
- int i;
-
- for(i = 0; i < IbmVethNumBufferPools; i++)
- if(adapter->rx_buff_pool[i].active &&
- (atomic_read(&adapter->rx_buff_pool[i].available) <
- adapter->rx_buff_pool[i].threshold))
- return 1;
- return 0;
-}
-
-/* kick the replenish tasklet if we need replenishing and it isn't already running */
-static inline void ibmveth_schedule_replenishing(struct ibmveth_adapter *adapter)
-{
- if(ibmveth_is_replenishing_needed(adapter) &&
- (atomic_dec_if_positive(&adapter->not_replenishing) == 0)) {
- schedule_work(&adapter->replenish_task);
- }
-}
-
-/* replenish tasklet routine */
+/* replenish routine */
static void ibmveth_replenish_task(struct ibmveth_adapter *adapter)
{
int i;
@@ -292,10 +269,6 @@
&adapter->rx_buff_pool[i]);

adapter->rx_no_buffer = *(u64*)(((char*)adapter->buffer_list_addr) + 4096 - 8);
-
- atomic_inc(&adapter->not_replenishing);
-
- ibmveth_schedule_replenishing(adapter);
}

/* empty and free ana buffer pool - also used to do cleanup in error paths */
@@ -563,10 +536,10 @@
return rc;
}

- netif_start_queue(netdev);
+ ibmveth_debug_printk("initial replenish cycle\n");
+ ibmveth_replenish_task(adapter);

- ibmveth_debug_printk("scheduling initial replenish cycle\n");
- ibmveth_schedule_replenishing(adapter);
+ netif_start_queue(netdev);

ibmveth_debug_printk("open complete\n");

@@ -584,9 +557,6 @@

free_irq(netdev->irq, netdev);

- cancel_delayed_work(&adapter->replenish_task);
- flush_scheduled_work();
-
do {
lpar_rc = h_free_logical_lan(adapter->vdev->unit_address);
} while (H_isLongBusy(lpar_rc) || (lpar_rc == H_Busy));
@@ -795,7 +765,7 @@
}
} while(more_work && (frames_processed < max_frames_to_process));

- ibmveth_schedule_replenishing(adapter);
+ ibmveth_replenish_task(adapter);

if(more_work) {
/* more work to do - return that we are not done yet */
@@ -931,8 +901,10 @@

}

+ /* kick the interrupt handler so that the new buffer pools get
+ replenished or deallocated */
+ ibmveth_interrupt(dev->irq, dev, NULL);

- ibmveth_schedule_replenishing(adapter);
dev->mtu = new_mtu;
return 0;
}
@@ -1017,14 +989,10 @@

ibmveth_debug_printk("adapter @ 0x%p\n", adapter);

- INIT_WORK(&adapter->replenish_task, (void*)ibmveth_replenish_task, (void*)adapter);
-
adapter->buffer_list_dma = DMA_ERROR_CODE;
adapter->filter_list_dma = DMA_ERROR_CODE;
adapter->rx_queue.queue_dma = DMA_ERROR_CODE;

- atomic_set(&adapter->not_replenishing, 1);
-
ibmveth_debug_printk("registering netdev...\n");

rc = register_netdev(netdev);
diff -urN a/drivers/net/ibmveth.h b/drivers/net/ibmveth.h
--- a/drivers/net/ibmveth.h 2005-10-17 12:04:58.000000000 -0500
+++ b/drivers/net/ibmveth.h 2005-10-17 12:23:22.000000000 -0500
@@ -118,10 +118,6 @@
dma_addr_t filter_list_dma;
struct ibmveth_buff_pool rx_buff_pool[IbmVethNumBufferPools];
struct ibmveth_rx_q rx_queue;
- atomic_t not_replenishing;
-
- /* helper tasks */
- struct work_struct replenish_task;

/* adapter specific stats */
u64 replenish_task_cycles;

2005-10-26 16:48:04

by Santiago Leon

Subject: [PATCH 2/5] ibmveth fix buffer pool management

This patch changes the way the ibmveth driver handles its receive
buffers. The old code allocates and maps all the buffers in the pools
regardless of MTU size, and it also limits the number of buffer pools
to three. This patch makes the driver allocate and map only the buffers
needed to support the current MTU. It also replaces the hardcoded
buffer pool count, sizes, and element counts with arrays, making them
easier to change (with the hope of turning them into runtime parameters
in the future).
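
As a worked example of the activation rule added to
ibmveth_change_mtu() below, here is a standalone sketch using the
pool_size[] values from this patch (the helper itself is hypothetical):
every pool that is too small for the new MTU stays active, plus the
first pool that is large enough.

	#define IBMVETH_BUFF_OH 22	/* 14 ethernet header + 8 opaque handle */
	static const int pool_size[] = { 512, 1024 * 2, 1024 * 16,
					 1024 * 32, 1024 * 64 };

	static int pool_active_for_mtu(int new_mtu, int pool)
	{
		int i, activate = 0, prev_smaller = 1;

		for (i = 0; i <= pool; i++) {
			if (new_mtu > pool_size[i] - IBMVETH_BUFF_OH) {
				activate = 1;		/* too small, keep for small frames */
				prev_smaller = 1;
			} else {
				activate = prev_smaller; /* first pool that fits the MTU */
				prev_smaller = 0;
			}
		}
		return activate;
	}

For example, new_mtu = 9000 leaves the 512, 2K, and 16K pools active
and frees the 32K and 64K pools; the default MTU of 1500 only needs the
512 and 2K pools.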

Signed-off-by: Santiago Leon <[email protected]>
---

drivers/net/ibmveth.c | 102 ++++++++++++++++++++++++++++++++++++--------------
drivers/net/ibmveth.h | 18 +++++---
2 files changed, 85 insertions(+), 35 deletions(-)

--- a/drivers/net/ibmveth.c 2005-10-17 11:59:57.000000000 -0500
+++ b/drivers/net/ibmveth.c 2005-10-17 12:02:03.000000000 -0500
@@ -97,6 +97,7 @@
static void ibmveth_proc_unregister_adapter(struct ibmveth_adapter *adapter);
static irqreturn_t ibmveth_interrupt(int irq, void *dev_instance, struct pt_regs *regs);
static inline void ibmveth_schedule_replenishing(struct ibmveth_adapter*);
+static inline void ibmveth_rxq_harvest_buffer(struct ibmveth_adapter *adapter);

#ifdef CONFIG_PROC_FS
#define IBMVETH_PROC_DIR "net/ibmveth"
@@ -181,6 +182,7 @@
atomic_set(&pool->available, 0);
pool->producer_index = 0;
pool->consumer_index = 0;
+ pool->active = 0;

return 0;
}
@@ -258,9 +260,14 @@
/* check if replenishing is needed. */
static inline int ibmveth_is_replenishing_needed(struct ibmveth_adapter *adapter)
{
- return ((atomic_read(&adapter->rx_buff_pool[0].available) < adapter->rx_buff_pool[0].threshold) ||
- (atomic_read(&adapter->rx_buff_pool[1].available) < adapter->rx_buff_pool[1].threshold) ||
- (atomic_read(&adapter->rx_buff_pool[2].available) < adapter->rx_buff_pool[2].threshold));
+ int i;
+
+ for(i = 0; i < IbmVethNumBufferPools; i++)
+ if(adapter->rx_buff_pool[i].active &&
+ (atomic_read(&adapter->rx_buff_pool[i].available) <
+ adapter->rx_buff_pool[i].threshold))
+ return 1;
+ return 0;
}

/* kick the replenish tasklet if we need replenishing and it isn't already running */
@@ -275,11 +282,14 @@
/* replenish tasklet routine */
static void ibmveth_replenish_task(struct ibmveth_adapter *adapter)
{
+ int i;
+
adapter->replenish_task_cycles++;

- ibmveth_replenish_buffer_pool(adapter, &adapter->rx_buff_pool[0]);
- ibmveth_replenish_buffer_pool(adapter, &adapter->rx_buff_pool[1]);
- ibmveth_replenish_buffer_pool(adapter, &adapter->rx_buff_pool[2]);
+ for(i = 0; i < IbmVethNumBufferPools; i++)
+ if(adapter->rx_buff_pool[i].active)
+ ibmveth_replenish_buffer_pool(adapter,
+ &adapter->rx_buff_pool[i]);

adapter->rx_no_buffer = *(u64*)(((char*)adapter->buffer_list_addr) + 4096 - 8);

@@ -321,6 +331,7 @@
kfree(pool->skbuff);
pool->skbuff = NULL;
}
+ pool->active = 0;
}

/* remove a buffer from a pool */
@@ -379,6 +390,12 @@
ibmveth_assert(pool < IbmVethNumBufferPools);
ibmveth_assert(index < adapter->rx_buff_pool[pool].size);

+ if(!adapter->rx_buff_pool[pool].active) {
+ ibmveth_rxq_harvest_buffer(adapter);
+ ibmveth_free_buffer_pool(adapter, &adapter->rx_buff_pool[pool]);
+ return;
+ }
+
desc.desc = 0;
desc.fields.valid = 1;
desc.fields.length = adapter->rx_buff_pool[pool].buff_size;
@@ -409,6 +426,8 @@

static void ibmveth_cleanup(struct ibmveth_adapter *adapter)
{
+ int i;
+
if(adapter->buffer_list_addr != NULL) {
if(!dma_mapping_error(adapter->buffer_list_dma)) {
dma_unmap_single(&adapter->vdev->dev,
@@ -443,26 +462,24 @@
adapter->rx_queue.queue_addr = NULL;
}

- ibmveth_free_buffer_pool(adapter, &adapter->rx_buff_pool[0]);
- ibmveth_free_buffer_pool(adapter, &adapter->rx_buff_pool[1]);
- ibmveth_free_buffer_pool(adapter, &adapter->rx_buff_pool[2]);
+ for(i = 0; i<IbmVethNumBufferPools; i++)
+ ibmveth_free_buffer_pool(adapter, &adapter->rx_buff_pool[i]);
}

static int ibmveth_open(struct net_device *netdev)
{
struct ibmveth_adapter *adapter = netdev->priv;
u64 mac_address = 0;
- int rxq_entries;
+ int rxq_entries = 1;
unsigned long lpar_rc;
int rc;
union ibmveth_buf_desc rxq_desc;
+ int i;

ibmveth_debug_printk("open starting\n");

- rxq_entries =
- adapter->rx_buff_pool[0].size +
- adapter->rx_buff_pool[1].size +
- adapter->rx_buff_pool[2].size + 1;
+ for(i = 0; i<IbmVethNumBufferPools; i++)
+ rxq_entries += adapter->rx_buff_pool[i].size;

adapter->buffer_list_addr = (void*) get_zeroed_page(GFP_KERNEL);
adapter->filter_list_addr = (void*) get_zeroed_page(GFP_KERNEL);
@@ -502,14 +519,8 @@
adapter->rx_queue.num_slots = rxq_entries;
adapter->rx_queue.toggle = 1;

- if(ibmveth_alloc_buffer_pool(&adapter->rx_buff_pool[0]) ||
- ibmveth_alloc_buffer_pool(&adapter->rx_buff_pool[1]) ||
- ibmveth_alloc_buffer_pool(&adapter->rx_buff_pool[2]))
- {
- ibmveth_error_printk("unable to allocate buffer pools\n");
- ibmveth_cleanup(adapter);
- return -ENOMEM;
- }
+ /* call change_mtu to init the buffer pools based in initial mtu */
+ ibmveth_change_mtu(netdev, netdev->mtu);

memcpy(&mac_address, netdev->dev_addr, netdev->addr_len);
mac_address = mac_address >> 16;
@@ -885,17 +896,52 @@

static int ibmveth_change_mtu(struct net_device *dev, int new_mtu)
{
- if ((new_mtu < 68) || (new_mtu > (1<<20)))
+ struct ibmveth_adapter *adapter = dev->priv;
+ int i;
+ int prev_smaller = 1;
+
+ if ((new_mtu < 68) ||
+ (new_mtu > (pool_size[IbmVethNumBufferPools-1]) - IBMVETH_BUFF_OH))
return -EINVAL;
+
+ for(i = 0; i<IbmVethNumBufferPools; i++) {
+ int activate = 0;
+ if (new_mtu > (pool_size[i] - IBMVETH_BUFF_OH)) {
+ activate = 1;
+ prev_smaller= 1;
+ } else {
+ if (prev_smaller)
+ activate = 1;
+ prev_smaller= 0;
+ }
+
+ if (activate && !adapter->rx_buff_pool[i].active) {
+ struct ibmveth_buff_pool *pool =
+ &adapter->rx_buff_pool[i];
+ if(ibmveth_alloc_buffer_pool(pool)) {
+ ibmveth_error_printk("unable to alloc pool\n");
+ return -ENOMEM;
+ }
+ adapter->rx_buff_pool[i].active = 1;
+ } else if (!activate && adapter->rx_buff_pool[i].active) {
+ adapter->rx_buff_pool[i].active = 0;
+ h_free_logical_lan_buffer(adapter->vdev->unit_address,
+ (u64)pool_size[i]);
+ }
+
+ }
+
+
+ ibmveth_schedule_replenishing(adapter);
dev->mtu = new_mtu;
return 0;
}

static int __devinit ibmveth_probe(struct vio_dev *dev, const struct vio_device_id *id)
{
- int rc;
+ int rc, i;
struct net_device *netdev;
- struct ibmveth_adapter *adapter;
+ struct ibmveth_adapter *adapter = NULL;

unsigned char *mac_addr_p;
unsigned int *mcastFilterSize_p;
@@ -965,9 +1011,9 @@

memcpy(&netdev->dev_addr, &adapter->mac_addr, netdev->addr_len);

- ibmveth_init_buffer_pool(&adapter->rx_buff_pool[0], 0, IbmVethPool0DftCnt, IbmVethPool0DftSize);
- ibmveth_init_buffer_pool(&adapter->rx_buff_pool[1], 1, IbmVethPool1DftCnt, IbmVethPool1DftSize);
- ibmveth_init_buffer_pool(&adapter->rx_buff_pool[2], 2, IbmVethPool2DftCnt, IbmVethPool2DftSize);
+ for(i = 0; i<IbmVethNumBufferPools; i++)
+ ibmveth_init_buffer_pool(&adapter->rx_buff_pool[i], i,
+ pool_count[i], pool_size[i]);

ibmveth_debug_printk("adapter @ 0x%p\n", adapter);

diff -urN a/drivers/net/ibmveth.h b/drivers/net/ibmveth.h
--- a/drivers/net/ibmveth.h 2005-06-17 14:48:29.000000000 -0500
+++ b/drivers/net/ibmveth.h 2005-10-17 12:00:25.000000000 -0500
@@ -49,6 +49,7 @@
#define H_SEND_LOGICAL_LAN 0x120
#define H_MULTICAST_CTRL 0x130
#define H_CHANGE_LOGICAL_LAN_MAC 0x14C
+#define H_FREE_LOGICAL_LAN_BUFFER 0x1D4

/* hcall macros */
#define h_register_logical_lan(ua, buflst, rxq, fltlst, mac) \
@@ -69,13 +70,15 @@
#define h_change_logical_lan_mac(ua, mac) \
plpar_hcall_norets(H_CHANGE_LOGICAL_LAN_MAC, ua, mac)

-#define IbmVethNumBufferPools 3
-#define IbmVethPool0DftSize (1024 * 2)
-#define IbmVethPool1DftSize (1024 * 4)
-#define IbmVethPool2DftSize (1024 * 10)
-#define IbmVethPool0DftCnt 256
-#define IbmVethPool1DftCnt 256
-#define IbmVethPool2DftCnt 256
+#define h_free_logical_lan_buffer(ua, bufsize) \
+ plpar_hcall_norets(H_FREE_LOGICAL_LAN_BUFFER, ua, bufsize)
+
+#define IbmVethNumBufferPools 5
+#define IBMVETH_BUFF_OH 22 /* Overhead: 14 ethernet header + 8 opaque handle */
+
+/* pool_size should be sorted */
+static int pool_size[] = { 512, 1024 * 2, 1024 * 16, 1024 * 32, 1024 * 64 };
+static int pool_count[] = { 256, 768, 256, 256, 256 };

#define IBM_VETH_INVALID_MAP ((u16)0xffff)

@@ -90,6 +93,7 @@
u16 *free_map;
dma_addr_t *dma_addr;
struct sk_buff **skbuff;
+ int active;
};

struct ibmveth_rx_q {

2005-10-26 16:47:26

by Santiago Leon

Subject: [PATCH 4/5] ibmveth lockless TX

This patch adds the lockless TX feature to the ibmveth driver. The
hypervisor has its own locking, so the only change needed is to protect
the statistics counters.
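
For reference, the reason the statistics are the only shared state that
needs a lock: with NETIF_F_LLTX the core no longer serializes
hard_start_xmit() for the driver. A simplified paraphrase (not
verbatim) of the 2.6-era dev_queue_xmit() locking:

	if (dev->features & NETIF_F_LLTX) {
		/* lockless TX: the driver may be entered concurrently
		 * on several CPUs and must do its own locking */
		rc = dev->hard_start_xmit(skb, dev);
	} else {
		spin_lock(&dev->xmit_lock);
		dev->xmit_lock_owner = smp_processor_id();
		rc = dev->hard_start_xmit(skb, dev);
		dev->xmit_lock_owner = -1;
		spin_unlock(&dev->xmit_lock);
	}

The h_send_logical_lan hcall serializes in the hypervisor, so the
transmit path itself is safe; only the fold of the local counters into
adapter->stats needs the new stats_lock.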

Signed-off-by: Santiago Leon <[email protected]>
---

drivers/net/ibmveth.c | 44 +++++++++++++++++++++++++++++---------------
drivers/net/ibmveth.h | 1 +
2 files changed, 30 insertions(+), 15 deletions(-)

--- a/drivers/net/ibmveth.c 2005-10-17 12:37:06.000000000 -0500
+++ b/drivers/net/ibmveth.c 2005-10-18 11:58:56.000000000 -0500
@@ -621,12 +621,18 @@
unsigned long lpar_rc;
int nfrags = 0, curfrag;
unsigned long correlator;
+ unsigned long flags;
unsigned int retry_count;
+ unsigned int tx_dropped = 0;
+ unsigned int tx_bytes = 0;
+ unsigned int tx_packets = 0;
+ unsigned int tx_send_failed = 0;
+ unsigned int tx_map_failed = 0;
+

if ((skb_shinfo(skb)->nr_frags + 1) > IbmVethMaxSendFrags) {
- adapter->stats.tx_dropped++;
- dev_kfree_skb(skb);
- return 0;
+ tx_dropped++;
+ goto out;
}

memset(&desc, 0, sizeof(desc));
@@ -645,10 +651,9 @@

if(dma_mapping_error(desc[0].fields.address)) {
ibmveth_error_printk("tx: unable to map initial fragment\n");
- adapter->tx_map_failed++;
- adapter->stats.tx_dropped++;
- dev_kfree_skb(skb);
- return 0;
+ tx_map_failed++;
+ tx_dropped++;
+ goto out;
}

curfrag = nfrags;
@@ -665,8 +670,8 @@

if(dma_mapping_error(desc[curfrag+1].fields.address)) {
ibmveth_error_printk("tx: unable to map fragment %d\n", curfrag);
- adapter->tx_map_failed++;
- adapter->stats.tx_dropped++;
+ tx_map_failed++;
+ tx_dropped++;
/* Free all the mappings we just created */
while(curfrag < nfrags) {
dma_unmap_single(&adapter->vdev->dev,
@@ -675,8 +680,7 @@
DMA_TO_DEVICE);
curfrag++;
}
- dev_kfree_skb(skb);
- return 0;
+ goto out;
}
}

@@ -701,11 +705,11 @@
ibmveth_error_printk("tx: desc[%i] valid=%d, len=%d, address=0x%d\n", i,
desc[i].fields.valid, desc[i].fields.length, desc[i].fields.address);
}
- adapter->tx_send_failed++;
- adapter->stats.tx_dropped++;
+ tx_send_failed++;
+ tx_dropped++;
} else {
- adapter->stats.tx_packets++;
- adapter->stats.tx_bytes += skb->len;
+ tx_packets++;
+ tx_bytes += skb->len;
netdev->trans_start = jiffies;
}

@@ -715,6 +719,14 @@
desc[nfrags].fields.length, DMA_TO_DEVICE);
} while(--nfrags >= 0);

+out: spin_lock_irqsave(&adapter->stats_lock, flags);
+ adapter->stats.tx_dropped += tx_dropped;
+ adapter->stats.tx_bytes += tx_bytes;
+ adapter->stats.tx_packets += tx_packets;
+ adapter->tx_send_failed += tx_send_failed;
+ adapter->tx_map_failed += tx_map_failed;
+ spin_unlock_irqrestore(&adapter->stats_lock, flags);
+
dev_kfree_skb(skb);
return 0;
}
@@ -980,6 +992,8 @@
netdev->ethtool_ops = &netdev_ethtool_ops;
netdev->change_mtu = ibmveth_change_mtu;
SET_NETDEV_DEV(netdev, &dev->dev);
+ netdev->features |= NETIF_F_LLTX;
+ spin_lock_init(&adapter->stats_lock);

memcpy(&netdev->dev_addr, &adapter->mac_addr, netdev->addr_len);

diff -urN a/drivers/net/ibmveth.h b/drivers/net/ibmveth.h
--- a/drivers/net/ibmveth.h 2005-10-17 12:35:13.000000000 -0500
+++ b/drivers/net/ibmveth.h 2005-10-18 11:58:56.000000000 -0500
@@ -131,6 +131,7 @@
u64 tx_linearize_failed;
u64 tx_map_failed;
u64 tx_send_failed;
+ spinlock_t stats_lock;
};

struct ibmveth_buf_desc_fields {

2005-10-26 16:47:42

by Santiago Leon

Subject: [PATCH 5/5] ibmveth fix failed addbuf

This patch fixes a bug that happens when the hypervisor cannot add a
buffer. The old code wrote IBM_VETH_INVALID_MAP into the free_map
array, so the next time the index was used, an ibmveth_assert() caught
it and called BUG(). The patch writes the correct value into the
free_map array so that the index can be reused.
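
A simplified view of the free_map bookkeeping in
ibmveth_replenish_buffer_pool() (abridged, not a complete excerpt): the
slot index is taken from the free list before the hcall, so on failure
it has to be put back rather than marked invalid, otherwise the slot is
lost and a later pass trips the assert.

	free_index = pool->consumer_index++ % pool->size;
	index = pool->free_map[free_index];		/* take a free slot */
	pool->free_map[free_index] = IBM_VETH_INVALID_MAP;
	...
	lpar_rc = h_add_logical_lan_buffer(adapter->vdev->unit_address, desc.desc);

	if (lpar_rc != H_Success) {
		pool->free_map[free_index] = index;	/* give the slot back */
		pool->skbuff[index] = NULL;
		pool->consumer_index--;
	}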

Signed-off-by: Santiago Leon <[email protected]>
---

drivers/net/ibmveth.c | 2 +-
1 files changed, 1 insertion(+), 1 deletion(-)

--- a/drivers/net/ibmveth.c 2005-10-17 12:35:13.000000000 -0500
+++ b/drivers/net/ibmveth.c 2005-10-17 12:36:13.000000000 -0500
@@ -237,7 +237,7 @@
lpar_rc = h_add_logical_lan_buffer(adapter->vdev->unit_address, desc.desc);

if(lpar_rc != H_Success) {
- pool->free_map[free_index] = IBM_VETH_INVALID_MAP;
+ pool->free_map[free_index] = index;
pool->skbuff[index] = NULL;
pool->consumer_index--;
dma_unmap_single(&adapter->vdev->dev,

2005-10-28 20:08:04

by Jeff Garzik

Subject: Re: [PATCH 1/5] ibmveth fix bonding

Santiago Leon wrote:
> This patch updates dev->trans_start and dev->last_rx so that the ibmveth
> driver can be used with the ARP monitor in the bonding driver.
>
> Signed-off-by: Santiago Leon <[email protected]>

applied 1-5