2010-09-28 15:07:53

by Ido Yariv

[permalink] [raw]
Subject: [PATCH 0/2] wl1271: Packet aggregation optimizations

The WL1271 HW supports sending and receiving multiple packets within
a single transaction. This can substantially reduce CPU usage and
increase network throughput.

The following patches add support for RX and TX packet aggregation.
An opportunistic approach was taken - all packets that are currently
available for sending/receiving are aggregated and transferred
in a single transaction.

The patches were tested on a Zoom2 platform (SDIO only).

Ido Yariv (2):
wl1271: Support firmware RX packet aggregation
wl1271: Support firmware TX packet aggregation

drivers/net/wireless/wl12xx/wl1271.h | 5 ++
drivers/net/wireless/wl12xx/wl1271_main.c | 15 ++++-
drivers/net/wireless/wl12xx/wl1271_rx.c | 63 ++++++++++++++-----
drivers/net/wireless/wl12xx/wl1271_tx.c | 99 +++++++++++------------------
4 files changed, 103 insertions(+), 79 deletions(-)



2010-09-29 09:50:36

by Juuso Oikarinen

[permalink] [raw]
Subject: Re: [PATCH 0/2] wl1271: Packet aggregation optimizations

On Wed, 2010-09-29 at 08:28 +0200, ext Juuso Oikarinen wrote:
> On Tue, 2010-09-28 at 17:07 +0200, ext Ido Yariv wrote:
> > The WL1271 HW supports sending and receiving multiple packets within
> > a single transaction. This can substantially reduce CPU usage and
> > increase network throughput.
> >
> > The following patches add support for RX and TX packet aggregation.
> > An opportunistic approach was taken - all packets that are currently
> > available for sending/receiving are aggregated and transferred
> > in a single transaction.
> >
> > The patches were tested on a Zoom2 platform (SDIO only).
> >
> > Ido Yariv (2):
> > wl1271: Support firmware RX packet aggregation
> > wl1271: Support firmware TX packet aggregation
> >
>
> Thanks for these patches!
>
> We are testing them now.

I did some desktop-testing for these patches. For SDIO, I can see some
definite improvements in CPU usage during data transfers with these
patches despite the added memcpy operations.

However, the SPI is totally broken with these patches. I did some
debugging and noticed, that once the SPI transfers exceed 4500 bytes or
so, the hardware crashes.

Excerpt from syslog. I repeated this several times, and each time the
hardware went unresponsive after the larger transfer, like here:

[ 44.847198] TX length:
1552
[ 44.876983] RX length:
272
[ 46.265014] TX length:
1552
[ 46.270416] RX length:
120
[ 46.270629] TX length:
3104
[ 46.274963] RX length:
120
[ 46.275177] TX length:
4656
[ 46.285339] wl1271: ERROR SPI read busy-word
timeout!

-Juuso


2010-09-28 15:07:55

by Ido Yariv

[permalink] [raw]
Subject: [PATCH 1/2] wl1271: Support firmware RX packet aggregation

Instead of retrieving one packet at a time from the firmware, try to
retrieve all available packets at once.
This optimization decreases the number of transactions, which saves
CPU cycles and increases network throughput.

Signed-off-by: Ido Yariv <[email protected]>
---
drivers/net/wireless/wl12xx/wl1271.h | 5 ++
drivers/net/wireless/wl12xx/wl1271_main.c | 15 ++++++-
drivers/net/wireless/wl12xx/wl1271_rx.c | 63 +++++++++++++++++++++-------
3 files changed, 66 insertions(+), 17 deletions(-)

diff --git a/drivers/net/wireless/wl12xx/wl1271.h b/drivers/net/wireless/wl12xx/wl1271.h
index 779b130..8a4cd76 100644
--- a/drivers/net/wireless/wl12xx/wl1271.h
+++ b/drivers/net/wireless/wl12xx/wl1271.h
@@ -130,6 +130,8 @@ enum {

#define ACX_TX_DESCRIPTORS 32

+#define WL1271_AGGR_BUFFER_SIZE (4 * PAGE_SIZE)
+
enum wl1271_state {
WL1271_STATE_OFF,
WL1271_STATE_ON,
@@ -408,6 +410,9 @@ struct wl1271 {
/* Rx memory pool address */
struct wl1271_rx_mem_pool_addr rx_mem_pool_addr;

+ /* Intermediate buffer, used for packet aggregation */
+ u8 *aggr_buf;
+
/* The target interrupt mask */
struct work_struct irq_work;

diff --git a/drivers/net/wireless/wl12xx/wl1271_main.c b/drivers/net/wireless/wl12xx/wl1271_main.c
index cb18f22..8071da1 100644
--- a/drivers/net/wireless/wl12xx/wl1271_main.c
+++ b/drivers/net/wireless/wl12xx/wl1271_main.c
@@ -2459,6 +2459,7 @@ struct ieee80211_hw *wl1271_alloc_hw(void)
struct platform_device *plat_dev = NULL;
struct wl1271 *wl;
int i, ret;
+ unsigned int order;

hw = ieee80211_alloc_hw(sizeof(*wl), &wl1271_ops);
if (!hw) {
@@ -2517,11 +2518,18 @@ struct ieee80211_hw *wl1271_alloc_hw(void)

wl1271_debugfs_init(wl);

+ order = get_order(WL1271_AGGR_BUFFER_SIZE);
+ wl->aggr_buf = (u8 *)__get_free_pages(GFP_KERNEL, order);
+ if (!wl->aggr_buf) {
+ ret = -ENOMEM;
+ goto err_hw;
+ }
+
/* Register platform device */
ret = platform_device_register(wl->plat_dev);
if (ret) {
wl1271_error("couldn't register platform device");
- goto err_hw;
+ goto err_aggr;
}
dev_set_drvdata(&wl->plat_dev->dev, wl);

@@ -2547,6 +2555,9 @@ err_bt_coex_state:
err_platform:
platform_device_unregister(wl->plat_dev);

+err_aggr:
+ free_pages((unsigned long)wl->aggr_buf, order);
+
err_hw:
wl1271_debugfs_exit(wl);
kfree(plat_dev);
@@ -2563,6 +2574,8 @@ EXPORT_SYMBOL_GPL(wl1271_alloc_hw);
int wl1271_free_hw(struct wl1271 *wl)
{
platform_device_unregister(wl->plat_dev);
+ free_pages((unsigned long)wl->aggr_buf,
+ get_order(WL1271_AGGR_BUFFER_SIZE));
kfree(wl->plat_dev);

wl1271_debugfs_exit(wl);
diff --git a/drivers/net/wireless/wl12xx/wl1271_rx.c b/drivers/net/wireless/wl12xx/wl1271_rx.c
index 94da5dd..bea133b 100644
--- a/drivers/net/wireless/wl12xx/wl1271_rx.c
+++ b/drivers/net/wireless/wl12xx/wl1271_rx.c
@@ -74,7 +74,7 @@ static void wl1271_rx_status(struct wl1271 *wl,
}
}

-static void wl1271_rx_handle_data(struct wl1271 *wl, u32 length)
+static int wl1271_rx_handle_data(struct wl1271 *wl, u8 *data, u32 length)
{
struct wl1271_rx_descriptor *desc;
struct sk_buff *skb;
@@ -87,16 +87,16 @@ static void wl1271_rx_handle_data(struct wl1271 *wl, u32 length)
* workaround this by not retrieving them at all.
*/
if (unlikely(wl->state == WL1271_STATE_PLT))
- return;
+ return -EINVAL;

skb = __dev_alloc_skb(length, GFP_KERNEL);
if (!skb) {
wl1271_error("Couldn't allocate RX frame");
- return;
+ return -ENOMEM;
}

buf = skb_put(skb, length);
- wl1271_read(wl, WL1271_SLV_MEM_DATA, buf, length, true);
+ memcpy(buf, data, length);

/* the data read starts with the descriptor */
desc = (struct wl1271_rx_descriptor *) buf;
@@ -116,6 +116,8 @@ static void wl1271_rx_handle_data(struct wl1271 *wl, u32 length)
skb_trim(skb, skb->len - desc->pad_len);

ieee80211_rx_ni(wl->hw, skb);
+
+ return 0;
}

void wl1271_rx(struct wl1271 *wl, struct wl1271_fw_status *status)
@@ -124,31 +126,60 @@ void wl1271_rx(struct wl1271 *wl, struct wl1271_fw_status *status)
u32 buf_size;
u32 fw_rx_counter = status->fw_rx_counter & NUM_RX_PKT_DESC_MOD_MASK;
u32 drv_rx_counter = wl->rx_counter & NUM_RX_PKT_DESC_MOD_MASK;
+ u32 rx_counter;
u32 mem_block;
+ u32 pkt_length;
+ u32 pkt_offset;

while (drv_rx_counter != fw_rx_counter) {
- mem_block = wl1271_rx_get_mem_block(status, drv_rx_counter);
- buf_size = wl1271_rx_get_buf_size(status, drv_rx_counter);
+ buf_size = 0;
+ rx_counter = drv_rx_counter;
+ while (rx_counter != fw_rx_counter) {
+ pkt_length = wl1271_rx_get_buf_size(status, rx_counter);
+ if (buf_size + pkt_length > WL1271_AGGR_BUFFER_SIZE)
+ break;
+ buf_size += pkt_length;
+ rx_counter++;
+ rx_counter &= NUM_RX_PKT_DESC_MOD_MASK;
+ }

if (buf_size == 0) {
wl1271_warning("received empty data");
break;
}

+ /*
+ * Choose the block we want to read
+ * For aggregated packets, only the first memory block should
+ * be retrieved. The FW takes care of the rest.
+ */
+ mem_block = wl1271_rx_get_mem_block(status, drv_rx_counter);
wl->rx_mem_pool_addr.addr = (mem_block << 8) +
le32_to_cpu(wl_mem_map->packet_memory_pool_start);
wl->rx_mem_pool_addr.addr_extra =
wl->rx_mem_pool_addr.addr + 4;
-
- /* Choose the block we want to read */
wl1271_write(wl, WL1271_SLV_REG_DATA, &wl->rx_mem_pool_addr,
- sizeof(wl->rx_mem_pool_addr), false);
-
- wl1271_rx_handle_data(wl, buf_size);
-
- wl->rx_counter++;
- drv_rx_counter = wl->rx_counter & NUM_RX_PKT_DESC_MOD_MASK;
+ sizeof(wl->rx_mem_pool_addr), false);
+
+ /* Read all available packets at once */
+ wl1271_read(wl, WL1271_SLV_MEM_DATA, wl->aggr_buf,
+ buf_size, true);
+
+ /* Split data into separate packets */
+ pkt_offset = 0;
+ while (pkt_offset < buf_size) {
+ pkt_length = wl1271_rx_get_buf_size(status,
+ drv_rx_counter);
+ if (wl1271_rx_handle_data(wl,
+ wl->aggr_buf + pkt_offset,
+ pkt_length) < 0)
+ break;
+ wl->rx_counter++;
+ drv_rx_counter++;
+ drv_rx_counter &= NUM_RX_PKT_DESC_MOD_MASK;
+ pkt_offset += pkt_length;
+ }
}
-
- wl1271_write32(wl, RX_DRIVER_COUNTER_ADDRESS, wl->rx_counter);
+ wl1271_write32(wl, RX_DRIVER_COUNTER_ADDRESS,
+ cpu_to_le32(wl->rx_counter));
}
--
1.7.0.4


2010-09-29 06:28:35

by Juuso Oikarinen

[permalink] [raw]
Subject: Re: [PATCH 0/2] wl1271: Packet aggregation optimizations

On Tue, 2010-09-28 at 17:07 +0200, ext Ido Yariv wrote:
> The WL1271 HW supports sending and receiving multiple packets within
> a single transaction. This can substantially reduce CPU usage and
> increase network throughput.
>
> The following patches add support for RX and TX packet aggregation.
> An opportunistic approach was taken - all packets that are currently
> available for sending/receiving are aggregated and transferred
> in a single transaction.
>
> The patches were tested on a Zoom2 platform (SDIO only).
>
> Ido Yariv (2):
> wl1271: Support firmware RX packet aggregation
> wl1271: Support firmware TX packet aggregation
>

Thanks for these patches!

We are testing them now.

-Juuso


2010-09-28 15:07:58

by Ido Yariv

[permalink] [raw]
Subject: [PATCH 2/2] wl1271: Support firmware TX packet aggregation

Instead of sending one packet at a time to the firmware, try to
send all available packets at once.
This optimization decreases the number of transactions, which saves
CPU cycles and increases network throughput.

Signed-off-by: Ido Yariv <[email protected]>
---
drivers/net/wireless/wl12xx/wl1271_tx.c | 99 ++++++++++++-------------------
1 files changed, 37 insertions(+), 62 deletions(-)

diff --git a/drivers/net/wireless/wl12xx/wl1271_tx.c b/drivers/net/wireless/wl12xx/wl1271_tx.c
index 1b8295c..e3dc13c 100644
--- a/drivers/net/wireless/wl12xx/wl1271_tx.c
+++ b/drivers/net/wireless/wl12xx/wl1271_tx.c
@@ -43,13 +43,17 @@ static int wl1271_tx_id(struct wl1271 *wl, struct sk_buff *skb)
return -EBUSY;
}

-static int wl1271_tx_allocate(struct wl1271 *wl, struct sk_buff *skb, u32 extra)
+static int wl1271_tx_allocate(struct wl1271 *wl, struct sk_buff *skb, u32 extra,
+ u32 buf_offset)
{
struct wl1271_tx_hw_descr *desc;
u32 total_len = skb->len + sizeof(struct wl1271_tx_hw_descr) + extra;
u32 total_blocks;
int id, ret = -EBUSY;

+ if (buf_offset + total_len > WL1271_AGGR_BUFFER_SIZE)
+ return -EBUSY;
+
/* allocate free identifier for the packet */
id = wl1271_tx_id(wl, skb);
if (id < 0)
@@ -82,7 +86,7 @@ static int wl1271_tx_allocate(struct wl1271 *wl, struct sk_buff *skb, u32 extra)
return ret;
}

-static int wl1271_tx_fill_hdr(struct wl1271 *wl, struct sk_buff *skb,
+static void wl1271_tx_fill_hdr(struct wl1271 *wl, struct sk_buff *skb,
u32 extra, struct ieee80211_tx_info *control)
{
struct timespec ts;
@@ -133,59 +137,17 @@ static int wl1271_tx_fill_hdr(struct wl1271 *wl, struct sk_buff *skb,
desc->tx_attr = cpu_to_le16(tx_attr);

wl1271_debug(DEBUG_TX, "tx_fill_hdr: pad: %d", pad);
- return 0;
-}
-
-static int wl1271_tx_send_packet(struct wl1271 *wl, struct sk_buff *skb,
- struct ieee80211_tx_info *control)
-{
-
- struct wl1271_tx_hw_descr *desc;
- int len;
-
- /* FIXME: This is a workaround for getting non-aligned packets.
- This happens at least with EAPOL packets from the user space.
- Our DMA requires packets to be aligned on a 4-byte boundary.
- */
- if (unlikely((long)skb->data & 0x03)) {
- int offset = (4 - (long)skb->data) & 0x03;
- wl1271_debug(DEBUG_TX, "skb offset %d", offset);
-
- /* check whether the current skb can be used */
- if (!skb_cloned(skb) && (skb_tailroom(skb) >= offset)) {
- unsigned char *src = skb->data;
-
- /* align the buffer on a 4-byte boundary */
- skb_reserve(skb, offset);
- memmove(skb->data, src, skb->len);
- } else {
- wl1271_info("No handler, fixme!");
- return -EINVAL;
- }
- }
-
- len = WL1271_TX_ALIGN(skb->len);
-
- /* perform a fixed address block write with the packet */
- wl1271_write(wl, WL1271_SLV_MEM_DATA, skb->data, len, true);
-
- /* write packet new counter into the write access register */
- wl->tx_packets_count++;
-
- desc = (struct wl1271_tx_hw_descr *) skb->data;
- wl1271_debug(DEBUG_TX, "tx id %u skb 0x%p payload %u (%u words)",
- desc->id, skb, len, desc->length);
-
- return 0;
}

/* caller must hold wl->mutex */
-static int wl1271_tx_frame(struct wl1271 *wl, struct sk_buff *skb)
+static int wl1271_prepare_tx_frame(struct wl1271 *wl, struct sk_buff *skb,
+ u32 buf_offset)
{
struct ieee80211_tx_info *info;
u32 extra = 0;
int ret = 0;
u8 idx;
+ u32 total_len;

if (!skb)
return -EINVAL;
@@ -208,19 +170,22 @@ static int wl1271_tx_frame(struct wl1271 *wl, struct sk_buff *skb)
}
}

- ret = wl1271_tx_allocate(wl, skb, extra);
+ ret = wl1271_tx_allocate(wl, skb, extra, buf_offset);
if (ret < 0)
return ret;

- ret = wl1271_tx_fill_hdr(wl, skb, extra, info);
- if (ret < 0)
- return ret;
+ wl1271_tx_fill_hdr(wl, skb, extra, info);

- ret = wl1271_tx_send_packet(wl, skb, info);
- if (ret < 0)
- return ret;
+ /*
+ * The length of each packet is stored in terms of words. Thus, we must
+ * pad the skb data to make sure its length is aligned.
+ * The number of padding bytes is computed and set in wl1271_tx_fill_hdr
+ */
+ total_len = WL1271_TX_ALIGN(skb->len);
+ memcpy(wl->aggr_buf + buf_offset, skb->data, skb->len);
+ memset(wl->aggr_buf + buf_offset + skb->len, 0, total_len - skb->len);

- return ret;
+ return total_len;
}

u32 wl1271_tx_enabled_rates_get(struct wl1271 *wl, u32 rate_set)
@@ -245,7 +210,7 @@ void wl1271_tx_work(struct work_struct *work)
struct sk_buff *skb;
bool woken_up = false;
u32 sta_rates = 0;
- u32 prev_tx_packets_count;
+ u32 buf_offset;
int ret;

/* check if the rates supported by the AP have changed */
@@ -262,14 +227,15 @@ void wl1271_tx_work(struct work_struct *work)
if (unlikely(wl->state == WL1271_STATE_OFF))
goto out;

- prev_tx_packets_count = wl->tx_packets_count;
-
/* if rates have changed, re-configure the rate policy */
if (unlikely(sta_rates)) {
wl->rate_set = wl1271_tx_enabled_rates_get(wl, sta_rates);
wl1271_acx_rate_policies(wl);
}

+ /* Prepare the transfer buffer, by aggregating all
+ * available packets */
+ buf_offset = 0;
while ((skb = skb_dequeue(&wl->tx_queue))) {
if (!woken_up) {
ret = wl1271_ps_elp_wakeup(wl, false);
@@ -278,21 +244,30 @@ void wl1271_tx_work(struct work_struct *work)
woken_up = true;
}

- ret = wl1271_tx_frame(wl, skb);
+ ret = wl1271_prepare_tx_frame(wl, skb, buf_offset);
if (ret == -EBUSY) {
- /* firmware buffer is full, lets stop transmitting. */
+ /*
+ * Either the firmware buffer is full, or the
+ * aggregation buffer is.
+ * Queue back last skb, and stop aggregating.
+ */
skb_queue_head(&wl->tx_queue, skb);
goto out_ack;
} else if (ret < 0) {
dev_kfree_skb(skb);
goto out_ack;
}
+ buf_offset += ret;
+ wl->tx_packets_count++;
}

out_ack:
- /* interrupt the firmware with the new packets */
- if (prev_tx_packets_count != wl->tx_packets_count)
+ if (buf_offset) {
+ wl1271_write(wl, WL1271_SLV_MEM_DATA, wl->aggr_buf,
+ buf_offset, true);
+ /* interrupt the firmware with the new packets */
wl1271_write32(wl, WL1271_HOST_WR_ACCESS, wl->tx_packets_count);
+ }

out:
if (woken_up)
--
1.7.0.4