LinuxLists.cc - [PATCH v3 0/3] wl1271: Packet aggregation optimizations

2010-09-30 13:28:35

Subject: [PATCH v3 0/3] wl1271: Packet aggregation optimizations

The WL1271 HW supports sending and receiving multiple packets within
a single transaction. This can substantially reduce CPU usage and
increase network throughput.

The following patches add support for RX and TX packet aggregation.
An opportunistic approach was taken - all packets that are currently
available for sending/receiving are aggregated and transferred
in a single transaction.

On SPI platforms, tests done by Juuso Oikarinen have shown a 20% improvement
in CPU usage.
On SDIO platforms, some tests have shown an improvement of over 30% in CPU
usage.

Changes since v2:
Fix debug output in the SPI callbacks.

Changes since v1:
The SPI callbacks do not handle large transfers well. The first patch fixes
this by fragmenting large transfers into smaller ones.

Thanks Juuso Oikarinen for reporting the SPI issue and helping with testing
and benchmarking these optimizations.

Ido Yariv (3):
wl1271: Handle large SPI transfers
wl1271: Support firmware RX packet aggregation
wl1271: Support firmware TX packet aggregation

drivers/net/wireless/wl12xx/wl1271.h | 5 +
drivers/net/wireless/wl12xx/wl1271_main.c | 15 +++-
drivers/net/wireless/wl12xx/wl1271_rx.c | 63 ++++++++++----
drivers/net/wireless/wl12xx/wl1271_spi.c | 140 ++++++++++++++++++-----------
drivers/net/wireless/wl12xx/wl1271_tx.c | 99 ++++++++-------------
5 files changed, 189 insertions(+), 133 deletions(-)

2010-09-30 13:28:41

by Ido Yariv

[permalink] [raw]

Subject: [PATCH v3 3/3] wl1271: Support firmware TX packet aggregation

Instead of sending one packet at a time to the firmware, try to
send all available packets at once.
This optimization decreases the number of transactions, which saves
CPU cycles and increases network throughput.

Signed-off-by: Ido Yariv <[email protected]>
---
drivers/net/wireless/wl12xx/wl1271_tx.c | 99 ++++++++++++-------------------
1 files changed, 37 insertions(+), 62 deletions(-)

diff --git a/drivers/net/wireless/wl12xx/wl1271_tx.c b/drivers/net/wireless/wl12xx/wl1271_tx.c
index 1b8295c..e3dc13c 100644
--- a/drivers/net/wireless/wl12xx/wl1271_tx.c
+++ b/drivers/net/wireless/wl12xx/wl1271_tx.c
@@ -43,13 +43,17 @@ static int wl1271_tx_id(struct wl1271 *wl, struct sk_buff *skb)
return -EBUSY;
}

-static int wl1271_tx_allocate(struct wl1271 *wl, struct sk_buff *skb, u32 extra)
+static int wl1271_tx_allocate(struct wl1271 *wl, struct sk_buff *skb, u32 extra,
+ u32 buf_offset)
{
struct wl1271_tx_hw_descr *desc;
u32 total_len = skb->len + sizeof(struct wl1271_tx_hw_descr) + extra;
u32 total_blocks;
int id, ret = -EBUSY;

+ if (buf_offset + total_len > WL1271_AGGR_BUFFER_SIZE)
+ return -EBUSY;
+
/* allocate free identifier for the packet */
id = wl1271_tx_id(wl, skb);
if (id < 0)
@@ -82,7 +86,7 @@ static int wl1271_tx_allocate(struct wl1271 *wl, struct sk_buff *skb, u32 extra)
return ret;
}

-static int wl1271_tx_fill_hdr(struct wl1271 *wl, struct sk_buff *skb,
+static void wl1271_tx_fill_hdr(struct wl1271 *wl, struct sk_buff *skb,
u32 extra, struct ieee80211_tx_info *control)
{
struct timespec ts;
@@ -133,59 +137,17 @@ static int wl1271_tx_fill_hdr(struct wl1271 *wl, struct sk_buff *skb,
desc->tx_attr = cpu_to_le16(tx_attr);

wl1271_debug(DEBUG_TX, "tx_fill_hdr: pad: %d", pad);
- return 0;
-}
-
-static int wl1271_tx_send_packet(struct wl1271 *wl, struct sk_buff *skb,
- struct ieee80211_tx_info *control)
-{
-
- struct wl1271_tx_hw_descr *desc;
- int len;
-
- /* FIXME: This is a workaround for getting non-aligned packets.
- This happens at least with EAPOL packets from the user space.
- Our DMA requires packets to be aligned on a 4-byte boundary.
- */
- if (unlikely((long)skb->data & 0x03)) {
- int offset = (4 - (long)skb->data) & 0x03;
- wl1271_debug(DEBUG_TX, "skb offset %d", offset);
-
- /* check whether the current skb can be used */
- if (!skb_cloned(skb) && (skb_tailroom(skb) >= offset)) {
- unsigned char *src = skb->data;
-
- /* align the buffer on a 4-byte boundary */
- skb_reserve(skb, offset);
- memmove(skb->data, src, skb->len);
- } else {
- wl1271_info("No handler, fixme!");
- return -EINVAL;
- }
- }
-
- len = WL1271_TX_ALIGN(skb->len);
-
- /* perform a fixed address block write with the packet */
- wl1271_write(wl, WL1271_SLV_MEM_DATA, skb->data, len, true);
-
- /* write packet new counter into the write access register */
- wl->tx_packets_count++;
-
- desc = (struct wl1271_tx_hw_descr *) skb->data;
- wl1271_debug(DEBUG_TX, "tx id %u skb 0x%p payload %u (%u words)",
- desc->id, skb, len, desc->length);
-
- return 0;
}

/* caller must hold wl->mutex */
-static int wl1271_tx_frame(struct wl1271 *wl, struct sk_buff *skb)
+static int wl1271_prepare_tx_frame(struct wl1271 *wl, struct sk_buff *skb,
+ u32 buf_offset)
{
struct ieee80211_tx_info *info;
u32 extra = 0;
int ret = 0;
u8 idx;
+ u32 total_len;

if (!skb)
return -EINVAL;
@@ -208,19 +170,22 @@ static int wl1271_tx_frame(struct wl1271 *wl, struct sk_buff *skb)
}
}

- ret = wl1271_tx_allocate(wl, skb, extra);
+ ret = wl1271_tx_allocate(wl, skb, extra, buf_offset);
if (ret < 0)
return ret;

- ret = wl1271_tx_fill_hdr(wl, skb, extra, info);
- if (ret < 0)
- return ret;
+ wl1271_tx_fill_hdr(wl, skb, extra, info);

- ret = wl1271_tx_send_packet(wl, skb, info);
- if (ret < 0)
- return ret;
+ /*
+ * The length of each packet is stored in terms of words. Thus, we must
+ * pad the skb data to make sure its length is aligned.
+ * The number of padding bytes is computed and set in wl1271_tx_fill_hdr
+ */
+ total_len = WL1271_TX_ALIGN(skb->len);
+ memcpy(wl->aggr_buf + buf_offset, skb->data, skb->len);
+ memset(wl->aggr_buf + buf_offset + skb->len, 0, total_len - skb->len);

- return ret;
+ return total_len;
}

u32 wl1271_tx_enabled_rates_get(struct wl1271 *wl, u32 rate_set)
@@ -245,7 +210,7 @@ void wl1271_tx_work(struct work_struct *work)
struct sk_buff *skb;
bool woken_up = false;
u32 sta_rates = 0;
- u32 prev_tx_packets_count;
+ u32 buf_offset;
int ret;

/* check if the rates supported by the AP have changed */
@@ -262,14 +227,15 @@ void wl1271_tx_work(struct work_struct *work)
if (unlikely(wl->state == WL1271_STATE_OFF))
goto out;

- prev_tx_packets_count = wl->tx_packets_count;
-
/* if rates have changed, re-configure the rate policy */
if (unlikely(sta_rates)) {
wl->rate_set = wl1271_tx_enabled_rates_get(wl, sta_rates);
wl1271_acx_rate_policies(wl);
}

+ /* Prepare the transfer buffer, by aggregating all
+ * available packets */
+ buf_offset = 0;
while ((skb = skb_dequeue(&wl->tx_queue))) {
if (!woken_up) {
ret = wl1271_ps_elp_wakeup(wl, false);
@@ -278,21 +244,30 @@ void wl1271_tx_work(struct work_struct *work)
woken_up = true;
}

- ret = wl1271_tx_frame(wl, skb);
+ ret = wl1271_prepare_tx_frame(wl, skb, buf_offset);
if (ret == -EBUSY) {
- /* firmware buffer is full, lets stop transmitting. */
+ /*
+ * Either the firmware buffer is full, or the
+ * aggregation buffer is.
+ * Queue back last skb, and stop aggregating.
+ */
skb_queue_head(&wl->tx_queue, skb);
goto out_ack;
} else if (ret < 0) {
dev_kfree_skb(skb);
goto out_ack;
}
+ buf_offset += ret;
+ wl->tx_packets_count++;
}

out_ack:
- /* interrupt the firmware with the new packets */
- if (prev_tx_packets_count != wl->tx_packets_count)
+ if (buf_offset) {
+ wl1271_write(wl, WL1271_SLV_MEM_DATA, wl->aggr_buf,
+ buf_offset, true);
+ /* interrupt the firmware with the new packets */
wl1271_write32(wl, WL1271_HOST_WR_ACCESS, wl->tx_packets_count);
+ }

out:
if (woken_up)
--
1.7.0.4

2010-09-30 13:28:39

by Ido Yariv

[permalink] [raw]

Subject: [PATCH v3 1/3] wl1271: Handle large SPI transfers

The HW supports up to 4095 bytes transfers via SPI. The SPI read & write
operations do not handle larger transfers, causing the HW to stall in such
cases.

Fix this by fragmenting large transfers into smaller chunks, and
transferring each one separately.

Signed-off-by: Ido Yariv <[email protected]>
Tested-by: Juuso Oikarinen <[email protected]>
---
drivers/net/wireless/wl12xx/wl1271_spi.c | 140 ++++++++++++++++++------------
1 files changed, 86 insertions(+), 54 deletions(-)

diff --git a/drivers/net/wireless/wl12xx/wl1271_spi.c b/drivers/net/wireless/wl12xx/wl1271_spi.c
index 75cbf36..ef80168 100644
--- a/drivers/net/wireless/wl12xx/wl1271_spi.c
+++ b/drivers/net/wireless/wl12xx/wl1271_spi.c
@@ -63,6 +63,11 @@
((WL1271_BUSY_WORD_LEN - 4) / sizeof(u32))
#define HW_ACCESS_WSPI_INIT_CMD_MASK 0

+/* HW limitation: maximum possible chunk size is 4095 bytes */
+#define WSPI_MAX_CHUNK_SIZE 4092
+
+#define WSPI_MAX_NUM_OF_CHUNKS (WL1271_AGGR_BUFFER_SIZE / WSPI_MAX_CHUNK_SIZE)
+
static inline struct spi_device *wl_to_spi(struct wl1271 *wl)
{
return wl->if_priv;
@@ -202,90 +207,117 @@ static int wl1271_spi_read_busy(struct wl1271 *wl)
static void wl1271_spi_raw_read(struct wl1271 *wl, int addr, void *buf,
size_t len, bool fixed)
{
- struct spi_transfer t[3];
+ struct spi_transfer t[2];
struct spi_message m;
u32 *busy_buf;
u32 *cmd;
+ u32 chunk_len;

- cmd = &wl->buffer_cmd;
- busy_buf = wl->buffer_busyword;
+ while (len > 0) {
+ chunk_len = min((size_t)WSPI_MAX_CHUNK_SIZE, len);

- *cmd = 0;
- *cmd |= WSPI_CMD_READ;
- *cmd |= (len << WSPI_CMD_BYTE_LENGTH_OFFSET) & WSPI_CMD_BYTE_LENGTH;
- *cmd |= addr & WSPI_CMD_BYTE_ADDR;
+ cmd = &wl->buffer_cmd;
+ busy_buf = wl->buffer_busyword;

- if (fixed)
- *cmd |= WSPI_CMD_FIXED;
+ *cmd = 0;
+ *cmd |= WSPI_CMD_READ;
+ *cmd |= (chunk_len << WSPI_CMD_BYTE_LENGTH_OFFSET) &
+ WSPI_CMD_BYTE_LENGTH;
+ *cmd |= addr & WSPI_CMD_BYTE_ADDR;

- spi_message_init(&m);
- memset(t, 0, sizeof(t));
+ if (fixed)
+ *cmd |= WSPI_CMD_FIXED;

- t[0].tx_buf = cmd;
- t[0].len = 4;
- t[0].cs_change = true;
- spi_message_add_tail(&t[0], &m);
+ spi_message_init(&m);
+ memset(t, 0, sizeof(t));

- /* Busy and non busy words read */
- t[1].rx_buf = busy_buf;
- t[1].len = WL1271_BUSY_WORD_LEN;
- t[1].cs_change = true;
- spi_message_add_tail(&t[1], &m);
+ t[0].tx_buf = cmd;
+ t[0].len = 4;
+ t[0].cs_change = true;
+ spi_message_add_tail(&t[0], &m);

- spi_sync(wl_to_spi(wl), &m);
+ /* Busy and non busy words read */
+ t[1].rx_buf = busy_buf;
+ t[1].len = WL1271_BUSY_WORD_LEN;
+ t[1].cs_change = true;
+ spi_message_add_tail(&t[1], &m);

- if (!(busy_buf[WL1271_BUSY_WORD_CNT - 1] & 0x1) &&
- wl1271_spi_read_busy(wl)) {
- memset(buf, 0, len);
- return;
- }
+ spi_sync(wl_to_spi(wl), &m);

- spi_message_init(&m);
- memset(t, 0, sizeof(t));
+ if (!(busy_buf[WL1271_BUSY_WORD_CNT - 1] & 0x1) &&
+ wl1271_spi_read_busy(wl)) {
+ memset(buf, 0, chunk_len);
+ return;
+ }

- t[0].rx_buf = buf;
- t[0].len = len;
- t[0].cs_change = true;
- spi_message_add_tail(&t[0], &m);
+ spi_message_init(&m);
+ memset(t, 0, sizeof(t));

- spi_sync(wl_to_spi(wl), &m);
+ t[0].rx_buf = buf;
+ t[0].len = chunk_len;
+ t[0].cs_change = true;
+ spi_message_add_tail(&t[0], &m);
+
+ spi_sync(wl_to_spi(wl), &m);

- wl1271_dump(DEBUG_SPI, "spi_read cmd -> ", cmd, sizeof(*cmd));
- wl1271_dump(DEBUG_SPI, "spi_read buf <- ", buf, len);
+ wl1271_dump(DEBUG_SPI, "spi_read cmd -> ", cmd, sizeof(*cmd));
+ wl1271_dump(DEBUG_SPI, "spi_read buf <- ", buf, chunk_len);
+
+ if (!fixed)
+ addr += chunk_len;
+ buf += chunk_len;
+ len -= chunk_len;
+ }
}

static void wl1271_spi_raw_write(struct wl1271 *wl, int addr, void *buf,
size_t len, bool fixed)
{
- struct spi_transfer t[2];
+ struct spi_transfer t[2 * WSPI_MAX_NUM_OF_CHUNKS];
struct spi_message m;
+ u32 commands[WSPI_MAX_NUM_OF_CHUNKS];
u32 *cmd;
+ u32 chunk_len;
+ int i;

- cmd = &wl->buffer_cmd;
-
- *cmd = 0;
- *cmd |= WSPI_CMD_WRITE;
- *cmd |= (len << WSPI_CMD_BYTE_LENGTH_OFFSET) & WSPI_CMD_BYTE_LENGTH;
- *cmd |= addr & WSPI_CMD_BYTE_ADDR;
-
- if (fixed)
- *cmd |= WSPI_CMD_FIXED;
+ WARN_ON(len > WL1271_AGGR_BUFFER_SIZE);

spi_message_init(&m);
memset(t, 0, sizeof(t));

- t[0].tx_buf = cmd;
- t[0].len = sizeof(*cmd);
- spi_message_add_tail(&t[0], &m);
+ cmd = &commands[0];
+ i = 0;
+ while (len > 0) {
+ chunk_len = min((size_t)WSPI_MAX_CHUNK_SIZE, len);

- t[1].tx_buf = buf;
- t[1].len = len;
- spi_message_add_tail(&t[1], &m);
+ *cmd = 0;
+ *cmd |= WSPI_CMD_WRITE;
+ *cmd |= (chunk_len << WSPI_CMD_BYTE_LENGTH_OFFSET) &
+ WSPI_CMD_BYTE_LENGTH;
+ *cmd |= addr & WSPI_CMD_BYTE_ADDR;

- spi_sync(wl_to_spi(wl), &m);
+ if (fixed)
+ *cmd |= WSPI_CMD_FIXED;
+
+ t[i].tx_buf = cmd;
+ t[i].len = sizeof(*cmd);
+ spi_message_add_tail(&t[i++], &m);
+
+ t[i].tx_buf = buf;
+ t[i].len = chunk_len;
+ spi_message_add_tail(&t[i++], &m);

- wl1271_dump(DEBUG_SPI, "spi_write cmd -> ", cmd, sizeof(*cmd));
- wl1271_dump(DEBUG_SPI, "spi_write buf -> ", buf, len);
+ wl1271_dump(DEBUG_SPI, "spi_write cmd -> ", cmd, sizeof(*cmd));
+ wl1271_dump(DEBUG_SPI, "spi_write buf -> ", buf, chunk_len);
+
+ if (!fixed)
+ addr += chunk_len;
+ buf += chunk_len;
+ len -= chunk_len;
+ cmd++;
+ }
+
+ spi_sync(wl_to_spi(wl), &m);
}

static irqreturn_t wl1271_irq(int irq, void *cookie)
--
1.7.0.4

2010-09-30 19:25:09

by Luciano Coelho

[permalink] [raw]

Subject: Re: [PATCH v3 0/3] wl1271: Packet aggregation optimizations

On Thu, 2010-09-30 at 15:28 +0200, ext Ido Yariv wrote:
> The WL1271 HW supports sending and receiving multiple packets within
> a single transaction. This can substantially reduce CPU usage and
> increase network throughput.
>
> The following patches add support for RX and TX packet aggregation.
> An opportunistic approach was taken - all packets that are currently
> available for sending/receiving are aggregated and transferred
> in a single transaction.
>
> On SPI platforms, tests done by Juuso Oikarinen have shown a 20% improvement
> in CPU usage.
> On SDIO platforms, some tests have shown an improvement of over 30% in CPU
> usage.
>
> Changes since v2:
> Fix debug output in the SPI callbacks.
>
> Changes since v1:
> The SPI callbacks do not handle large transfers well. The first patch fixes
> this by fragmenting large transfers into smaller ones.
>
> Thanks Juuso Oikarinen for reporting the SPI issue and helping with testing
> and benchmarking these optimizations.

This looks very good. Thanks for your work and thanks Juuso for the
tests and verification.

I'll apply this as soon as John pulls our wl12xx tree, so we can keep it
linear.

--
Cheers,
Luca.

2010-09-30 13:28:40

by Ido Yariv

[permalink] [raw]

Subject: [PATCH v3 2/3] wl1271: Support firmware RX packet aggregation

Instead of retrieving one packet at a time from the firmware, try to
retrieve all available packets at once.
This optimization decreases the number of transactions, which saves
CPU cycles and increases network throughput.

Signed-off-by: Ido Yariv <[email protected]>
---
drivers/net/wireless/wl12xx/wl1271.h | 5 ++
drivers/net/wireless/wl12xx/wl1271_main.c | 15 ++++++-
drivers/net/wireless/wl12xx/wl1271_rx.c | 63 +++++++++++++++++++++-------
3 files changed, 66 insertions(+), 17 deletions(-)

diff --git a/drivers/net/wireless/wl12xx/wl1271.h b/drivers/net/wireless/wl12xx/wl1271.h
index 779b130..8a4cd76 100644
--- a/drivers/net/wireless/wl12xx/wl1271.h
+++ b/drivers/net/wireless/wl12xx/wl1271.h
@@ -130,6 +130,8 @@ enum {

#define ACX_TX_DESCRIPTORS 32

+#define WL1271_AGGR_BUFFER_SIZE (4 * PAGE_SIZE)
+
enum wl1271_state {
WL1271_STATE_OFF,
WL1271_STATE_ON,
@@ -408,6 +410,9 @@ struct wl1271 {
/* Rx memory pool address */
struct wl1271_rx_mem_pool_addr rx_mem_pool_addr;

+ /* Intermediate buffer, used for packet aggregation */
+ u8 *aggr_buf;
+
/* The target interrupt mask */
struct work_struct irq_work;

diff --git a/drivers/net/wireless/wl12xx/wl1271_main.c b/drivers/net/wireless/wl12xx/wl1271_main.c
index cb18f22..8071da1 100644
--- a/drivers/net/wireless/wl12xx/wl1271_main.c
+++ b/drivers/net/wireless/wl12xx/wl1271_main.c
@@ -2459,6 +2459,7 @@ struct ieee80211_hw *wl1271_alloc_hw(void)
struct platform_device *plat_dev = NULL;
struct wl1271 *wl;
int i, ret;
+ unsigned int order;

hw = ieee80211_alloc_hw(sizeof(*wl), &wl1271_ops);
if (!hw) {
@@ -2517,11 +2518,18 @@ struct ieee80211_hw *wl1271_alloc_hw(void)

wl1271_debugfs_init(wl);

+ order = get_order(WL1271_AGGR_BUFFER_SIZE);
+ wl->aggr_buf = (u8 *)__get_free_pages(GFP_KERNEL, order);
+ if (!wl->aggr_buf) {
+ ret = -ENOMEM;
+ goto err_hw;
+ }
+
/* Register platform device */
ret = platform_device_register(wl->plat_dev);
if (ret) {
wl1271_error("couldn't register platform device");
- goto err_hw;
+ goto err_aggr;
}
dev_set_drvdata(&wl->plat_dev->dev, wl);

@@ -2547,6 +2555,9 @@ err_bt_coex_state:
err_platform:
platform_device_unregister(wl->plat_dev);

+err_aggr:
+ free_pages((unsigned long)wl->aggr_buf, order);
+
err_hw:
wl1271_debugfs_exit(wl);
kfree(plat_dev);
@@ -2563,6 +2574,8 @@ EXPORT_SYMBOL_GPL(wl1271_alloc_hw);
int wl1271_free_hw(struct wl1271 *wl)
{
platform_device_unregister(wl->plat_dev);
+ free_pages((unsigned long)wl->aggr_buf,
+ get_order(WL1271_AGGR_BUFFER_SIZE));
kfree(wl->plat_dev);

wl1271_debugfs_exit(wl);
diff --git a/drivers/net/wireless/wl12xx/wl1271_rx.c b/drivers/net/wireless/wl12xx/wl1271_rx.c
index 94da5dd..bea133b 100644
--- a/drivers/net/wireless/wl12xx/wl1271_rx.c
+++ b/drivers/net/wireless/wl12xx/wl1271_rx.c
@@ -74,7 +74,7 @@ static void wl1271_rx_status(struct wl1271 *wl,
}
}

-static void wl1271_rx_handle_data(struct wl1271 *wl, u32 length)
+static int wl1271_rx_handle_data(struct wl1271 *wl, u8 *data, u32 length)
{
struct wl1271_rx_descriptor *desc;
struct sk_buff *skb;
@@ -87,16 +87,16 @@ static void wl1271_rx_handle_data(struct wl1271 *wl, u32 length)
* workaround this by not retrieving them at all.
*/
if (unlikely(wl->state == WL1271_STATE_PLT))
- return;
+ return -EINVAL;

skb = __dev_alloc_skb(length, GFP_KERNEL);
if (!skb) {
wl1271_error("Couldn't allocate RX frame");
- return;
+ return -ENOMEM;
}

buf = skb_put(skb, length);
- wl1271_read(wl, WL1271_SLV_MEM_DATA, buf, length, true);
+ memcpy(buf, data, length);

/* the data read starts with the descriptor */
desc = (struct wl1271_rx_descriptor *) buf;
@@ -116,6 +116,8 @@ static void wl1271_rx_handle_data(struct wl1271 *wl, u32 length)
skb_trim(skb, skb->len - desc->pad_len);

ieee80211_rx_ni(wl->hw, skb);
+
+ return 0;
}

void wl1271_rx(struct wl1271 *wl, struct wl1271_fw_status *status)
@@ -124,31 +126,60 @@ void wl1271_rx(struct wl1271 *wl, struct wl1271_fw_status *status)
u32 buf_size;
u32 fw_rx_counter = status->fw_rx_counter & NUM_RX_PKT_DESC_MOD_MASK;
u32 drv_rx_counter = wl->rx_counter & NUM_RX_PKT_DESC_MOD_MASK;
+ u32 rx_counter;
u32 mem_block;
+ u32 pkt_length;
+ u32 pkt_offset;

while (drv_rx_counter != fw_rx_counter) {
- mem_block = wl1271_rx_get_mem_block(status, drv_rx_counter);
- buf_size = wl1271_rx_get_buf_size(status, drv_rx_counter);
+ buf_size = 0;
+ rx_counter = drv_rx_counter;
+ while (rx_counter != fw_rx_counter) {
+ pkt_length = wl1271_rx_get_buf_size(status, rx_counter);
+ if (buf_size + pkt_length > WL1271_AGGR_BUFFER_SIZE)
+ break;
+ buf_size += pkt_length;
+ rx_counter++;
+ rx_counter &= NUM_RX_PKT_DESC_MOD_MASK;
+ }

if (buf_size == 0) {
wl1271_warning("received empty data");
break;
}

+ /*
+ * Choose the block we want to read
+ * For aggregated packets, only the first memory block should
+ * be retrieved. The FW takes care of the rest.
+ */
+ mem_block = wl1271_rx_get_mem_block(status, drv_rx_counter);
wl->rx_mem_pool_addr.addr = (mem_block << 8) +
le32_to_cpu(wl_mem_map->packet_memory_pool_start);
wl->rx_mem_pool_addr.addr_extra =
wl->rx_mem_pool_addr.addr + 4;
-
- /* Choose the block we want to read */
wl1271_write(wl, WL1271_SLV_REG_DATA, &wl->rx_mem_pool_addr,
- sizeof(wl->rx_mem_pool_addr), false);
-
- wl1271_rx_handle_data(wl, buf_size);
-
- wl->rx_counter++;
- drv_rx_counter = wl->rx_counter & NUM_RX_PKT_DESC_MOD_MASK;
+ sizeof(wl->rx_mem_pool_addr), false);
+
+ /* Read all available packets at once */
+ wl1271_read(wl, WL1271_SLV_MEM_DATA, wl->aggr_buf,
+ buf_size, true);
+
+ /* Split data into separate packets */
+ pkt_offset = 0;
+ while (pkt_offset < buf_size) {
+ pkt_length = wl1271_rx_get_buf_size(status,
+ drv_rx_counter);
+ if (wl1271_rx_handle_data(wl,
+ wl->aggr_buf + pkt_offset,
+ pkt_length) < 0)
+ break;
+ wl->rx_counter++;
+ drv_rx_counter++;
+ drv_rx_counter &= NUM_RX_PKT_DESC_MOD_MASK;
+ pkt_offset += pkt_length;
+ }
}
-
- wl1271_write32(wl, RX_DRIVER_COUNTER_ADDRESS, wl->rx_counter);
+ wl1271_write32(wl, RX_DRIVER_COUNTER_ADDRESS,
+ cpu_to_le32(wl->rx_counter));
}
--
1.7.0.4

2010-10-05 11:18:31

by Luciano Coelho

[permalink] [raw]

Subject: Re: [PATCH v3 0/3] wl1271: Packet aggregation optimizations

On Thu, 2010-09-30 at 15:28 +0200, ext Ido Yariv wrote:
> The WL1271 HW supports sending and receiving multiple packets within
> a single transaction. This can substantially reduce CPU usage and
> increase network throughput.
>
> The following patches add support for RX and TX packet aggregation.
> An opportunistic approach was taken - all packets that are currently
> available for sending/receiving are aggregated and transferred
> in a single transaction.
>
> On SPI platforms, tests done by Juuso Oikarinen have shown a 20% improvement
> in CPU usage.
> On SDIO platforms, some tests have shown an improvement of over 30% in CPU
> usage.
>
> Changes since v2:
> Fix debug output in the SPI callbacks.
>
> Changes since v1:
> The SPI callbacks do not handle large transfers well. The first patch fixes
> this by fragmenting large transfers into smaller ones.
>
> Thanks Juuso Oikarinen for reporting the SPI issue and helping with testing
> and benchmarking these optimizations.
>
> Ido Yariv (3):
> wl1271: Handle large SPI transfers
> wl1271: Support firmware RX packet aggregation
> wl1271: Support firmware TX packet aggregation
>
> drivers/net/wireless/wl12xx/wl1271.h | 5 +
> drivers/net/wireless/wl12xx/wl1271_main.c | 15 +++-
> drivers/net/wireless/wl12xx/wl1271_rx.c | 63 ++++++++++----
> drivers/net/wireless/wl12xx/wl1271_spi.c | 140 ++++++++++++++++++-----------
> drivers/net/wireless/wl12xx/wl1271_tx.c | 99 ++++++++-------------
> 5 files changed, 189 insertions(+), 133 deletions(-)
>

Thanks, applied to the wl12xx tree, master branch.

--
Cheers,
Luca.