2015-11-23 13:40:25

by Felix Fietkau

[permalink] [raw]
Subject: [PATCH 1/2] ath10k: do not use coherent memory for allocated device memory chunks

Coherent memory is more expensive to allocate (and constrained on some
architectures where it has to be pre-allocated). It is also completely
unnecessary, since the host has no reason to even access these allocated
memory spaces.

Signed-off-by: Felix Fietkau <[email protected]>
---
drivers/net/wireless/ath/ath10k/wmi.c | 59 +++++++++++++++++++++++++----------
1 file changed, 42 insertions(+), 17 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c
index 9021079..6cd097c 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.c
+++ b/drivers/net/wireless/ath/ath10k/wmi.c
@@ -4300,34 +4300,58 @@ void ath10k_wmi_event_vdev_resume_req(struct ath10k *ar, struct sk_buff *skb)
ath10k_dbg(ar, ATH10K_DBG_WMI, "WMI_VDEV_RESUME_REQ_EVENTID\n");
}

-static int ath10k_wmi_alloc_host_mem(struct ath10k *ar, u32 req_id,
- u32 num_units, u32 unit_len)
+static int ath10k_wmi_alloc_chunk(struct ath10k *ar, u32 req_id,
+ u32 num_units, u32 unit_len)
{
dma_addr_t paddr;
u32 pool_size;
int idx = ar->wmi.num_mem_chunks;
+ void *vaddr = NULL;

- pool_size = num_units * round_up(unit_len, 4);
+ if (ar->wmi.num_mem_chunks == ARRAY_SIZE(ar->wmi.mem_chunks))
+ return -ENOMEM;

- if (!pool_size)
- return -EINVAL;
+ while (!vaddr && num_units) {
+ pool_size = num_units * round_up(unit_len, 4);
+ if (!pool_size)
+ return -EINVAL;

- ar->wmi.mem_chunks[idx].vaddr = dma_alloc_coherent(ar->dev,
- pool_size,
- &paddr,
- GFP_KERNEL);
- if (!ar->wmi.mem_chunks[idx].vaddr) {
- ath10k_warn(ar, "failed to allocate memory chunk\n");
- return -ENOMEM;
+ vaddr = kzalloc(pool_size, GFP_KERNEL | __GFP_NOWARN);
+ if (!vaddr)
+ num_units /= 2;
}

- memset(ar->wmi.mem_chunks[idx].vaddr, 0, pool_size);
+ if (!num_units)
+ return -ENOMEM;

+ paddr = dma_map_single(ar->dev, vaddr, pool_size, DMA_TO_DEVICE);
+ if (dma_mapping_error(ar->dev, paddr)) {
+ kfree(vaddr);
+ return -ENOMEM;
+ }
+
+ ar->wmi.mem_chunks[idx].vaddr = vaddr;
ar->wmi.mem_chunks[idx].paddr = paddr;
ar->wmi.mem_chunks[idx].len = pool_size;
ar->wmi.mem_chunks[idx].req_id = req_id;
ar->wmi.num_mem_chunks++;

+ return num_units;
+}
+
+static int ath10k_wmi_alloc_host_mem(struct ath10k *ar, u32 req_id,
+ u32 num_units, u32 unit_len)
+{
+ int ret;
+
+ while (num_units) {
+ ret = ath10k_wmi_alloc_chunk(ar, req_id, num_units, unit_len);
+ if (ret < 0)
+ return ret;
+
+ num_units -= ret;
+ }
+
return 0;
}

@@ -7705,10 +7729,11 @@ void ath10k_wmi_free_host_mem(struct ath10k *ar)

/* free the host memory chunks requested by firmware */
for (i = 0; i < ar->wmi.num_mem_chunks; i++) {
- dma_free_coherent(ar->dev,
- ar->wmi.mem_chunks[i].len,
- ar->wmi.mem_chunks[i].vaddr,
- ar->wmi.mem_chunks[i].paddr);
+ dma_unmap_single(ar->dev,
+ ar->wmi.mem_chunks[i].paddr,
+ ar->wmi.mem_chunks[i].len,
+ DMA_TO_DEVICE);
+ kfree(ar->wmi.mem_chunks[i].vaddr);
}

ar->wmi.num_mem_chunks = 0;
--
2.2.2



2015-11-23 13:38:16

by Felix Fietkau

[permalink] [raw]
Subject: [PATCH 2/2] ath10k: do not use coherent memory for tx buffers

Coherent memory is expensive to access, since all memory accesses bypass
the cache. It is also completely unnecessary for this case.
Convert to mapped memory instead and use the DMA API to flush the cache
where necessary.
Fixes allocation failures on embedded devices.

Signed-off-by: Felix Fietkau <[email protected]>
---
drivers/net/wireless/ath/ath10k/htt_tx.c | 77 +++++++++++++++++++++-----------
1 file changed, 51 insertions(+), 26 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/htt_tx.c b/drivers/net/wireless/ath/ath10k/htt_tx.c
index 8f76b9d..99d9793 100644
--- a/drivers/net/wireless/ath/ath10k/htt_tx.c
+++ b/drivers/net/wireless/ath/ath10k/htt_tx.c
@@ -100,7 +100,7 @@ void ath10k_htt_tx_free_msdu_id(struct ath10k_htt *htt, u16 msdu_id)
int ath10k_htt_tx_alloc(struct ath10k_htt *htt)
{
struct ath10k *ar = htt->ar;
- int ret, size;
+ int size;

ath10k_dbg(ar, ATH10K_DBG_BOOT, "htt tx max num pending tx %d\n",
htt->max_num_pending_tx);
@@ -109,39 +109,41 @@ int ath10k_htt_tx_alloc(struct ath10k_htt *htt)
idr_init(&htt->pending_tx);

size = htt->max_num_pending_tx * sizeof(struct ath10k_htt_txbuf);
- htt->txbuf.vaddr = dma_alloc_coherent(ar->dev, size,
- &htt->txbuf.paddr,
- GFP_DMA);
- if (!htt->txbuf.vaddr) {
- ath10k_err(ar, "failed to alloc tx buffer\n");
- ret = -ENOMEM;
+ htt->txbuf.vaddr = kzalloc(size, GFP_KERNEL);
+ if (!htt->txbuf.vaddr)
goto free_idr_pending_tx;
- }
+
+ htt->txbuf.paddr = dma_map_single(ar->dev, htt->txbuf.vaddr, size,
+ DMA_TO_DEVICE);
+ if (dma_mapping_error(ar->dev, htt->txbuf.paddr))
+ goto free_txbuf_vaddr;

if (!ar->hw_params.continuous_frag_desc)
- goto skip_frag_desc_alloc;
+ return 0;

size = htt->max_num_pending_tx * sizeof(struct htt_msdu_ext_desc);
- htt->frag_desc.vaddr = dma_alloc_coherent(ar->dev, size,
- &htt->frag_desc.paddr,
- GFP_DMA);
- if (!htt->frag_desc.vaddr) {
- ath10k_warn(ar, "failed to alloc fragment desc memory\n");
- ret = -ENOMEM;
+ htt->frag_desc.vaddr = kzalloc(size, GFP_KERNEL);
+ if (!htt->frag_desc.vaddr)
goto free_txbuf;
- }

-skip_frag_desc_alloc:
+ htt->frag_desc.paddr = dma_map_single(ar->dev, htt->frag_desc.vaddr,
+ size, DMA_TO_DEVICE);
+ if (dma_mapping_error(ar->dev, htt->frag_desc.paddr))
+ goto free_frag_desc;
+
return 0;

+free_frag_desc:
+ kfree(htt->frag_desc.vaddr);
free_txbuf:
size = htt->max_num_pending_tx *
sizeof(struct ath10k_htt_txbuf);
- dma_free_coherent(htt->ar->dev, size, htt->txbuf.vaddr,
- htt->txbuf.paddr);
+ dma_unmap_single(htt->ar->dev, htt->txbuf.paddr, size, DMA_TO_DEVICE);
+free_txbuf_vaddr:
+ kfree(htt->txbuf.vaddr);
free_idr_pending_tx:
idr_destroy(&htt->pending_tx);
- return ret;
+ return -ENOMEM;
}

static int ath10k_htt_tx_clean_up_pending(int msdu_id, void *skb, void *ctx)
@@ -170,15 +172,17 @@ void ath10k_htt_tx_free(struct ath10k_htt *htt)
if (htt->txbuf.vaddr) {
size = htt->max_num_pending_tx *
sizeof(struct ath10k_htt_txbuf);
- dma_free_coherent(htt->ar->dev, size, htt->txbuf.vaddr,
- htt->txbuf.paddr);
+ dma_unmap_single(htt->ar->dev, htt->txbuf.paddr, size,
+ DMA_TO_DEVICE);
+ kfree(htt->txbuf.vaddr);
}

if (htt->frag_desc.vaddr) {
size = htt->max_num_pending_tx *
sizeof(struct htt_msdu_ext_desc);
- dma_free_coherent(htt->ar->dev, size, htt->frag_desc.vaddr,
- htt->frag_desc.paddr);
+ dma_unmap_single(htt->ar->dev, htt->frag_desc.paddr, size,
+ DMA_TO_DEVICE);
+ kfree(htt->frag_desc.vaddr);
}
}

@@ -550,6 +554,7 @@ int ath10k_htt_tx(struct ath10k_htt *htt, struct sk_buff *msdu)
struct htt_msdu_ext_desc *ext_desc = NULL;
bool limit_mgmt_desc = false;
bool is_probe_resp = false;
+ int txbuf_offset, frag_offset, frag_size;

if (unlikely(ieee80211_is_mgmt(hdr->frame_control)) &&
ar->hw_params.max_probe_resp_desc_thres) {
@@ -574,9 +579,11 @@ int ath10k_htt_tx(struct ath10k_htt *htt, struct sk_buff *msdu)
prefetch_len = min(htt->prefetch_len, msdu->len);
prefetch_len = roundup(prefetch_len, 4);

+ frag_size = sizeof(struct htt_msdu_ext_desc);
+ frag_offset = frag_size * msdu_id;
+ txbuf_offset = sizeof(struct ath10k_htt_txbuf) * msdu_id;
skb_cb->htt.txbuf = &htt->txbuf.vaddr[msdu_id];
- skb_cb->htt.txbuf_paddr = htt->txbuf.paddr +
- (sizeof(struct ath10k_htt_txbuf) * msdu_id);
+ skb_cb->htt.txbuf_paddr = htt->txbuf.paddr + txbuf_offset;

if ((ieee80211_is_action(hdr->frame_control) ||
ieee80211_is_deauth(hdr->frame_control) ||
@@ -597,6 +604,15 @@ int ath10k_htt_tx(struct ath10k_htt *htt, struct sk_buff *msdu)
goto err_free_msdu_id;
}

+ dma_sync_single_range_for_cpu(dev, htt->txbuf.paddr, txbuf_offset,
+ sizeof(struct ath10k_htt_txbuf),
+ DMA_TO_DEVICE);
+
+ if (ar->hw_params.continuous_frag_desc)
+ dma_sync_single_range_for_cpu(dev, htt->frag_desc.paddr,
+ frag_offset, frag_size,
+ DMA_TO_DEVICE);
+
switch (skb_cb->txmode) {
case ATH10K_HW_TXRX_RAW:
case ATH10K_HW_TXRX_NATIVE_WIFI:
@@ -723,6 +739,15 @@ int ath10k_htt_tx(struct ath10k_htt *htt, struct sk_buff *msdu)
sg_items[1].paddr = skb_cb->paddr;
sg_items[1].len = prefetch_len;

+ if (ar->hw_params.continuous_frag_desc)
+ dma_sync_single_range_for_device(dev, htt->frag_desc.paddr,
+ frag_offset, frag_size,
+ DMA_TO_DEVICE);
+
+ dma_sync_single_range_for_device(dev, htt->txbuf.paddr, txbuf_offset,
+ sizeof(struct ath10k_htt_txbuf),
+ DMA_TO_DEVICE);
+
res = ath10k_hif_tx_sg(htt->ar,
htt->ar->htc.endpoint[htt->eid].ul_pipe_id,
sg_items, ARRAY_SIZE(sg_items));
--
2.2.2


2015-11-30 10:29:44

by Kalle Valo

[permalink] [raw]
Subject: Re: [PATCH 1/2] ath10k: do not use coherent memory for allocated device memory chunks

Felix Fietkau <[email protected]> writes:

> Coherent memory is more expensive to allocate (and constrained on some
> architectures where it has to be pre-allocated). It is also completely
> unnecessary, since the host has no reason to even access these allocated
> memory spaces
>
> Signed-off-by: Felix Fietkau <[email protected]>

I see a new compiler warning:

drivers/net/wireless/ath/ath10k/wmi.c: In function
'ath10k_wmi_event_service_ready_work':
drivers/net/wireless/ath/ath10k/wmi.c:4347:30: warning: 'pool_size' may
be used uninitialized in this function [-Wuninitialized]
drivers/net/wireless/ath/ath10k/wmi.c:4319:6: note: 'pool_size' was
declared here

Seems to be false and maybe because my gcc is pretty old:

gcc (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3

But anyway it would be nice if this can be avoided.

--
Kalle Valo