This patchset covers optimizations in rx (first 7 patches)
and tx (remaining 5 patches) data path.
Running UDP DL/UL traffic on IPQ8074 5G radio showed an average 5-10%
improvement on a 4 core platform.
P Praneesh (12):
ath11k: disable unused CE8 interrupts for ipq8074
ath11k: allocate dst ring descriptors from cacheable memory
ath11k: modify dp_rx desc access wrapper calls inline
ath11k: avoid additional access to ath11k_hal_srng_dst_num_free
ath11k: avoid active pdev check for each msdu
ath11k: remove usage quota while processing rx packets
ath11k: add branch predictors in process_rx
ath11k: allocate HAL_WBM2SW_RELEASE ring from cacheable memory
ath11k: remove mod operator in dst ring processing
ath11k: avoid while loop in ring selection of tx completion interrupt
ath11k: add branch predictors in dp_tx path
ath11k: avoid unnecessary lock contention in tx_completion path
drivers/net/wireless/ath/ath11k/ce.c | 2 +-
drivers/net/wireless/ath/ath11k/dp.c | 45 +++++--
drivers/net/wireless/ath/ath11k/dp.h | 1 +
drivers/net/wireless/ath/ath11k/dp_rx.c | 205 ++++++++++++++++----------------
drivers/net/wireless/ath/ath11k/dp_tx.c | 63 +++++-----
drivers/net/wireless/ath/ath11k/hal.c | 32 ++++-
drivers/net/wireless/ath/ath11k/hal.h | 1 +
drivers/net/wireless/ath/ath11k/mac.c | 2 +-
8 files changed, 202 insertions(+), 149 deletions(-)
--
2.25.1
From: P Praneesh <[email protected]>
Host driver doesn't need to process CE8 interrupts (used
by target independently)
The volume of interrupts is huge within short interval,
CPU0 CPU1 CPU2 CPU3
14022188 0 0 0 GIC 71 Edge ce8
Hence disabling unused CE8 interrupt will improve CPU usage.
Tested-on: QCN9074 hw1.0 PCI WLAN.HK.2.4.0.1.r2-00012-QCAHKSWPL_SILICONZ-1
Tested-on: IPQ8074 hw2.0 AHB WLAN.HK.2.4.0.1-01695-QCAHKSWPL_SILICONZ-1
Co-developed-by: Sriram R <[email protected]>
Signed-off-by: Sriram R <[email protected]>
Signed-off-by: P Praneesh <[email protected]>
Signed-off-by: Jouni Malinen <[email protected]>
---
drivers/net/wireless/ath/ath11k/ce.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/wireless/ath/ath11k/ce.c b/drivers/net/wireless/ath/ath11k/ce.c
index de8b632b058c..b6ffe032855f 100644
--- a/drivers/net/wireless/ath/ath11k/ce.c
+++ b/drivers/net/wireless/ath/ath11k/ce.c
@@ -77,7 +77,7 @@ const struct ce_attr ath11k_host_ce_config_ipq8074[] = {
/* CE8: target autonomous hif_memcpy */
{
- .flags = CE_ATTR_FLAGS,
+ .flags = CE_ATTR_FLAGS | CE_ATTR_DIS_INTR,
.src_nentries = 0,
.src_sz_max = 0,
.dest_nentries = 0,
--
2.25.1
From: P Praneesh <[email protected]>
tcl_data and reo_dst rings are currently being allocated
using dma_allocate_coherent() which is non cachable.
Allocating ring memory from cacheable memory area
allows cached descriptor access and prefetch next
descriptors to optimize CPU usage during
descriptor processing on NAPI.
Tested-on: QCN9074 hw1.0 PCI WLAN.HK.2.4.0.1.r2-00012-QCAHKSWPL_SILICONZ-1
Tested-on: IPQ8074 hw2.0 AHB WLAN.HK.2.4.0.1-01695-QCAHKSWPL_SILICONZ-1
Co-developed-by: Pradeep Kumar Chitrapu <[email protected]>
Signed-off-by: Pradeep Kumar Chitrapu <[email protected]>
Co-developed-by: Sriram R <[email protected]>
Signed-off-by: Sriram R <[email protected]>
Signed-off-by: P Praneesh <[email protected]>
Signed-off-by: Jouni Malinen <[email protected]>
---
drivers/net/wireless/ath/ath11k/dp.c | 34 +++++++++++++++++++++++----
drivers/net/wireless/ath/ath11k/dp.h | 1 +
drivers/net/wireless/ath/ath11k/hal.c | 25 ++++++++++++++++++--
drivers/net/wireless/ath/ath11k/hal.h | 1 +
4 files changed, 54 insertions(+), 7 deletions(-)
diff --git a/drivers/net/wireless/ath/ath11k/dp.c b/drivers/net/wireless/ath/ath11k/dp.c
index b0c8f6290099..cf869ebc209a 100644
--- a/drivers/net/wireless/ath/ath11k/dp.c
+++ b/drivers/net/wireless/ath/ath11k/dp.c
@@ -101,8 +101,11 @@ void ath11k_dp_srng_cleanup(struct ath11k_base *ab, struct dp_srng *ring)
if (!ring->vaddr_unaligned)
return;
- dma_free_coherent(ab->dev, ring->size, ring->vaddr_unaligned,
- ring->paddr_unaligned);
+ if (ring->cached)
+ kfree(ring->vaddr_unaligned);
+ else
+ dma_free_coherent(ab->dev, ring->size, ring->vaddr_unaligned,
+ ring->paddr_unaligned);
ring->vaddr_unaligned = NULL;
}
@@ -222,6 +225,7 @@ int ath11k_dp_srng_setup(struct ath11k_base *ab, struct dp_srng *ring,
int entry_sz = ath11k_hal_srng_get_entrysize(ab, type);
int max_entries = ath11k_hal_srng_get_max_entries(ab, type);
int ret;
+ bool cached;
if (max_entries < 0 || entry_sz < 0)
return -EINVAL;
@@ -229,10 +233,25 @@ int ath11k_dp_srng_setup(struct ath11k_base *ab, struct dp_srng *ring,
if (num_entries > max_entries)
num_entries = max_entries;
+ /* Allocate the reo dst and tx completion rings from cacheable memory */
+ switch (type) {
+ case HAL_REO_DST:
+ cached = true;
+ default:
+ cached = false;
+ }
+
ring->size = (num_entries * entry_sz) + HAL_RING_BASE_ALIGN - 1;
- ring->vaddr_unaligned = dma_alloc_coherent(ab->dev, ring->size,
- &ring->paddr_unaligned,
- GFP_KERNEL);
+
+ if (cached) {
+ ring->vaddr_unaligned = kzalloc(ring->size, GFP_KERNEL);
+ ring->paddr_unaligned = virt_to_phys(ring->vaddr_unaligned);
+ } else {
+ ring->vaddr_unaligned = dma_alloc_coherent(ab->dev, ring->size,
+ &ring->paddr_unaligned,
+ GFP_KERNEL);
+ }
+
if (!ring->vaddr_unaligned)
return -ENOMEM;
@@ -292,6 +311,11 @@ int ath11k_dp_srng_setup(struct ath11k_base *ab, struct dp_srng *ring,
return -EINVAL;
}
+ if (cached) {
+ params.flags |= HAL_SRNG_FLAGS_CACHED;
+ ring->cached = 1;
+ }
+
ret = ath11k_hal_srng_setup(ab, type, ring_num, mac_id, ¶ms);
if (ret < 0) {
ath11k_warn(ab, "failed to setup srng: %d ring_id %d\n",
diff --git a/drivers/net/wireless/ath/ath11k/dp.h b/drivers/net/wireless/ath/ath11k/dp.h
index ee768ccce46e..e6591488a28c 100644
--- a/drivers/net/wireless/ath/ath11k/dp.h
+++ b/drivers/net/wireless/ath/ath11k/dp.h
@@ -64,6 +64,7 @@ struct dp_srng {
dma_addr_t paddr;
int size;
u32 ring_id;
+ u8 cached;
};
struct dp_rxdma_ring {
diff --git a/drivers/net/wireless/ath/ath11k/hal.c b/drivers/net/wireless/ath/ath11k/hal.c
index eaa0edca5576..a58e86e42b5b 100644
--- a/drivers/net/wireless/ath/ath11k/hal.c
+++ b/drivers/net/wireless/ath/ath11k/hal.c
@@ -627,6 +627,21 @@ u32 *ath11k_hal_srng_dst_peek(struct ath11k_base *ab, struct hal_srng *srng)
return NULL;
}
+static void ath11k_hal_srng_prefetch_desc(struct ath11k_base *ab,
+ struct hal_srng *srng)
+{
+ u32 *desc;
+
+ /* prefetch only if desc is available */
+ desc = ath11k_hal_srng_dst_peek(ab, srng);
+ if (likely(desc)) {
+ dma_sync_single_for_cpu(ab->dev, virt_to_phys(desc),
+ (srng->entry_size * sizeof(u32)),
+ DMA_FROM_DEVICE);
+ prefetch(desc);
+ }
+}
+
u32 *ath11k_hal_srng_dst_get_next_entry(struct ath11k_base *ab,
struct hal_srng *srng)
{
@@ -642,6 +657,10 @@ u32 *ath11k_hal_srng_dst_get_next_entry(struct ath11k_base *ab,
srng->u.dst_ring.tp = (srng->u.dst_ring.tp + srng->entry_size) %
srng->ring_size;
+ /* Try to prefetch the next descriptor in the ring */
+ if (srng->flags & HAL_SRNG_FLAGS_CACHED)
+ ath11k_hal_srng_prefetch_desc(ab, srng);
+
return desc;
}
@@ -775,11 +794,13 @@ void ath11k_hal_srng_access_begin(struct ath11k_base *ab, struct hal_srng *srng)
{
lockdep_assert_held(&srng->lock);
- if (srng->ring_dir == HAL_SRNG_DIR_SRC)
+ if (srng->ring_dir == HAL_SRNG_DIR_SRC) {
srng->u.src_ring.cached_tp =
*(volatile u32 *)srng->u.src_ring.tp_addr;
- else
+ } else {
srng->u.dst_ring.cached_hp = *srng->u.dst_ring.hp_addr;
+ ath11k_hal_srng_prefetch_desc(ab, srng);
+ }
}
/* Update cached ring head/tail pointers to HW. ath11k_hal_srng_access_begin()
diff --git a/drivers/net/wireless/ath/ath11k/hal.h b/drivers/net/wireless/ath/ath11k/hal.h
index 35ed3a14e200..0f4f9ce74354 100644
--- a/drivers/net/wireless/ath/ath11k/hal.h
+++ b/drivers/net/wireless/ath/ath11k/hal.h
@@ -513,6 +513,7 @@ enum hal_srng_dir {
#define HAL_SRNG_FLAGS_DATA_TLV_SWAP 0x00000020
#define HAL_SRNG_FLAGS_LOW_THRESH_INTR_EN 0x00010000
#define HAL_SRNG_FLAGS_MSI_INTR 0x00020000
+#define HAL_SRNG_FLAGS_CACHED 0x20000000
#define HAL_SRNG_FLAGS_LMAC_RING 0x80000000
#define HAL_SRNG_TLV_HDR_TAG GENMASK(9, 1)
--
2.25.1
From: P Praneesh <[email protected]>
In data path, to reduce the CPU cycles spending on descriptor access
wrapper function, changed those functions as static inline.
Tested-on: QCN9074 hw1.0 PCI WLAN.HK.2.4.0.1.r2-00012-QCAHKSWPL_SILICONZ-1
Tested-on: IPQ8074 hw2.0 AHB WLAN.HK.2.4.0.1-01695-QCAHKSWPL_SILICONZ-1
Co-developed-by: Sriram R <[email protected]>
Signed-off-by: Sriram R <[email protected]>
Signed-off-by: P Praneesh <[email protected]>
Signed-off-by: Jouni Malinen <[email protected]>
---
drivers/net/wireless/ath/ath11k/dp_rx.c | 114 ++++++++++++------------
1 file changed, 59 insertions(+), 55 deletions(-)
diff --git a/drivers/net/wireless/ath/ath11k/dp_rx.c b/drivers/net/wireless/ath/ath11k/dp_rx.c
index 603d2f93ac18..32c348812769 100644
--- a/drivers/net/wireless/ath/ath11k/dp_rx.c
+++ b/drivers/net/wireless/ath/ath11k/dp_rx.c
@@ -20,13 +20,15 @@
#define ATH11K_DP_RX_FRAGMENT_TIMEOUT_MS (2 * HZ)
-static u8 *ath11k_dp_rx_h_80211_hdr(struct ath11k_base *ab, struct hal_rx_desc *desc)
+static inline
+u8 *ath11k_dp_rx_h_80211_hdr(struct ath11k_base *ab, struct hal_rx_desc *desc)
{
return ab->hw_params.hw_ops->rx_desc_get_hdr_status(desc);
}
-static enum hal_encrypt_type ath11k_dp_rx_h_mpdu_start_enctype(struct ath11k_base *ab,
- struct hal_rx_desc *desc)
+static inline
+enum hal_encrypt_type ath11k_dp_rx_h_mpdu_start_enctype(struct ath11k_base *ab,
+ struct hal_rx_desc *desc)
{
if (!ab->hw_params.hw_ops->rx_desc_encrypt_valid(desc))
return HAL_ENCRYPT_TYPE_OPEN;
@@ -34,32 +36,34 @@ static enum hal_encrypt_type ath11k_dp_rx_h_mpdu_start_enctype(struct ath11k_bas
return ab->hw_params.hw_ops->rx_desc_get_encrypt_type(desc);
}
-static u8 ath11k_dp_rx_h_msdu_start_decap_type(struct ath11k_base *ab,
- struct hal_rx_desc *desc)
+static inline u8 ath11k_dp_rx_h_msdu_start_decap_type(struct ath11k_base *ab,
+ struct hal_rx_desc *desc)
{
return ab->hw_params.hw_ops->rx_desc_get_decap_type(desc);
}
-static u8 ath11k_dp_rx_h_msdu_start_mesh_ctl_present(struct ath11k_base *ab,
- struct hal_rx_desc *desc)
+static inline
+u8 ath11k_dp_rx_h_msdu_start_mesh_ctl_present(struct ath11k_base *ab,
+ struct hal_rx_desc *desc)
{
return ab->hw_params.hw_ops->rx_desc_get_mesh_ctl(desc);
}
-static bool ath11k_dp_rx_h_mpdu_start_seq_ctrl_valid(struct ath11k_base *ab,
- struct hal_rx_desc *desc)
+static inline
+bool ath11k_dp_rx_h_mpdu_start_seq_ctrl_valid(struct ath11k_base *ab,
+ struct hal_rx_desc *desc)
{
return ab->hw_params.hw_ops->rx_desc_get_mpdu_seq_ctl_vld(desc);
}
-static bool ath11k_dp_rx_h_mpdu_start_fc_valid(struct ath11k_base *ab,
- struct hal_rx_desc *desc)
+static inline bool ath11k_dp_rx_h_mpdu_start_fc_valid(struct ath11k_base *ab,
+ struct hal_rx_desc *desc)
{
return ab->hw_params.hw_ops->rx_desc_get_mpdu_fc_valid(desc);
}
-static bool ath11k_dp_rx_h_mpdu_start_more_frags(struct ath11k_base *ab,
- struct sk_buff *skb)
+static inline bool ath11k_dp_rx_h_mpdu_start_more_frags(struct ath11k_base *ab,
+ struct sk_buff *skb)
{
struct ieee80211_hdr *hdr;
@@ -67,8 +71,8 @@ static bool ath11k_dp_rx_h_mpdu_start_more_frags(struct ath11k_base *ab,
return ieee80211_has_morefrags(hdr->frame_control);
}
-static u16 ath11k_dp_rx_h_mpdu_start_frag_no(struct ath11k_base *ab,
- struct sk_buff *skb)
+static inline u16 ath11k_dp_rx_h_mpdu_start_frag_no(struct ath11k_base *ab,
+ struct sk_buff *skb)
{
struct ieee80211_hdr *hdr;
@@ -76,37 +80,37 @@ static u16 ath11k_dp_rx_h_mpdu_start_frag_no(struct ath11k_base *ab,
return le16_to_cpu(hdr->seq_ctrl) & IEEE80211_SCTL_FRAG;
}
-static u16 ath11k_dp_rx_h_mpdu_start_seq_no(struct ath11k_base *ab,
- struct hal_rx_desc *desc)
+static inline u16 ath11k_dp_rx_h_mpdu_start_seq_no(struct ath11k_base *ab,
+ struct hal_rx_desc *desc)
{
return ab->hw_params.hw_ops->rx_desc_get_mpdu_start_seq_no(desc);
}
-static void *ath11k_dp_rx_get_attention(struct ath11k_base *ab,
- struct hal_rx_desc *desc)
+static inline void *ath11k_dp_rx_get_attention(struct ath11k_base *ab,
+ struct hal_rx_desc *desc)
{
return ab->hw_params.hw_ops->rx_desc_get_attention(desc);
}
-static bool ath11k_dp_rx_h_attn_msdu_done(struct rx_attention *attn)
+static inline bool ath11k_dp_rx_h_attn_msdu_done(struct rx_attention *attn)
{
return !!FIELD_GET(RX_ATTENTION_INFO2_MSDU_DONE,
__le32_to_cpu(attn->info2));
}
-static bool ath11k_dp_rx_h_attn_l4_cksum_fail(struct rx_attention *attn)
+static inline bool ath11k_dp_rx_h_attn_l4_cksum_fail(struct rx_attention *attn)
{
return !!FIELD_GET(RX_ATTENTION_INFO1_TCP_UDP_CKSUM_FAIL,
__le32_to_cpu(attn->info1));
}
-static bool ath11k_dp_rx_h_attn_ip_cksum_fail(struct rx_attention *attn)
+static inline bool ath11k_dp_rx_h_attn_ip_cksum_fail(struct rx_attention *attn)
{
return !!FIELD_GET(RX_ATTENTION_INFO1_IP_CKSUM_FAIL,
__le32_to_cpu(attn->info1));
}
-static bool ath11k_dp_rx_h_attn_is_decrypted(struct rx_attention *attn)
+static inline bool ath11k_dp_rx_h_attn_is_decrypted(struct rx_attention *attn)
{
return (FIELD_GET(RX_ATTENTION_INFO2_DCRYPT_STATUS_CODE,
__le32_to_cpu(attn->info2)) ==
@@ -142,68 +146,68 @@ static u32 ath11k_dp_rx_h_attn_mpdu_err(struct rx_attention *attn)
return errmap;
}
-static u16 ath11k_dp_rx_h_msdu_start_msdu_len(struct ath11k_base *ab,
- struct hal_rx_desc *desc)
+static inline u16 ath11k_dp_rx_h_msdu_start_msdu_len(struct ath11k_base *ab,
+ struct hal_rx_desc *desc)
{
return ab->hw_params.hw_ops->rx_desc_get_msdu_len(desc);
}
-static u8 ath11k_dp_rx_h_msdu_start_sgi(struct ath11k_base *ab,
- struct hal_rx_desc *desc)
+static inline u8 ath11k_dp_rx_h_msdu_start_sgi(struct ath11k_base *ab,
+ struct hal_rx_desc *desc)
{
return ab->hw_params.hw_ops->rx_desc_get_msdu_sgi(desc);
}
-static u8 ath11k_dp_rx_h_msdu_start_rate_mcs(struct ath11k_base *ab,
- struct hal_rx_desc *desc)
+static inline u8 ath11k_dp_rx_h_msdu_start_rate_mcs(struct ath11k_base *ab,
+ struct hal_rx_desc *desc)
{
return ab->hw_params.hw_ops->rx_desc_get_msdu_rate_mcs(desc);
}
-static u8 ath11k_dp_rx_h_msdu_start_rx_bw(struct ath11k_base *ab,
- struct hal_rx_desc *desc)
+static inline u8 ath11k_dp_rx_h_msdu_start_rx_bw(struct ath11k_base *ab,
+ struct hal_rx_desc *desc)
{
return ab->hw_params.hw_ops->rx_desc_get_msdu_rx_bw(desc);
}
-static u32 ath11k_dp_rx_h_msdu_start_freq(struct ath11k_base *ab,
- struct hal_rx_desc *desc)
+static inline u32 ath11k_dp_rx_h_msdu_start_freq(struct ath11k_base *ab,
+ struct hal_rx_desc *desc)
{
return ab->hw_params.hw_ops->rx_desc_get_msdu_freq(desc);
}
-static u8 ath11k_dp_rx_h_msdu_start_pkt_type(struct ath11k_base *ab,
- struct hal_rx_desc *desc)
+static inline u8 ath11k_dp_rx_h_msdu_start_pkt_type(struct ath11k_base *ab,
+ struct hal_rx_desc *desc)
{
return ab->hw_params.hw_ops->rx_desc_get_msdu_pkt_type(desc);
}
-static u8 ath11k_dp_rx_h_msdu_start_nss(struct ath11k_base *ab,
- struct hal_rx_desc *desc)
+static inline u8 ath11k_dp_rx_h_msdu_start_nss(struct ath11k_base *ab,
+ struct hal_rx_desc *desc)
{
return hweight8(ab->hw_params.hw_ops->rx_desc_get_msdu_nss(desc));
}
-static u8 ath11k_dp_rx_h_mpdu_start_tid(struct ath11k_base *ab,
- struct hal_rx_desc *desc)
+static inline u8 ath11k_dp_rx_h_mpdu_start_tid(struct ath11k_base *ab,
+ struct hal_rx_desc *desc)
{
return ab->hw_params.hw_ops->rx_desc_get_mpdu_tid(desc);
}
-static u16 ath11k_dp_rx_h_mpdu_start_peer_id(struct ath11k_base *ab,
- struct hal_rx_desc *desc)
+static inline u16 ath11k_dp_rx_h_mpdu_start_peer_id(struct ath11k_base *ab,
+ struct hal_rx_desc *desc)
{
return ab->hw_params.hw_ops->rx_desc_get_mpdu_peer_id(desc);
}
-static u8 ath11k_dp_rx_h_msdu_end_l3pad(struct ath11k_base *ab,
- struct hal_rx_desc *desc)
+static inline u8 ath11k_dp_rx_h_msdu_end_l3pad(struct ath11k_base *ab,
+ struct hal_rx_desc *desc)
{
return ab->hw_params.hw_ops->rx_desc_get_l3_pad_bytes(desc);
}
-static bool ath11k_dp_rx_h_msdu_end_first_msdu(struct ath11k_base *ab,
- struct hal_rx_desc *desc)
+static inline bool ath11k_dp_rx_h_msdu_end_first_msdu(struct ath11k_base *ab,
+ struct hal_rx_desc *desc)
{
return ab->hw_params.hw_ops->rx_desc_get_first_msdu(desc);
}
@@ -221,14 +225,14 @@ static void ath11k_dp_rx_desc_end_tlv_copy(struct ath11k_base *ab,
ab->hw_params.hw_ops->rx_desc_copy_attn_end_tlv(fdesc, ldesc);
}
-static u32 ath11k_dp_rxdesc_get_mpdulen_err(struct rx_attention *attn)
+static inline u32 ath11k_dp_rxdesc_get_mpdulen_err(struct rx_attention *attn)
{
return FIELD_GET(RX_ATTENTION_INFO1_MPDU_LEN_ERR,
__le32_to_cpu(attn->info1));
}
-static u8 *ath11k_dp_rxdesc_get_80211hdr(struct ath11k_base *ab,
- struct hal_rx_desc *rx_desc)
+static inline u8 *ath11k_dp_rxdesc_get_80211hdr(struct ath11k_base *ab,
+ struct hal_rx_desc *rx_desc)
{
u8 *rx_pkt_hdr;
@@ -237,8 +241,8 @@ static u8 *ath11k_dp_rxdesc_get_80211hdr(struct ath11k_base *ab,
return rx_pkt_hdr;
}
-static bool ath11k_dp_rxdesc_mpdu_valid(struct ath11k_base *ab,
- struct hal_rx_desc *rx_desc)
+static inline bool ath11k_dp_rxdesc_mpdu_valid(struct ath11k_base *ab,
+ struct hal_rx_desc *rx_desc)
{
u32 tlv_tag;
@@ -247,15 +251,15 @@ static bool ath11k_dp_rxdesc_mpdu_valid(struct ath11k_base *ab,
return tlv_tag == HAL_RX_MPDU_START;
}
-static u32 ath11k_dp_rxdesc_get_ppduid(struct ath11k_base *ab,
- struct hal_rx_desc *rx_desc)
+static inline u32 ath11k_dp_rxdesc_get_ppduid(struct ath11k_base *ab,
+ struct hal_rx_desc *rx_desc)
{
return ab->hw_params.hw_ops->rx_desc_get_mpdu_ppdu_id(rx_desc);
}
-static void ath11k_dp_rxdesc_set_msdu_len(struct ath11k_base *ab,
- struct hal_rx_desc *desc,
- u16 len)
+static inline void ath11k_dp_rxdesc_set_msdu_len(struct ath11k_base *ab,
+ struct hal_rx_desc *desc,
+ u16 len)
{
ab->hw_params.hw_ops->rx_desc_set_msdu_len(desc, len);
}
--
2.25.1
From: P Praneesh <[email protected]>
Add branch prediciton in dp_tx code path in tx and tx completion
handler.
Tested-on: IPQ8074 hw2.0 AHB WLAN.HK.2.4.0.1-01734-QCAHKSWPL_SILICONZ-1 v2
Co-developed-by: Sriram R <[email protected]>
Signed-off-by: Sriram R <[email protected]>
Signed-off-by: P Praneesh <[email protected]>
Signed-off-by: Jouni Malinen <[email protected]>
---
drivers/net/wireless/ath/ath11k/dp_tx.c | 45 +++++++++++++------------
drivers/net/wireless/ath/ath11k/mac.c | 2 +-
2 files changed, 24 insertions(+), 23 deletions(-)
diff --git a/drivers/net/wireless/ath/ath11k/dp_tx.c b/drivers/net/wireless/ath/ath11k/dp_tx.c
index 8bba5234f81f..ab9ccf0eb274 100644
--- a/drivers/net/wireless/ath/ath11k/dp_tx.c
+++ b/drivers/net/wireless/ath/ath11k/dp_tx.c
@@ -95,11 +95,11 @@ int ath11k_dp_tx(struct ath11k *ar, struct ath11k_vif *arvif,
u8 ring_selector = 0, ring_map = 0;
bool tcl_ring_retry;
- if (test_bit(ATH11K_FLAG_CRASH_FLUSH, &ar->ab->dev_flags))
+ if (unlikely(test_bit(ATH11K_FLAG_CRASH_FLUSH, &ar->ab->dev_flags)))
return -ESHUTDOWN;
- if (!(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) &&
- !ieee80211_is_data(hdr->frame_control))
+ if (unlikely(!(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) &&
+ !ieee80211_is_data(hdr->frame_control)))
return -ENOTSUPP;
pool_id = skb_get_queue_mapping(skb) & (ATH11K_HW_MAX_QUEUES - 1);
@@ -130,7 +130,7 @@ int ath11k_dp_tx(struct ath11k *ar, struct ath11k_vif *arvif,
DP_TX_IDR_SIZE - 1, GFP_ATOMIC);
spin_unlock_bh(&tx_ring->tx_idr_lock);
- if (ret < 0) {
+ if (unlikely(ret < 0)) {
if (ring_map == (BIT(DP_TCL_NUM_RING_MAX) - 1)) {
atomic_inc(&ab->soc_stats.tx_err.misc_fail);
return -ENOSPC;
@@ -147,7 +147,7 @@ int ath11k_dp_tx(struct ath11k *ar, struct ath11k_vif *arvif,
ti.encap_type = ath11k_dp_tx_get_encap_type(arvif, skb);
ti.meta_data_flags = arvif->tcl_metadata;
- if (ti.encap_type == HAL_TCL_ENCAP_TYPE_RAW) {
+ if (unlikely(ti.encap_type == HAL_TCL_ENCAP_TYPE_RAW)) {
if (skb_cb->flags & ATH11K_SKB_CIPHER_SET) {
ti.encrypt_type =
ath11k_dp_tx_get_encrypt_type(skb_cb->cipher);
@@ -168,8 +168,8 @@ int ath11k_dp_tx(struct ath11k *ar, struct ath11k_vif *arvif,
ti.bss_ast_idx = arvif->ast_idx;
ti.dscp_tid_tbl_idx = 0;
- if (skb->ip_summed == CHECKSUM_PARTIAL &&
- ti.encap_type != HAL_TCL_ENCAP_TYPE_RAW) {
+ if (likely(skb->ip_summed == CHECKSUM_PARTIAL &&
+ ti.encap_type != HAL_TCL_ENCAP_TYPE_RAW)) {
ti.flags0 |= FIELD_PREP(HAL_TCL_DATA_CMD_INFO1_IP4_CKSUM_EN, 1) |
FIELD_PREP(HAL_TCL_DATA_CMD_INFO1_UDP4_CKSUM_EN, 1) |
FIELD_PREP(HAL_TCL_DATA_CMD_INFO1_UDP6_CKSUM_EN, 1) |
@@ -206,7 +206,7 @@ int ath11k_dp_tx(struct ath11k *ar, struct ath11k_vif *arvif,
}
ti.paddr = dma_map_single(ab->dev, skb->data, skb->len, DMA_TO_DEVICE);
- if (dma_mapping_error(ab->dev, ti.paddr)) {
+ if (unlikely(dma_mapping_error(ab->dev, ti.paddr))) {
atomic_inc(&ab->soc_stats.tx_err.misc_fail);
ath11k_warn(ab, "failed to DMA map data Tx buffer\n");
ret = -ENOMEM;
@@ -226,7 +226,7 @@ int ath11k_dp_tx(struct ath11k *ar, struct ath11k_vif *arvif,
ath11k_hal_srng_access_begin(ab, tcl_ring);
hal_tcl_desc = (void *)ath11k_hal_srng_src_get_next_entry(ab, tcl_ring);
- if (!hal_tcl_desc) {
+ if (unlikely(!hal_tcl_desc)) {
/* NOTE: It is highly unlikely we'll be running out of tcl_ring
* desc because the desc is directly enqueued onto hw queue.
*/
@@ -240,8 +240,8 @@ int ath11k_dp_tx(struct ath11k *ar, struct ath11k_vif *arvif,
* checking this ring earlier for each pkt tx.
* Restart ring selection if some rings are not checked yet.
*/
- if (ring_map != (BIT(DP_TCL_NUM_RING_MAX) - 1) &&
- !ar->ab->hw_params.tcl_0_only) {
+ if (unlikely(ring_map != (BIT(DP_TCL_NUM_RING_MAX) - 1) &&
+ !ar->ab->hw_params.tcl_0_only)) {
tcl_ring_retry = true;
ring_selector++;
}
@@ -322,7 +322,7 @@ ath11k_dp_tx_htt_tx_complete_buf(struct ath11k_base *ab,
spin_lock_bh(&tx_ring->tx_idr_lock);
msdu = idr_find(&tx_ring->txbuf_idr, ts->msdu_id);
- if (!msdu) {
+ if (unlikely(!msdu)) {
ath11k_warn(ab, "htt tx completion for unknown msdu_id %d\n",
ts->msdu_id);
spin_unlock_bh(&tx_ring->tx_idr_lock);
@@ -432,12 +432,12 @@ static void ath11k_dp_tx_complete_msdu(struct ath11k *ar,
rcu_read_lock();
- if (!rcu_dereference(ab->pdevs_active[ar->pdev_idx])) {
+ if (unlikely(!rcu_dereference(ab->pdevs_active[ar->pdev_idx]))) {
dev_kfree_skb_any(msdu);
goto exit;
}
- if (!skb_cb->vif) {
+ if (unlikely(!skb_cb->vif)) {
dev_kfree_skb_any(msdu);
goto exit;
}
@@ -460,7 +460,7 @@ static void ath11k_dp_tx_complete_msdu(struct ath11k *ar,
(info->flags & IEEE80211_TX_CTL_NO_ACK))
info->flags |= IEEE80211_TX_STAT_NOACK_TRANSMITTED;
- if (ath11k_debugfs_is_extd_tx_stats_enabled(ar)) {
+ if (unlikely(ath11k_debugfs_is_extd_tx_stats_enabled(ar))) {
if (ts->flags & HAL_TX_STATUS_FLAGS_FIRST_MSDU) {
if (ar->last_ppdu_id == 0) {
ar->last_ppdu_id = ts->ppdu_id;
@@ -500,11 +500,11 @@ static inline void ath11k_dp_tx_status_parse(struct ath11k_base *ab,
{
ts->buf_rel_source =
FIELD_GET(HAL_WBM_RELEASE_INFO0_REL_SRC_MODULE, desc->info0);
- if (ts->buf_rel_source != HAL_WBM_REL_SRC_MODULE_FW &&
- ts->buf_rel_source != HAL_WBM_REL_SRC_MODULE_TQM)
+ if (unlikely(ts->buf_rel_source != HAL_WBM_REL_SRC_MODULE_FW &&
+ ts->buf_rel_source != HAL_WBM_REL_SRC_MODULE_TQM))
return;
- if (ts->buf_rel_source == HAL_WBM_REL_SRC_MODULE_FW)
+ if (unlikely(ts->buf_rel_source == HAL_WBM_REL_SRC_MODULE_FW))
return;
ts->status = FIELD_GET(HAL_WBM_RELEASE_INFO0_TQM_RELEASE_REASON,
@@ -551,8 +551,9 @@ void ath11k_dp_tx_completion_handler(struct ath11k_base *ab, int ring_id)
ATH11K_TX_COMPL_NEXT(tx_ring->tx_status_head);
}
- if ((ath11k_hal_srng_dst_peek(ab, status_ring) != NULL) &&
- (ATH11K_TX_COMPL_NEXT(tx_ring->tx_status_head) == tx_ring->tx_status_tail)) {
+ if (unlikely((ath11k_hal_srng_dst_peek(ab, status_ring) != NULL) &&
+ (ATH11K_TX_COMPL_NEXT(tx_ring->tx_status_head) ==
+ tx_ring->tx_status_tail))) {
/* TODO: Process pending tx_status messages when kfifo_is_full() */
ath11k_warn(ab, "Unable to process some of the tx_status ring desc because status_fifo is full\n");
}
@@ -575,7 +576,7 @@ void ath11k_dp_tx_completion_handler(struct ath11k_base *ab, int ring_id)
mac_id = FIELD_GET(DP_TX_DESC_ID_MAC_ID, desc_id);
msdu_id = FIELD_GET(DP_TX_DESC_ID_MSDU_ID, desc_id);
- if (ts.buf_rel_source == HAL_WBM_REL_SRC_MODULE_FW) {
+ if (unlikely(ts.buf_rel_source == HAL_WBM_REL_SRC_MODULE_FW)) {
ath11k_dp_tx_process_htt_tx_complete(ab,
(void *)tx_status,
mac_id, msdu_id,
@@ -585,7 +586,7 @@ void ath11k_dp_tx_completion_handler(struct ath11k_base *ab, int ring_id)
spin_lock_bh(&tx_ring->tx_idr_lock);
msdu = idr_find(&tx_ring->txbuf_idr, msdu_id);
- if (!msdu) {
+ if (unlikely(!msdu)) {
ath11k_warn(ab, "tx completion for unknown msdu_id %d\n",
msdu_id);
spin_unlock_bh(&tx_ring->tx_idr_lock);
diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c
index eb52332dbe3f..32e90b13fac0 100644
--- a/drivers/net/wireless/ath/ath11k/mac.c
+++ b/drivers/net/wireless/ath/ath11k/mac.c
@@ -4333,7 +4333,7 @@ static void ath11k_mac_op_tx(struct ieee80211_hw *hw,
}
ret = ath11k_dp_tx(ar, arvif, skb);
- if (ret) {
+ if (unlikely(ret)) {
ath11k_warn(ar->ab, "failed to transmit frame %d\n", ret);
ieee80211_free_txskb(ar->hw, skb);
}
--
2.25.1
From: P Praneesh <[email protected]>
In datapath, add branch predictors where required in the process rx().
This protects high value rx path without having performance overhead.
Tested-on: QCN9074 hw1.0 PCI WLAN.HK.2.4.0.1.r2-00012-QCAHKSWPL_SILICONZ-1
Tested-on: IPQ8074 hw2.0 AHB WLAN.HK.2.4.0.1-01695-QCAHKSWPL_SILICONZ-1
Co-developed-by: Sriram R <[email protected]>
Signed-off-by: Sriram R <[email protected]>
Signed-off-by: P Praneesh <[email protected]>
Signed-off-by: Jouni Malinen <[email protected]>
---
drivers/net/wireless/ath/ath11k/dp_rx.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/drivers/net/wireless/ath/ath11k/dp_rx.c b/drivers/net/wireless/ath/ath11k/dp_rx.c
index 3ece90d890e5..6e9338f1b4db 100644
--- a/drivers/net/wireless/ath/ath11k/dp_rx.c
+++ b/drivers/net/wireless/ath/ath11k/dp_rx.c
@@ -2540,13 +2540,13 @@ static void ath11k_dp_rx_process_received_packets(struct ath11k_base *ab,
rcu_read_lock();
ar = ab->pdevs[mac_id].ar;
- if (!rcu_dereference(ab->pdevs_active[mac_id])) {
+ if (unlikely(!rcu_dereference(ab->pdevs_active[mac_id]))) {
__skb_queue_purge(msdu_list);
rcu_read_unlock();
return;
}
- if (test_bit(ATH11K_CAC_RUNNING, &ar->dev_flags)) {
+ if (unlikely(test_bit(ATH11K_CAC_RUNNING, &ar->dev_flags))) {
__skb_queue_purge(msdu_list);
rcu_read_unlock();
return;
@@ -2554,7 +2554,7 @@ static void ath11k_dp_rx_process_received_packets(struct ath11k_base *ab,
while ((msdu = __skb_dequeue(msdu_list))) {
ret = ath11k_dp_rx_process_msdu(ar, msdu, msdu_list);
- if (ret) {
+ if (unlikely(ret)) {
ath11k_dbg(ab, ATH11K_DBG_DATA,
"Unable to process msdu %d", ret);
dev_kfree_skb_any(msdu);
@@ -2609,7 +2609,7 @@ int ath11k_dp_process_rx(struct ath11k_base *ab, int ring_id,
rx_ring = &ar->dp.rx_refill_buf_ring;
spin_lock_bh(&rx_ring->idr_lock);
msdu = idr_find(&rx_ring->bufs_idr, buf_id);
- if (!msdu) {
+ if (unlikely(!msdu)) {
ath11k_warn(ab, "frame rx with invalid buf_id %d\n",
buf_id);
spin_unlock_bh(&rx_ring->idr_lock);
@@ -2628,8 +2628,8 @@ int ath11k_dp_process_rx(struct ath11k_base *ab, int ring_id,
push_reason = FIELD_GET(HAL_REO_DEST_RING_INFO0_PUSH_REASON,
desc->info0);
- if (push_reason !=
- HAL_REO_DEST_RING_PUSH_REASON_ROUTING_INSTRUCTION) {
+ if (unlikely(push_reason !=
+ HAL_REO_DEST_RING_PUSH_REASON_ROUTING_INSTRUCTION)) {
dev_kfree_skb_any(msdu);
ab->soc_stats.hal_reo_error[dp->reo_dst_ring[ring_id].ring_id]++;
continue;
@@ -2664,7 +2664,7 @@ int ath11k_dp_process_rx(struct ath11k_base *ab, int ring_id,
* head pointer so that we can reap complete MPDU in the current
* rx processing.
*/
- if (!done && ath11k_hal_srng_dst_num_free(ab, srng, true)) {
+ if (unlikely(!done && ath11k_hal_srng_dst_num_free(ab, srng, true))) {
ath11k_hal_srng_access_end(ab, srng);
goto try_again;
}
@@ -2673,7 +2673,7 @@ int ath11k_dp_process_rx(struct ath11k_base *ab, int ring_id,
spin_unlock_bh(&srng->lock);
- if (!total_msdu_reaped)
+ if (unlikely(!total_msdu_reaped))
goto exit;
for (i = 0; i < ab->num_radios; i++) {
--
2.25.1
On 2021-06-15 14:13, Jouni Malinen wrote:
> From: P Praneesh <[email protected]>
>
> tcl_data and reo_dst rings are currently being allocated
> using dma_allocate_coherent() which is non cachable.
>
> Allocating ring memory from cacheable memory area
> allows cached descriptor access and prefetch next
> descriptors to optimize CPU usage during
> descriptor processing on NAPI.
>
> Tested-on: QCN9074 hw1.0 PCI
> WLAN.HK.2.4.0.1.r2-00012-QCAHKSWPL_SILICONZ-1
> Tested-on: IPQ8074 hw2.0 AHB WLAN.HK.2.4.0.1-01695-QCAHKSWPL_SILICONZ-1
>
> Co-developed-by: Pradeep Kumar Chitrapu <[email protected]>
> Signed-off-by: Pradeep Kumar Chitrapu <[email protected]>
> Co-developed-by: Sriram R <[email protected]>
> Signed-off-by: Sriram R <[email protected]>
> Signed-off-by: P Praneesh <[email protected]>
> Signed-off-by: Jouni Malinen <[email protected]>
> ---
> drivers/net/wireless/ath/ath11k/dp.c | 34 +++++++++++++++++++++++----
> drivers/net/wireless/ath/ath11k/dp.h | 1 +
> drivers/net/wireless/ath/ath11k/hal.c | 25 ++++++++++++++++++--
> drivers/net/wireless/ath/ath11k/hal.h | 1 +
> 4 files changed, 54 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/net/wireless/ath/ath11k/dp.c
> b/drivers/net/wireless/ath/ath11k/dp.c
> index b0c8f6290099..cf869ebc209a 100644
> --- a/drivers/net/wireless/ath/ath11k/dp.c
> +++ b/drivers/net/wireless/ath/ath11k/dp.c
> @@ -101,8 +101,11 @@ void ath11k_dp_srng_cleanup(struct ath11k_base
> *ab,
> struct dp_srng *ring)
> if (!ring->vaddr_unaligned)
> return;
>
> - dma_free_coherent(ab->dev, ring->size, ring->vaddr_unaligned,
> - ring->paddr_unaligned);
> + if (ring->cached)
> + kfree(ring->vaddr_unaligned);
> + else
> + dma_free_coherent(ab->dev, ring->size,
> ring->vaddr_unaligned,
> + ring->paddr_unaligned);
>
> ring->vaddr_unaligned = NULL;
> }
> @@ -222,6 +225,7 @@ int ath11k_dp_srng_setup(struct ath11k_base *ab,
> struct dp_srng *ring,
> int entry_sz = ath11k_hal_srng_get_entrysize(ab, type);
> int max_entries = ath11k_hal_srng_get_max_entries(ab, type);
> int ret;
> + bool cached;
>
> if (max_entries < 0 || entry_sz < 0)
> return -EINVAL;
> @@ -229,10 +233,25 @@ int ath11k_dp_srng_setup(struct ath11k_base *ab,
> struct dp_srng *ring,
> if (num_entries > max_entries)
> num_entries = max_entries;
>
> + /* Allocate the reo dst and tx completion rings from cacheable
> memory */
> + switch (type) {
> + case HAL_REO_DST:
> + cached = true;
> + default:
> + cached = false;
> + }
> +
> ring->size = (num_entries * entry_sz) + HAL_RING_BASE_ALIGN - 1;
> - ring->vaddr_unaligned = dma_alloc_coherent(ab->dev, ring->size,
> - &ring->paddr_unaligned,
> - GFP_KERNEL);
> +
> + if (cached) {
> + ring->vaddr_unaligned = kzalloc(ring->size, GFP_KERNEL);
> + ring->paddr_unaligned = virt_to_phys(ring->vaddr_unaligned);
Internal developers found this is causing a fault in rx data path.
Suggested fix:
- ring->paddr_unaligned =
virt_to_phys(ring->vaddr_unaligned);
+ ring->paddr_unaligned = dma_map_single(ab->dev,
ring->vaddr_unaligned,
+ ring->size,
DMA_FROM_DEVICE);
> + } else {
> + ring->vaddr_unaligned = dma_alloc_coherent(ab->dev,
> ring->size,
> +
> &ring->paddr_unaligned,
> + GFP_KERNEL);
> + }
> +
> if (!ring->vaddr_unaligned)
> return -ENOMEM;
>
> @@ -292,6 +311,11 @@ int ath11k_dp_srng_setup(struct ath11k_base *ab,
> struct dp_srng *ring,
> return -EINVAL;
> }
>
> + if (cached) {
> + params.flags |= HAL_SRNG_FLAGS_CACHED;
> + ring->cached = 1;
> + }
> +
> ret = ath11k_hal_srng_setup(ab, type, ring_num, mac_id, ¶ms);
> if (ret < 0) {
> ath11k_warn(ab, "failed to setup srng: %d ring_id %d\n",
> diff --git a/drivers/net/wireless/ath/ath11k/dp.h
> b/drivers/net/wireless/ath/ath11k/dp.h
> index ee768ccce46e..e6591488a28c 100644
> --- a/drivers/net/wireless/ath/ath11k/dp.h
> +++ b/drivers/net/wireless/ath/ath11k/dp.h
> @@ -64,6 +64,7 @@ struct dp_srng {
> dma_addr_t paddr;
> int size;
> u32 ring_id;
> + u8 cached;
> };
>
> struct dp_rxdma_ring {
> diff --git a/drivers/net/wireless/ath/ath11k/hal.c
> b/drivers/net/wireless/ath/ath11k/hal.c
> index eaa0edca5576..a58e86e42b5b 100644
> --- a/drivers/net/wireless/ath/ath11k/hal.c
> +++ b/drivers/net/wireless/ath/ath11k/hal.c
> @@ -627,6 +627,21 @@ u32 *ath11k_hal_srng_dst_peek(struct ath11k_base
> *ab,
> struct hal_srng *srng)
> return NULL;
> }
>
> +static void ath11k_hal_srng_prefetch_desc(struct ath11k_base *ab,
> + struct hal_srng *srng)
> +{
> + u32 *desc;
> +
> + /* prefetch only if desc is available */
> + desc = ath11k_hal_srng_dst_peek(ab, srng);
> + if (likely(desc)) {
> + dma_sync_single_for_cpu(ab->dev, virt_to_phys(desc),
> + (srng->entry_size * sizeof(u32)),
> + DMA_FROM_DEVICE);
> + prefetch(desc);
> + }
> +}
> +
> u32 *ath11k_hal_srng_dst_get_next_entry(struct ath11k_base *ab,
> struct hal_srng *srng)
> {
> @@ -642,6 +657,10 @@ u32 *ath11k_hal_srng_dst_get_next_entry(struct
> ath11k_base *ab,
> srng->u.dst_ring.tp = (srng->u.dst_ring.tp + srng->entry_size) %
> srng->ring_size;
>
> + /* Try to prefetch the next descriptor in the ring */
> + if (srng->flags & HAL_SRNG_FLAGS_CACHED)
> + ath11k_hal_srng_prefetch_desc(ab, srng);
> +
> return desc;
> }
>
> @@ -775,11 +794,13 @@ void ath11k_hal_srng_access_begin(struct
> ath11k_base
> *ab, struct hal_srng *srng)
> {
> lockdep_assert_held(&srng->lock);
>
> - if (srng->ring_dir == HAL_SRNG_DIR_SRC)
> + if (srng->ring_dir == HAL_SRNG_DIR_SRC) {
> srng->u.src_ring.cached_tp =
> *(volatile u32 *)srng->u.src_ring.tp_addr;
> - else
> + } else {
> srng->u.dst_ring.cached_hp = *srng->u.dst_ring.hp_addr;
> + ath11k_hal_srng_prefetch_desc(ab, srng);
> + }
> }
>
> /* Update cached ring head/tail pointers to HW.
> ath11k_hal_srng_access_begin()
> diff --git a/drivers/net/wireless/ath/ath11k/hal.h
> b/drivers/net/wireless/ath/ath11k/hal.h
> index 35ed3a14e200..0f4f9ce74354 100644
> --- a/drivers/net/wireless/ath/ath11k/hal.h
> +++ b/drivers/net/wireless/ath/ath11k/hal.h
> @@ -513,6 +513,7 @@ enum hal_srng_dir {
> #define HAL_SRNG_FLAGS_DATA_TLV_SWAP 0x00000020
> #define HAL_SRNG_FLAGS_LOW_THRESH_INTR_EN 0x00010000
> #define HAL_SRNG_FLAGS_MSI_INTR 0x00020000
> +#define HAL_SRNG_FLAGS_CACHED 0x20000000
> #define HAL_SRNG_FLAGS_LMAC_RING 0x80000000
>
> #define HAL_SRNG_TLV_HDR_TAG GENMASK(9, 1)
> --
> 2.25.1
--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora
Forum,
a Linux Foundation Collaborative Project