Subject: [PATCH V3 1/2] ath6kl: Fix race in aggregation reorder logic

There are many places where tid data are accessed without
the lock (rxtid->lock), this can lead to a race condition
when the timeout handler for aggregatin reorder and the
receive function are getting executed at the same time.
Fix this race, but still there are races which can not
be fixed without rewriting the whole aggregation reorder
logic, for now fix the obvious ones.

Signed-off-by: Vasanthakumar Thiagarajan <[email protected]>
---
V3 - Move lock outside of the loop, this looks simple and harmless
- Fix FIXME on rxtid->lock, add comment on this lock.

drivers/net/wireless/ath/ath6kl/core.h | 12 +++++++++---
drivers/net/wireless/ath/ath6kl/txrx.c | 12 +++++++++---
2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/drivers/net/wireless/ath/ath6kl/core.h b/drivers/net/wireless/ath/ath6kl/core.h
index b1bc6bc..d6005bd 100644
--- a/drivers/net/wireless/ath/ath6kl/core.h
+++ b/drivers/net/wireless/ath/ath6kl/core.h
@@ -273,9 +273,15 @@ struct rxtid {
struct sk_buff_head q;

/*
- * FIXME: No clue what this should protect. Apparently it should
- * protect some of the fields above but they are also accessed
- * without taking the lock.
+ * lock mainly protects seq_next and hold_q. Movement of seq_next
+ * needs to be protected between aggr_timeout() and
+ * aggr_process_recv_frm(). hold_q will be holding the pending
+ * reorder frames and it's access should also be protected.
+ * Some of the other fields like hold_q_sz, win_sz and aggr are
+ * initialized/reset when receiving addba/delba req, also while
+ * deleting aggr state all the pending buffers are flushed before
+ * resetting these fields, so there should not be any race in accessing
+ * these fields.
*/
spinlock_t lock;
};
diff --git a/drivers/net/wireless/ath/ath6kl/txrx.c b/drivers/net/wireless/ath/ath6kl/txrx.c
index 67206ae..974c510 100644
--- a/drivers/net/wireless/ath/ath6kl/txrx.c
+++ b/drivers/net/wireless/ath/ath6kl/txrx.c
@@ -1036,6 +1036,7 @@ static void aggr_deque_frms(struct aggr_info_conn *agg_conn, u8 tid,
rxtid = &agg_conn->rx_tid[tid];
stats = &agg_conn->stat[tid];

+ spin_lock_bh(&rxtid->lock);
idx = AGGR_WIN_IDX(rxtid->seq_next, rxtid->hold_q_sz);

/*
@@ -1054,8 +1055,6 @@ static void aggr_deque_frms(struct aggr_info_conn *agg_conn, u8 tid,
seq_end = seq_no ? seq_no : rxtid->seq_next;
idx_end = AGGR_WIN_IDX(seq_end, rxtid->hold_q_sz);

- spin_lock_bh(&rxtid->lock);
-
do {
node = &rxtid->hold_q[idx];
if ((order == 1) && (!node->skb))
@@ -1127,11 +1126,13 @@ static bool aggr_process_recv_frm(struct aggr_info_conn *agg_conn, u8 tid,
((end > extended_end) && (cur > extended_end) &&
(cur < end))) {
aggr_deque_frms(agg_conn, tid, 0, 0);
+ spin_lock_bh(&rxtid->lock);
if (cur >= rxtid->hold_q_sz - 1)
rxtid->seq_next = cur - (rxtid->hold_q_sz - 1);
else
rxtid->seq_next = ATH6KL_MAX_SEQ_NO -
(rxtid->hold_q_sz - 2 - cur);
+ spin_unlock_bh(&rxtid->lock);
} else {
/*
* Dequeue only those frames that are outside the
@@ -1186,7 +1187,8 @@ static bool aggr_process_recv_frm(struct aggr_info_conn *agg_conn, u8 tid,

if (agg_conn->timer_scheduled)
rxtid->progress = true;
- else
+ else {
+ spin_lock_bh(&rxtid->lock);
for (idx = 0 ; idx < rxtid->hold_q_sz; idx++) {
if (rxtid->hold_q[idx].skb) {
/*
@@ -1204,6 +1206,8 @@ static bool aggr_process_recv_frm(struct aggr_info_conn *agg_conn, u8 tid,
break;
}
}
+ spin_unlock_bh(&rxtid->lock);
+ }

return is_queued;
}
@@ -1626,6 +1630,7 @@ static void aggr_timeout(unsigned long arg)
rxtid = &aggr_conn->rx_tid[i];

if (rxtid->aggr && rxtid->hold_q) {
+ spin_lock_bh(&rxtid->lock);
for (j = 0; j < rxtid->hold_q_sz; j++) {
if (rxtid->hold_q[j].skb) {
aggr_conn->timer_scheduled = true;
@@ -1634,6 +1639,7 @@ static void aggr_timeout(unsigned long arg)
break;
}
}
+ spin_unlock_bh(&rxtid->lock);

if (j >= rxtid->hold_q_sz)
rxtid->timer_mon = false;
--
1.7.0.4



Subject: [PATCH V3 2/2] ath6kl: Fix unstable downlink throughput

There is frequent downlink throughput drop to 0 when operating
at the signal level between -42dBm to -53dBm. This has been root
caused to the delay in releasing pending a-mpdu subframes in
reorder buffer. Right now the timeout value is 400ms, there
is also a race condition where timeout handler can be delayed
to run at an extra timeout interval. This patch reduces the
timout interval to reasonable 100ms and makes sure releasing
pending frames are not skipped in the timeout handler by removing
the flag (rxtid->progress) which can delay the timeout logic.

Reported-by: Yu Yanzhi <[email protected]>
Signed-off-by: Vasanthakumar Thiagarajan <[email protected]>
---
V2 - Fix code alignment
V3 - Move lock to outside of loop

drivers/net/wireless/ath/ath6kl/core.h | 3 +-
drivers/net/wireless/ath/ath6kl/txrx.c | 42 +++++++++++++------------------
2 files changed, 19 insertions(+), 26 deletions(-)

diff --git a/drivers/net/wireless/ath/ath6kl/core.h b/drivers/net/wireless/ath/ath6kl/core.h
index d6005bd..0bfef2d 100644
--- a/drivers/net/wireless/ath/ath6kl/core.h
+++ b/drivers/net/wireless/ath/ath6kl/core.h
@@ -215,7 +215,7 @@ enum ath6kl_hw_flags {

#define AGGR_NUM_OF_FREE_NETBUFS 16

-#define AGGR_RX_TIMEOUT 400 /* in ms */
+#define AGGR_RX_TIMEOUT 100 /* in ms */

#define WMI_TIMEOUT (2 * HZ)

@@ -264,7 +264,6 @@ struct skb_hold_q {

struct rxtid {
bool aggr;
- bool progress;
bool timer_mon;
u16 win_sz;
u16 seq_next;
diff --git a/drivers/net/wireless/ath/ath6kl/txrx.c b/drivers/net/wireless/ath/ath6kl/txrx.c
index 974c510..7dfa0fd 100644
--- a/drivers/net/wireless/ath/ath6kl/txrx.c
+++ b/drivers/net/wireless/ath/ath6kl/txrx.c
@@ -1186,28 +1186,25 @@ static bool aggr_process_recv_frm(struct aggr_info_conn *agg_conn, u8 tid,
aggr_deque_frms(agg_conn, tid, 0, 1);

if (agg_conn->timer_scheduled)
- rxtid->progress = true;
- else {
- spin_lock_bh(&rxtid->lock);
- for (idx = 0 ; idx < rxtid->hold_q_sz; idx++) {
- if (rxtid->hold_q[idx].skb) {
- /*
- * There is a frame in the queue and no
- * timer so start a timer to ensure that
- * the frame doesn't remain stuck
- * forever.
- */
- agg_conn->timer_scheduled = true;
- mod_timer(&agg_conn->timer,
- (jiffies +
- HZ * (AGGR_RX_TIMEOUT) / 1000));
- rxtid->progress = false;
- rxtid->timer_mon = true;
- break;
- }
+ return is_queued;
+
+ spin_lock_bh(&rxtid->lock);
+ for (idx = 0 ; idx < rxtid->hold_q_sz; idx++) {
+ if (rxtid->hold_q[idx].skb) {
+ /*
+ * There is a frame in the queue and no
+ * timer so start a timer to ensure that
+ * the frame doesn't remain stuck
+ * forever.
+ */
+ agg_conn->timer_scheduled = true;
+ mod_timer(&agg_conn->timer,
+ (jiffies + (HZ * AGGR_RX_TIMEOUT) / 1000));
+ rxtid->timer_mon = true;
+ break;
}
- spin_unlock_bh(&rxtid->lock);
}
+ spin_unlock_bh(&rxtid->lock);

return is_queued;
}
@@ -1612,7 +1609,7 @@ static void aggr_timeout(unsigned long arg)
rxtid = &aggr_conn->rx_tid[i];
stats = &aggr_conn->stat[i];

- if (!rxtid->aggr || !rxtid->timer_mon || rxtid->progress)
+ if (!rxtid->aggr || !rxtid->timer_mon)
continue;

stats->num_timeouts++;
@@ -1635,7 +1632,6 @@ static void aggr_timeout(unsigned long arg)
if (rxtid->hold_q[j].skb) {
aggr_conn->timer_scheduled = true;
rxtid->timer_mon = true;
- rxtid->progress = false;
break;
}
}
@@ -1666,7 +1662,6 @@ static void aggr_delete_tid_state(struct aggr_info_conn *aggr_conn, u8 tid)
aggr_deque_frms(aggr_conn, tid, 0, 0);

rxtid->aggr = false;
- rxtid->progress = false;
rxtid->timer_mon = false;
rxtid->win_sz = 0;
rxtid->seq_next = 0;
@@ -1745,7 +1740,6 @@ void aggr_conn_init(struct ath6kl_vif *vif, struct aggr_info *aggr_info,
for (i = 0; i < NUM_OF_TIDS; i++) {
rxtid = &aggr_conn->rx_tid[i];
rxtid->aggr = false;
- rxtid->progress = false;
rxtid->timer_mon = false;
skb_queue_head_init(&rxtid->q);
spin_lock_init(&rxtid->lock);
--
1.7.0.4


2012-06-11 13:14:26

by Kalle Valo

[permalink] [raw]
Subject: Re: [PATCH V3 1/2] ath6kl: Fix race in aggregation reorder logic

On 05/30/2012 09:57 AM, Vasanthakumar Thiagarajan wrote:
> There are many places where tid data are accessed without
> the lock (rxtid->lock), this can lead to a race condition
> when the timeout handler for aggregatin reorder and the
> receive function are getting executed at the same time.
> Fix this race, but still there are races which can not
> be fixed without rewriting the whole aggregation reorder
> logic, for now fix the obvious ones.
>
> Signed-off-by: Vasanthakumar Thiagarajan <[email protected]>

Thanks, both patches applied.

Kalle