2003-07-17 22:01:19

by Daniel Ritz

[permalink] [raw]
Subject: [PATCH 2.4] fixes for airo.c

hello

in 2.4.20+ airo.c is broken and can even kill keventd. this patch fixes it:
- sane locking with spinlocks and irqs disabled instead of the buggy locking
with semaphores
- fix transmit code
- safer unload ordering of disable irq, unregister_netdev, kfree
- fix possible loop-forever bug
- fix a buffer overflow

a previous version of the patch is tested by some people with good results.
against 2.4.22-pre6-bk. please apply.


rgds
-daniel


ps: a 2.6 version will follow soon...




--- 1.25/drivers/net/wireless/airo.c Wed Apr 23 19:43:55 2003
+++ edited/airo.c Thu Jul 17 22:19:02 2003
@@ -933,9 +933,8 @@
static void disable_MAC(struct airo_info *ai);
static void enable_interrupts(struct airo_info*);
static void disable_interrupts(struct airo_info*);
-static u16 issuecommand(struct airo_info*, Cmd *pCmd, Resp *pRsp);
-static u16 sendcommand(struct airo_info *ai, Cmd *pCmd);
-static void completecommand(struct airo_info *ai, Resp *pRsp);
+static int issuecommand(struct airo_info*, Cmd *pCmd, Resp *pRsp);
+static int issuecommand_nolock(struct airo_info*, Cmd *pCmd, Resp *pRsp);
static int bap_setup(struct airo_info*, u16 rid, u16 offset, int whichbap);
static int aux_bap_read(struct airo_info*, u16 *pu16Dst, int bytelen,
int whichbap);
@@ -945,13 +944,14 @@
int whichbap);
static int PC4500_accessrid(struct airo_info*, u16 rid, u16 accmd);
static int PC4500_readrid(struct airo_info*, u16 rid, void *pBuf, int len);
+static int PC4500_readrid_nolock(struct airo_info*, u16 rid, void *pBuf, int len);
static int PC4500_writerid(struct airo_info*, u16 rid, const void
*pBuf, int len);
static int do_writerid( struct airo_info*, u16 rid, const void *rid_data,
int len );
static u16 transmit_allocate(struct airo_info*, int lenPayload, int raw);
-static int transmit_802_3_packet(struct airo_info*, int len, char *pPacket);
-static int transmit_802_11_packet(struct airo_info*, int len, char *pPacket);
+static int transmit_802_3_packet(struct airo_info*, u16 txFid, char *pPacket, int len);
+static int transmit_802_11_packet(struct airo_info*, u16 txFid, char *pPacket, int len);

static void airo_interrupt( int irq, void* dev_id, struct pt_regs
*regs);
@@ -987,12 +987,11 @@
struct timer_list timer;
struct proc_dir_entry *proc_entry;
struct airo_info *next;
- spinlock_t aux_lock;
+ spinlock_t main_lock;
unsigned long flags;
#define FLAG_PROMISC IFF_PROMISC /* 0x100 - include/linux/if.h */
#define FLAG_RADIO_OFF 0x02 /* User disabling of MAC */
#define FLAG_RADIO_DOWN 0x08 /* ifup/ifdown disabling of MAC */
-#define FLAG_LOCKED 2 /* 0x04 - use as a bit offset */
#define FLAG_FLASHING 0x10
#define FLAG_ADHOC 0x01 /* Needed by MIC */
#define FLAG_MIC_CAPABLE 0x20
@@ -1003,14 +1002,8 @@
int whichbap);
unsigned short *flash;
tdsRssiEntry *rssi;
- struct semaphore sem;
struct task_struct *task;
struct tq_struct promisc_task;
- struct {
- struct sk_buff *skb;
- int fid;
- struct tq_struct task;
- } xmit, xmit11;
struct net_device *wifidev;
#ifdef WIRELESS_EXT
struct iw_statistics wstats; // wireless stats
@@ -1051,10 +1044,8 @@
if (first == 1) {
memset(&cmd, 0, sizeof(cmd));
cmd.cmd=CMD_LISTBSS;
- if (down_interruptible(&ai->sem))
- return -ERESTARTSYS;
issuecommand(ai, &cmd, &rsp);
- up(&ai->sem);
+
/* Let the command take effect */
set_current_state (TASK_INTERRUPTIBLE);
ai->task = current;
@@ -1199,7 +1190,7 @@
statr->len = le16_to_cpu(statr->len);
for(s = &statr->mode; s <= &statr->SSIDlen; s++) *s = le16_to_cpu(*s);

- for(s = &statr->beaconPeriod; s <= &statr->_reserved[9]; s++)
+ for(s = &statr->beaconPeriod; s <= &statr->_reserved1; s++)
*s = le16_to_cpu(*s);

return rc;
@@ -1312,44 +1303,15 @@
}
}

-static void airo_do_xmit(struct net_device *dev) {
- u16 status;
- int i;
- struct airo_info *priv = dev->priv;
- struct sk_buff *skb = priv->xmit.skb;
- int fid = priv->xmit.fid;
- u32 *fids = priv->fids;
-
- if (down_trylock(&priv->sem) != 0) {
- netif_stop_queue(dev);
- priv->xmit.task.routine = (void (*)(void *))airo_do_xmit;
- priv->xmit.task.data = (void *)dev;
- schedule_task(&priv->xmit.task);
- return;
- }
- status = transmit_802_3_packet (priv, fids[fid], skb->data);
- up(&priv->sem);
-
- i = 0;
- if ( status == SUCCESS ) {
- dev->trans_start = jiffies;
- for (; i < MAX_FIDS / 2 && (priv->fids[i] & 0xffff0000); i++);
- } else {
- priv->fids[fid] &= 0xffff;
- priv->stats.tx_window_errors++;
- }
- if (i < MAX_FIDS / 2)
- netif_wake_queue(dev);
- else
- netif_stop_queue(dev);
- dev_kfree_skb(skb);
-}
-
-static int airo_start_xmit(struct sk_buff *skb, struct net_device *dev) {
- s16 len;
+static int airo_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ int len;
int i;
+ int ret = 0;
+ int status;
+ unsigned long flags;
struct airo_info *priv = dev->priv;
- u32 *fids = priv->fids;
+ int *fids = priv->fids;

if ( skb == NULL ) {
printk( KERN_ERR "airo: skb == NULL!!!\n" );
@@ -1357,61 +1319,50 @@
}

/* Find a vacant FID */
- for( i = 0; i < MAX_FIDS / 2 && (fids[i] & 0xffff0000); i++ );
+ spin_lock_irqsave(&priv->main_lock, flags);
+ for (i = 0; i < MAX_FIDS / 2 && (fids[i] & 0xffff0000); i++);

- if ( i == MAX_FIDS / 2 ) {
- priv->stats.tx_fifo_errors++;
- dev_kfree_skb(skb);
- } else {
- /* check min length*/
- len = ETH_ZLEN < skb->len ? skb->len : ETH_ZLEN;
- /* Mark fid as used & save length for later */
- fids[i] |= (len << 16);
- priv->xmit.skb = skb;
- priv->xmit.fid = i;
- airo_do_xmit(dev);
- }
- return 0;
-}
-
-static void airo_do_xmit11(struct net_device *dev) {
- u16 status;
- int i;
- struct airo_info *priv = dev->priv;
- struct sk_buff *skb = priv->xmit11.skb;
- int fid = priv->xmit11.fid;
- u32 *fids = priv->fids;
-
- if (down_trylock(&priv->sem) != 0) {
+ if (i + 1 >= MAX_FIDS / 2) {
netif_stop_queue(dev);
- priv->xmit11.task.routine = (void (*)(void *))airo_do_xmit11;
- priv->xmit11.task.data = (void *)dev;
- schedule_task(&priv->xmit11.task);
- return;
+
+ /* we cannot transmit */
+ if (i == MAX_FIDS / 2) {
+ priv->stats.tx_fifo_errors++;
+ ret = 1;
+ goto tx_done;
+ }
}
- status = transmit_802_11_packet (priv, fids[fid], skb->data);
- up(&priv->sem);
+
+ /* check min length*/
+ len = ETH_ZLEN < skb->len ? skb->len : ETH_ZLEN;
+ status = transmit_802_3_packet (priv, fids[i], skb->data, len);

- i = MAX_FIDS / 2;
- if ( status == SUCCESS ) {
+ if (status == SUCCESS) {
+ /* Mark fid as used & save length for later */
+ fids[i] |= (len << 16);
dev->trans_start = jiffies;
- for (; i < MAX_FIDS && (priv->fids[i] & 0xffff0000); i++);
- } else {
- priv->fids[fid] &= 0xffff;
+ }
+ else {
priv->stats.tx_window_errors++;
+ ret = 1;
}
- if (i < MAX_FIDS)
- netif_wake_queue(dev);
- else
- netif_stop_queue(dev);
- dev_kfree_skb(skb);
+
+tx_done:
+ spin_unlock_irqrestore(&priv->main_lock, flags);
+ if (!ret)
+ dev_kfree_skb(skb);
+ return ret;
}

-static int airo_start_xmit11(struct sk_buff *skb, struct net_device *dev) {
- s16 len;
+static int airo_start_xmit11(struct sk_buff *skb, struct net_device *dev)
+{
+ int len;
int i;
+ int ret = 0;
+ int status;
+ unsigned long flags;
struct airo_info *priv = dev->priv;
- u32 *fids = priv->fids;
+ int *fids = priv->fids;

if ( skb == NULL ) {
printk( KERN_ERR "airo: skb == NULL!!!\n" );
@@ -1419,21 +1370,39 @@
}

/* Find a vacant FID */
+ spin_lock_irqsave(&priv->main_lock, flags);
for( i = MAX_FIDS / 2; i < MAX_FIDS && (fids[i] & 0xffff0000); i++ );

- if ( i == MAX_FIDS ) {
- priv->stats.tx_fifo_errors++;
- dev_kfree_skb(skb);
- } else {
- /* check min length*/
- len = ETH_ZLEN < skb->len ? skb->len : ETH_ZLEN;
- /* Mark fid as used & save length for later */
+ if (i + 1 >= MAX_FIDS) {
+ netif_stop_queue(dev);
+
+ /* we cannot transmit */
+ if (i == MAX_FIDS) {
+ priv->stats.tx_fifo_errors++;
+ ret = 1;
+ goto tx_done;
+ }
+ }
+
+ /* check min length*/
+ len = ETH_ZLEN < skb->len ? skb->len : ETH_ZLEN;
+ status = transmit_802_11_packet (priv, fids[i], skb->data, len);
+
+ if (status == SUCCESS) {
+ /* Mark fid as used & save length for later */
fids[i] |= (len << 16);
- priv->xmit11.skb = skb;
- priv->xmit11.fid = i;
- airo_do_xmit11(dev);
+ dev->trans_start = jiffies;
}
- return 0;
+ else {
+ priv->stats.tx_window_errors++;
+ ret = 1;
+ }
+
+tx_done:
+ spin_unlock_irqrestore(&priv->main_lock, flags);
+ if (!ret)
+ dev_kfree_skb(skb);
+ return ret;
}

struct net_device_stats *airo_get_stats(struct net_device *dev)
@@ -1463,36 +1432,19 @@
return &local->stats;
}

-static void airo_end_promisc(struct airo_info *ai) {
- Resp rsp;
-
- if ((IN4500(ai, EVSTAT) & EV_CMD) != 0) {
- completecommand(ai, &rsp);
- up(&ai->sem);
- } else {
- ai->promisc_task.routine = (void (*)(void *))airo_end_promisc;
- ai->promisc_task.data = (void *)ai;
- schedule_task(&ai->promisc_task);
- }
-}
-
-static void airo_set_promisc(struct airo_info *ai) {
+static void airo_set_promisc(struct airo_info *ai)
+{
Cmd cmd;
+ Resp rsp;

- if (down_trylock(&ai->sem) == 0) {
- memset(&cmd, 0, sizeof(cmd));
- cmd.cmd=CMD_SETMODE;
- cmd.parm0=(ai->flags&IFF_PROMISC) ? PROMISC : NOPROMISC;
- sendcommand(ai, &cmd);
- airo_end_promisc(ai);
- } else {
- ai->promisc_task.routine = (void (*)(void *))airo_set_promisc;
- ai->promisc_task.data = (void *)ai;
- schedule_task(&ai->promisc_task);
- }
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.cmd = CMD_SETMODE;
+ cmd.parm0 = (ai->flags & IFF_PROMISC) ? PROMISC : NOPROMISC;
+ issuecommand(ai, &cmd, &rsp);
}

-static void airo_set_multicast_list(struct net_device *dev) {
+static void airo_set_multicast_list(struct net_device *dev)
+{
struct airo_info *ai = dev->priv;

if ((dev->flags ^ ai->flags) & IFF_PROMISC) {
@@ -1557,11 +1509,10 @@
{
struct airo_info *ai = dev->priv;
flush_scheduled_tasks();
- if (ai->flash)
- kfree(ai->flash);
- if (ai->rssi)
- kfree(ai->rssi);
- takedown_proc_entry( dev, ai );
+
+ disable_interrupts(ai);
+ free_irq(dev->irq, dev);
+
if (ai->registered) {
unregister_netdev( dev );
if (ai->wifidev) {
@@ -1571,9 +1522,15 @@
}
ai->registered = 0;
}
- disable_interrupts(ai);
- free_irq( dev->irq, dev );
- if (auto_wep) del_timer_sync(&ai->timer);
+
+ if (ai->flash)
+ kfree(ai->flash);
+ if (ai->rssi)
+ kfree(ai->rssi);
+ takedown_proc_entry( dev, ai );
+
+ if (auto_wep)
+ del_timer_sync(&ai->timer);
if (freeres) {
/* PCMCIA frees this stuff, so only for PCI and ISA */
release_region( dev->base_addr, 64 );
@@ -1670,8 +1627,7 @@
ai->wifidev = 0;
ai->registered = 0;
ai->dev = dev;
- ai->aux_lock = SPIN_LOCK_UNLOCKED;
- sema_init(&ai->sem, 1);
+ ai->main_lock = SPIN_LOCK_UNLOCKED;
ai->need_commit = 0;
ai->config.len = 0;
rc = add_airo_dev( dev );
@@ -1736,7 +1692,6 @@
ai->fids[i] = transmit_allocate(ai,2312,i>=MAX_FIDS/2);

setup_proc_entry( dev, dev->priv ); /* XXX check for failure */
- netif_start_queue(dev);
SET_MODULE_OWNER(dev);
return dev;

@@ -1800,47 +1755,31 @@
EXPORT_SYMBOL(reset_airo_card);

#if WIRELESS_EXT > 13
+/* must be called with lock held */
static void airo_send_event(struct net_device *dev) {
struct airo_info *ai = dev->priv;
union iwreq_data wrqu;
StatusRid status_rid;

- if (down_trylock(&ai->sem) == 0) {
- __set_bit(FLAG_LOCKED, &ai->flags);
- PC4500_readrid(ai, RID_STATUS, &status_rid, sizeof(status_rid));
- clear_bit(FLAG_LOCKED, &ai->flags);
- up(&ai->sem);
- wrqu.data.length = 0;
- wrqu.data.flags = 0;
- memcpy(wrqu.ap_addr.sa_data, status_rid.bssid[0], ETH_ALEN);
- wrqu.ap_addr.sa_family = ARPHRD_ETHER;
+ PC4500_readrid_nolock(ai, RID_STATUS, &status_rid, sizeof(status_rid));

- /* Send event to user space */
- wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL);
- } else {
- ai->event_task.routine = (void (*)(void *))airo_send_event;
- ai->event_task.data = (void *)dev;
- schedule_task(&ai->event_task);
- }
+ wrqu.data.length = 0;
+ wrqu.data.flags = 0;
+ memcpy(wrqu.ap_addr.sa_data, status_rid.bssid[0], ETH_ALEN);
+ wrqu.ap_addr.sa_family = ARPHRD_ETHER;
+
+ /* Send event to user space */
+ wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL);
}
#endif

static void airo_read_mic(struct airo_info *ai) {
+#ifdef MICSUPPORT
MICRid mic_rid;

- if (down_trylock(&ai->sem) == 0) {
- __set_bit(FLAG_LOCKED, &ai->flags);
- PC4500_readrid(ai, RID_MIC, &mic_rid, sizeof(mic_rid));
- clear_bit(FLAG_LOCKED, &ai->flags);
- up(&ai->sem);
-#ifdef MICSUPPORT
- micinit (ai, &mic_rid);
+ PC4500_readrid(ai, RID_MIC, &mic_rid, sizeof(mic_rid));
+ micinit (ai, &mic_rid);
#endif
- } else {
- ai->mic_task.routine = (void (*)(void *))airo_read_mic;
- ai->mic_task.data = (void *)ai;
- schedule_task(&ai->mic_task);
- }
}

static void airo_interrupt ( int irq, void* dev_id, struct pt_regs *regs) {
@@ -1853,6 +1792,7 @@
if (!netif_device_present(dev))
return;

+ spin_lock(&apriv->main_lock);
for (;;) {
status = IN4500( apriv, EVSTAT );
if ( !(status & STATUS_INTS) || status == 0xffff ) break;
@@ -1869,7 +1809,8 @@

if ( status & EV_MIC ) {
OUT4500( apriv, EVACK, EV_MIC );
- airo_read_mic( apriv );
+ if (apriv->flags & FLAG_MIC_CAPABLE)
+ airo_read_mic( apriv );
}
if ( status & EV_LINK ) {
#if WIRELESS_EXT > 13
@@ -2118,10 +2059,14 @@
index = i;
/* Set up to be used again */
apriv->fids[i] &= 0xffff;
+
+ if (i < MAX_FIDS / 2)
+ netif_wake_queue(dev);
+ else
+ netif_wake_queue(apriv->wifidev);
}
}
if (index != -1) {
- netif_wake_queue(dev);
if (status & EV_TXEXC)
get_tx_error(apriv, index);
}
@@ -2137,6 +2082,7 @@

if (savedInterrupts)
OUT4500( apriv, EVINTEN, savedInterrupts );
+ spin_unlock(&apriv->main_lock);

/* done.. */
return;
@@ -2172,8 +2118,8 @@
return rc;
}

-static int enable_MAC( struct airo_info *ai, Resp *rsp ) {
- int rc;
+static int enable_MAC( struct airo_info *ai, Resp *rsp )
+{
Cmd cmd;

/* FLAG_RADIO_OFF : Radio disabled via /proc or Wireless Extensions
@@ -2185,45 +2131,41 @@
if (ai->flags & (FLAG_RADIO_OFF|FLAG_RADIO_DOWN)) return SUCCESS;
memset(&cmd, 0, sizeof(cmd));
cmd.cmd = MAC_ENABLE;
- if (test_bit(FLAG_LOCKED, &ai->flags) != 0)
- return issuecommand(ai, &cmd, rsp);
-
- if (down_interruptible(&ai->sem))
- return -ERESTARTSYS;
- rc = issuecommand(ai, &cmd, rsp);
- up(&ai->sem);
- return rc;
+ return issuecommand(ai, &cmd, rsp);
}

-static void disable_MAC( struct airo_info *ai ) {
+static void disable_MAC(struct airo_info *ai)
+{
Cmd cmd;
Resp rsp;

memset(&cmd, 0, sizeof(cmd));
cmd.cmd = MAC_DISABLE; // disable in case already enabled
- if (test_bit(FLAG_LOCKED, &ai->flags) != 0) {
- issuecommand(ai, &cmd, &rsp);
- return;
- }
-
- if (down_interruptible(&ai->sem))
- return;
issuecommand(ai, &cmd, &rsp);
- up(&ai->sem);
}

-static void enable_interrupts( struct airo_info *ai ) {
+static void enable_interrupts(struct airo_info *ai)
+{
+ unsigned long flags;
+ u16 status;
+ spin_lock_irqsave(&ai->main_lock, flags);
+
/* Reset the status register */
- u16 status = IN4500( ai, EVSTAT );
+ status = IN4500( ai, EVSTAT );
OUT4500( ai, EVACK, status );
+
/* Enable the interrupts */
OUT4500( ai, EVINTEN, STATUS_INTS );
- /* Note there is a race condition between the last two lines that
- I dont know how to get rid of right now... */
+
+ spin_unlock_irqrestore(&ai->main_lock, flags);
}

-static void disable_interrupts( struct airo_info *ai ) {
+static void disable_interrupts(struct airo_info *ai)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&ai->main_lock, flags);
OUT4500( ai, EVINTEN, 0 );
+ spin_unlock_irqrestore(&ai->main_lock, flags);
}

static u16 setup_card(struct airo_info *ai, u8 *mac)
@@ -2246,23 +2188,20 @@
/* The NOP is the first step in getting the card going */
cmd.cmd = NOP;
cmd.parm0 = cmd.parm1 = cmd.parm2 = 0;
- if (down_interruptible(&ai->sem))
+ if (spin_is_locked(&ai->main_lock))
return ERROR;
- if ( issuecommand( ai, &cmd, &rsp ) != SUCCESS ) {
- up(&ai->sem);
+ if (issuecommand(ai, &cmd, &rsp) != SUCCESS)
return ERROR;
- }
+
memset(&cmd, 0, sizeof(cmd));
cmd.cmd = MAC_DISABLE; // disable in case already enabled
- if ( issuecommand( ai, &cmd, &rsp ) != SUCCESS ) {
- up(&ai->sem);
+ if (issuecommand( ai, &cmd, &rsp) != SUCCESS )
return ERROR;
- }
+

// Let's figure out if we need to use the AUX port
cmd.cmd = CMD_ENABLEAUX;
if (issuecommand(ai, &cmd, &rsp) != SUCCESS) {
- up(&ai->sem);
printk(KERN_ERR "airo: Error checking for AUX port\n");
return ERROR;
}
@@ -2273,7 +2212,7 @@
ai->bap_read = aux_bap_read;
printk(KERN_DEBUG "airo: Doing AUX bap_reads\n");
}
- up(&ai->sem);
+
if (ai->config.len == 0) {
tdsRssiRid rssi_rid;
CapabilityRid cap_rid;
@@ -2378,50 +2317,35 @@
}
return SUCCESS;
}
+static int issuecommand(struct airo_info *ai, Cmd *pCmd, Resp *pRsp) {
+ int rc;
+ unsigned long flags;

-static u16 issuecommand(struct airo_info *ai, Cmd *pCmd, Resp *pRsp) {
- // Im really paranoid about letting it run forever!
- int max_tries = 600000;
-
- if (sendcommand(ai, pCmd) == (u16)ERROR)
- return ERROR;
-
- while (max_tries-- && (IN4500(ai, EVSTAT) & EV_CMD) == 0) {
- if (!in_interrupt() && (max_tries & 255) == 0)
- schedule();
- }
- if ( max_tries == -1 ) {
- printk( KERN_ERR
- "airo: Max tries exceeded waiting for command\n" );
- return ERROR;
- }
- completecommand(ai, pRsp);
- return SUCCESS;
+ spin_lock_irqsave(&ai->main_lock, flags);
+ rc = issuecommand_nolock(ai, pCmd, pRsp);
+ spin_unlock_irqrestore(&ai->main_lock, flags);
+ return rc;
}

-static u16 sendcommand(struct airo_info *ai, Cmd *pCmd) {
- // Im really paranoid about letting it run forever!
+static int issuecommand_nolock(struct airo_info *ai, Cmd *pCmd, Resp *pRsp) {
+ // Im really paranoid about letting it run forever!
int max_tries = 600000;
- u16 cmd;

OUT4500(ai, PARAM0, pCmd->parm0);
OUT4500(ai, PARAM1, pCmd->parm1);
OUT4500(ai, PARAM2, pCmd->parm2);
OUT4500(ai, COMMAND, pCmd->cmd);
- while ( max_tries-- && (IN4500(ai, EVSTAT) & EV_CMD) == 0 &&
- (cmd = IN4500(ai, COMMAND)) != 0 )
- if (cmd == pCmd->cmd)
- // PC4500 didn't notice command, try again
- OUT4500(ai, COMMAND, pCmd->cmd);
- if ( max_tries == -1 ) {
+ while (max_tries-- && (IN4500(ai, EVSTAT) & EV_CMD) == 0) {
+ if (IN4500(ai, COMMAND) == pCmd->cmd) {
+ // PC4500 didn't notice command, try again
+ OUT4500(ai, COMMAND, pCmd->cmd);
+ }
+ }
+ if (max_tries == -1) {
printk( KERN_ERR
"airo: Max tries exceeded when issueing command\n" );
return ERROR;
}
- return SUCCESS;
-}
-
-static void completecommand(struct airo_info *ai, Resp *pRsp) {
// command completed
pRsp->status = IN4500(ai, STATUS);
pRsp->rsp0 = IN4500(ai, RESP0);
@@ -2434,8 +2358,10 @@
}
// acknowledge processing the status/response
OUT4500(ai, EVACK, EV_CMD);
+ return SUCCESS;
}

+
/* Sets up the bap to start exchange data. whichbap should
* be one of the BAP0 or BAP1 defines. Locks should be held before
* calling! */
@@ -2500,9 +2426,7 @@
u16 next;
int words;
int i;
- long flags;

- spin_lock_irqsave(&ai->aux_lock, flags);
page = IN4500(ai, SWS0+whichbap);
offset = IN4500(ai, SWS2+whichbap);
next = aux_setup(ai, page, offset, &len);
@@ -2522,7 +2446,6 @@
next = aux_setup(ai, next, 4, &len);
}
}
- spin_unlock_irqrestore(&ai->aux_lock, flags);
return SUCCESS;
}

@@ -2561,7 +2484,7 @@
memset(&cmd, 0, sizeof(cmd));
cmd.cmd = accmd;
cmd.parm0 = rid;
- status = issuecommand(ai, &cmd, &rsp);
+ status = issuecommand_nolock(ai, &cmd, &rsp);
if (status != 0) return status;
if ( (rsp.status & 0x7F00) != 0) {
return (accmd << 8) + (rsp.rsp0 & 0xFF);
@@ -2570,25 +2493,16 @@
}

/* Note, that we are using BAP1 which is also used by transmit, so
- * we must get a lock. */
-static int PC4500_readrid(struct airo_info *ai, u16 rid, void *pBuf, int len)
+ * it must be called with main_lock held. */
+static int PC4500_readrid_nolock(struct airo_info *ai, u16 rid, void *pBuf, int len)
{
- u16 status, dolock = 0;
- int rc = SUCCESS;
+ u16 status;
+
+ if ((status = PC4500_accessrid(ai, rid, CMD_ACCESS)) != SUCCESS)
+ return status;
+ if (bap_setup(ai, rid, 0, BAP1) != SUCCESS)
+ return ERROR;

- if (test_bit(FLAG_LOCKED, &ai->flags) == 0) {
- dolock = 1;
- if (down_interruptible(&ai->sem))
- return ERROR;
- }
- if ( (status = PC4500_accessrid(ai, rid, CMD_ACCESS)) != SUCCESS) {
- rc = status;
- goto done;
- }
- if (bap_setup(ai, rid, 0, BAP1) != SUCCESS) {
- rc = ERROR;
- goto done;
- }
// read the rid length field
bap_read(ai, pBuf, 2, BAP1);
// length for remaining part of rid
@@ -2599,30 +2513,34 @@
"airo: Rid %x has a length of %d which is too short\n",
(int)rid,
(int)len );
- rc = ERROR;
- goto done;
+ return ERROR;
}
// read remainder of the rid
- rc = bap_read(ai, ((u16*)pBuf)+1, len, BAP1);
-done:
- if (dolock)
- up(&ai->sem);
- return rc;
+ return bap_read(ai, ((u16*)pBuf)+1, len, BAP1);
}

+static int PC4500_readrid(struct airo_info *ai, u16 rid, void *pBuf, int len)
+{
+ unsigned long flags;
+ int rc;
+
+ spin_lock_irqsave(&ai->main_lock, flags);
+ rc = PC4500_readrid_nolock(ai, rid, pBuf, len);
+ spin_unlock_irqrestore(&ai->main_lock, flags);
+ return rc;
+}
/* Note, that we are using BAP1 which is also used by transmit, so
* make sure this isnt called when a transmit is happening */
static int PC4500_writerid(struct airo_info *ai, u16 rid,
const void *pBuf, int len)
{
- u16 status, dolock = 0;
+ u16 status;
+ unsigned long flags;
int rc = SUCCESS;

- if (test_bit(FLAG_LOCKED, &ai->flags) == 0) {
- dolock = 1;
- if (down_interruptible(&ai->sem))
- return ERROR;
- }
+ *(u16*)pBuf = cpu_to_le16((u16)len);
+
+ spin_lock_irqsave(&ai->main_lock, flags);
// --- first access so that we can write the rid data
if ( (status = PC4500_accessrid(ai, rid, CMD_ACCESS)) != 0) {
rc = status;
@@ -2636,9 +2554,8 @@
bap_write(ai, pBuf, len, BAP1);
// ---now commit the rid data
rc = PC4500_accessrid(ai, rid, 0x100|CMD_ACCESS);
- done:
- if (dolock)
- up(&ai->sem);
+done:
+ spin_unlock_irqrestore(&ai->main_lock, flags);
return rc;
}

@@ -2646,6 +2563,8 @@
one for now. */
static u16 transmit_allocate(struct airo_info *ai, int lenPayload, int raw)
{
+ unsigned long flags;
+ unsigned int loop = 3000;
Cmd cmd;
Resp rsp;
u16 txFid;
@@ -2653,20 +2572,25 @@

cmd.cmd = CMD_ALLOCATETX;
cmd.parm0 = lenPayload;
- if (down_interruptible(&ai->sem))
- return ERROR;
- if (issuecommand(ai, &cmd, &rsp) != SUCCESS) {
- txFid = 0;
+ spin_lock_irqsave(&ai->main_lock, flags);
+ if (issuecommand_nolock(ai, &cmd, &rsp) != SUCCESS) {
+ txFid = ERROR;
goto done;
}
if ( (rsp.status & 0xFF00) != 0) {
- txFid = 0;
+ txFid = ERROR;
goto done;
}
+
/* wait for the allocate event/indication
- * It makes me kind of nervous that this can just sit here and spin,
- * but in practice it only loops like four times. */
- while ( (IN4500(ai, EVSTAT) & EV_ALLOC) == 0) ;
+ * in practice it only loops like four times. */
+ while (((IN4500(ai, EVSTAT) & EV_ALLOC) == 0) && --loop)
+ ; /* nada */
+ if (!loop) {
+ txFid = ERROR;
+ goto done;
+ }
+
// get the allocated fid and acknowledge
txFid = IN4500(ai, TXALLOCFID);
OUT4500(ai, EVACK, EV_ALLOC);
@@ -2688,7 +2612,7 @@
bap_write(ai, &txControl, sizeof(txControl), BAP1);

done:
- up(&ai->sem);
+ spin_unlock_irqrestore(&ai->main_lock, flags);

return txFid;
}
@@ -2696,17 +2620,14 @@
/* In general BAP1 is dedicated to transmiting packets. However,
since we need a BAP when accessing RIDs, we also use BAP1 for that.
Make sure the BAP1 spinlock is held when this is called. */
-static int transmit_802_3_packet(struct airo_info *ai, int len, char *pPacket)
+static int transmit_802_3_packet(struct airo_info *ai, u16 txFid, char *pPacket, int len)
{
u16 payloadLen;
Cmd cmd;
Resp rsp;
int miclen = 0;
- u16 txFid = len;
MICBuffer pMic;

- len >>= 16;
-
if (len < ETH_ALEN * 2) {
printk( KERN_WARNING "Short packet %d\n", len );
return ERROR;
@@ -2737,12 +2658,12 @@
memset( &cmd, 0, sizeof( cmd ) );
cmd.cmd = CMD_TRANSMIT;
cmd.parm0 = txFid;
- if (issuecommand(ai, &cmd, &rsp) != SUCCESS) return ERROR;
+ if (issuecommand_nolock(ai, &cmd, &rsp) != SUCCESS) return ERROR;
if ( (rsp.status & 0xFF00) != 0) return ERROR;
return SUCCESS;
}

-static int transmit_802_11_packet(struct airo_info *ai, int len, char *pPacket)
+static int transmit_802_11_packet(struct airo_info *ai, u16 txFid, char *pPacket, int len)
{
u16 fc, payloadLen;
Cmd cmd;
@@ -2753,8 +2674,6 @@
u16 gaplen;
u8 gap[6];
} gap;
- u16 txFid = len;
- len >>= 16;
gap.gaplen = 6;

fc = le16_to_cpu(*(const u16*)pPacket);
@@ -2796,7 +2715,7 @@
memset( &cmd, 0, sizeof( cmd ) );
cmd.cmd = CMD_TRANSMIT;
cmd.parm0 = txFid;
- if (issuecommand(ai, &cmd, &rsp) != SUCCESS) return ERROR;
+ if (issuecommand_nolock(ai, &cmd, &rsp) != SUCCESS) return ERROR;
if ( (rsp.status & 0xFF00) != 0) return ERROR;
return SUCCESS;
}
@@ -3865,10 +3784,7 @@

memset(&cmd, 0, sizeof(cmd));
cmd.cmd=CMD_LISTBSS;
- if (down_interruptible(&ai->sem))
- return -ERESTARTSYS;
issuecommand(ai, &cmd, &rsp);
- up(&ai->sem);
data->readlen = 0;
return 0;
}
@@ -3932,13 +3848,6 @@

if (!(apriv->flags & FLAG_FLASHING) && (linkstat != 0x400)) {
/* We don't have a link so try changing the authtype */
- if (down_trylock(&apriv->sem) != 0) {
- apriv->timer.expires = RUN_AT(1);
- add_timer(&apriv->timer);
- return;
- }
- __set_bit(FLAG_LOCKED, &apriv->flags);
-
readConfigRid(apriv);
disable_MAC(apriv);
switch(apriv->config.authType) {
@@ -3964,8 +3873,6 @@
apriv->need_commit = 1;
writeConfigRid(apriv);
enable_MAC(apriv, &rsp);
- clear_bit(FLAG_LOCKED, &apriv->flags);
- up(&apriv->sem);

/* Schedule check to see if the change worked */
apriv->timer.expires = RUN_AT(HZ*3);
@@ -4144,7 +4051,11 @@
struct airo_info *local = dev->priv;
StatusRid status_rid; /* Card status info */

- readStatusRid(local, &status_rid);
+ if (local->config.opmode & MODE_STA_ESS)
+ status_rid.channel = local->config.channelSet;
+ else
+ readStatusRid(local, &status_rid);
+

/* Will return zero in infrastructure mode */
#ifdef WEXT_USECHANNELS
@@ -4255,11 +4166,8 @@
return -EINVAL;
else if (!memcmp(bcast, awrq->sa_data, ETH_ALEN)) {
memset(&cmd, 0, sizeof(cmd));
- cmd.cmd=CMD_LOSE_SYNC;
- if (down_interruptible(&local->sem))
- return -ERESTARTSYS;
+ cmd.cmd = CMD_LOSE_SYNC;
issuecommand(local, &cmd, &rsp);
- up(&local->sem);
} else {
memset(&APList_rid, 0, sizeof(APList_rid));
APList_rid.len = sizeof(APList_rid);
@@ -5141,11 +5049,8 @@
/* Initiate a scan command */
memset(&cmd, 0, sizeof(cmd));
cmd.cmd=CMD_LISTBSS;
- if (down_interruptible(&ai->sem))
- return -ERESTARTSYS;
issuecommand(ai, &cmd, &rsp);
ai->scan_timestamp = jiffies;
- up(&ai->sem);

/* At this point, just return to the user. */



2003-07-21 11:52:39

by Javier Achirica

[permalink] [raw]
Subject: Re: [PATCH 2.5] fixes for airo.c


Daniel,

Thank you for your patch. Some comments about it:

- I'd rather fix whatever is broken in the current code than going back to
spinlocks, as they increase latency and reduce concurrency. In any case,
please check your code. I've seen a spinlock in the interrupt handler that
may lock the system.
- The fix for the transmit code you mention, is about fixing the returned
value in case of error? If not, please explain it to me as I don't see any
other changes.
- Where did you fix a buffer overflow?

I submitted to Jeff an updated version just before you sent your e-mail.
It incorporates most of your fixes expect for the possible loop-forever.
It's more stable that the one in the current kernel tree.

Javier Achirica

On Fri, 18 Jul 2003, Daniel Ritz wrote:

> in 2.4.20+ airo.c is broken and can even kill keventd. this patch fixes it:
> - sane locking with spinlocks and irqs disabled instead of the buggy locking
> with semaphores
> - fix transmit code
> - safer unload ordering of disable irq, unregister_netdev, kfree
> - fix possible loop-forever bug
> - fix a buffer overflow
>
> a kernel 2.4 version of the patch is tested by some people with good results.
> against 2.6.0-test1-bk. please apply.
>
>
> rgds
> -daniel

2003-07-21 12:24:35

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 2.5] fixes for airo.c

On Mon, Jul 21, 2003 at 01:00:54PM +0200, Javier Achirica wrote:
>
> Daniel,
>
> Thank you for your patch. Some comments about it:
>
> - I'd rather fix whatever is broken in the current code than going back to
> spinlocks, as they increase latency and reduce concurrency. In any case,
> please check your code.

In general we prefer spinlocks in linux for drivers unless there's a
very good reason against it. If you have latency or concurrency problems
it seems you have problems with your algorithms or the length of your
critical sections.

2003-07-21 13:32:18

by Javier Achirica

[permalink] [raw]
Subject: Re: [PATCH 2.5] fixes for airo.c



On Mon, 21 Jul 2003, Christoph Hellwig wrote:

> On Mon, Jul 21, 2003 at 01:00:54PM +0200, Javier Achirica wrote:
> >
> > Daniel,
> >
> > Thank you for your patch. Some comments about it:
> >
> > - I'd rather fix whatever is broken in the current code than going back to
> > spinlocks, as they increase latency and reduce concurrency. In any case,
> > please check your code.
>
> In general we prefer spinlocks in linux for drivers unless there's a
> very good reason against it. If you have latency or concurrency problems
> it seems you have problems with your algorithms or the length of your
> critical sections.

I don't think it is a problem with the algorithms. It is, of course, with
the length of the critical sections, due to the fact that the Aironet
firmware needs serialized commands and some of them take a long time to
execute. To ensure serialization, a spinlock was used, but in that case it
could be hold for hundreds of milliseconds in some cases (waiting for the
command to finish).

If you have any suggestion about how to better deal with that issue, I'd
be happy to hear it.

Javier Achirica

2003-07-21 15:01:38

by Mike Kershaw

[permalink] [raw]
Subject: Re: [PATCH 2.5] fixes for airo.c

Grargh. At work all weekend. Didn't have time to make a real patch.

Anyhow - Proposed changes to airo.c for rfmonitor mode. I've been working
on making it quiet (not continually probing) and on making it enter "true"
rfmon mode (right now it doesn't, which is why user-controlled channel
hopping doesn't work). Both of these are "special case" stuff that doesn't
affect normal users, but are near and dear to my own pursuits. :P

I've succeeded in the first, but not the second, so I hadn't released any
changed driver code. The first is actually a very simple change - in the
code block that puts the driver into rfmon mode (around line 2480 on my
code) after setting:
config.rmode = RXMODE_RFMON | RXMODE_DISABLE_802_3_HEADER;
or
config.rmode = RXMODE_RFMON_ANYBSS | RXMODE_DISABLE_802_3_HEADER;

it should set:
config.scanMode = SCANMODE_PASSIVE;

and in the code block exiting rfmon mode, after:
config.opmode = 0;
it should set:
config.scanMode = SCANMODE_ACTIVE;

With my testing this stops the probing in rfmon (good) and doesn't have any
negative impacts. More testing is, of course, a good idea.

I can try to come up with an exact diff but I figured I should get something
out while everyone is discussing changes, and I don't know how much time I'm
going to have in the coming weeks.

-m

--
I like my coffee like I like my friends -- Dark, and bitter.


Attachments:
(No filename) (1.34 kB)
(No filename) (189.00 B)
Download all attachments

2003-07-21 17:35:17

by Daniel Ritz

[permalink] [raw]
Subject: Re: [PATCH 2.5] fixes for airo.c

On Mon July 21 2003 13:00, Javier Achirica wrote:
>
> Daniel,
>
> Thank you for your patch. Some comments about it:
>
> - I'd rather fix whatever is broken in the current code than going back to
> spinlocks, as they increase latency and reduce concurrency. In any case,
> please check your code. I've seen a spinlock in the interrupt handler that
> may lock the system.

but we need to protect from interrupts while accessing the card and waiting for
completion. semaphores don't protect you from that. spin_lock_irqsave does. the
spin_lock in the interrupt handler is there to protect from interrupts from
other processors in a SMP system (see Documentation/spinlocks.txt) and is btw.
a no-op on UP. and semaphores are quite heavy....

> - The fix for the transmit code you mention, is about fixing the returned
> value in case of error? If not, please explain it to me as I don't see any
> other changes.

fixes:
- return values
- when to free the skb, when not
- disabling the queues
- netif_wake_queue called from the interrupt handler only (and on the right
net_device)
- i think the priv->xmit stuff and then the schedule_work is evil:
if you return 0 from the dev->hard_start_xmit then the network layer assumes
that the packet was kfree_skb()'ed (which does only frees the packet when the
refcount drops to zero.) this is the cause for the keventd killing, for sure!

if you return 0 you already kfree_skb()'ed the packet. and that's it.


> - Where did you fix a buffer overflow?

- for(s = &statr->beaconPeriod; s <= &statr->_reserved[9]; s++)
+ for(s = &statr->beaconPeriod; s <= &statr->_reserved1; s++)

...which you also fixed in your patchset. (which is harmless on little-endian
machines and evil on big-endian machines)


rgds
-daniel

>
> I submitted to Jeff an updated version just before you sent your e-mail.
> It incorporates most of your fixes expect for the possible loop-forever.
> It's more stable that the one in the current kernel tree.
>
> Javier Achirica
>
> On Fri, 18 Jul 2003, Daniel Ritz wrote:
>
> > in 2.4.20+ airo.c is broken and can even kill keventd. this patch fixes it:
> > - sane locking with spinlocks and irqs disabled instead of the buggy locking
> > with semaphores
> > - fix transmit code
> > - safer unload ordering of disable irq, unregister_netdev, kfree
> > - fix possible loop-forever bug
> > - fix a buffer overflow
> >
> > a kernel 2.4 version of the patch is tested by some people with good results.
> > against 2.6.0-test1-bk. please apply.
> >
> >
> > rgds
> > -daniel
>
>

2003-07-21 18:42:18

by Javier Achirica

[permalink] [raw]
Subject: Re: [PATCH 2.5] fixes for airo.c


Understood. I'm checking that change in my code and I'll update it in the
CVS (before commiting to the kernel tree).B

On Mon, 21 Jul 2003, Mike Kershaw wrote:

> Grargh. At work all weekend. Didn't have time to make a real patch.
>
> Anyhow - Proposed changes to airo.c for rfmonitor mode. I've been working
> on making it quiet (not continually probing) and on making it enter "true"
> rfmon mode (right now it doesn't, which is why user-controlled channel
> hopping doesn't work). Both of these are "special case" stuff that doesn't
> affect normal users, but are near and dear to my own pursuits. :P
>
> I've succeeded in the first, but not the second, so I hadn't released any
> changed driver code. The first is actually a very simple change - in the
> code block that puts the driver into rfmon mode (around line 2480 on my
> code) after setting:
> config.rmode = RXMODE_RFMON | RXMODE_DISABLE_802_3_HEADER;
> or
> config.rmode = RXMODE_RFMON_ANYBSS | RXMODE_DISABLE_802_3_HEADER;
>
> it should set:
> config.scanMode = SCANMODE_PASSIVE;
>
> and in the code block exiting rfmon mode, after:
> config.opmode = 0;
> it should set:
> config.scanMode = SCANMODE_ACTIVE;
>
> With my testing this stops the probing in rfmon (good) and doesn't have any
> negative impacts. More testing is, of course, a good idea.
>
> I can try to come up with an exact diff but I figured I should get something
> out while everyone is discussing changes, and I don't know how much time I'm
> going to have in the coming weeks.
>
> -m
>
> --
> I like my coffee like I like my friends -- Dark, and bitter.
>

2003-07-21 19:29:58

by Javier Achirica

[permalink] [raw]
Subject: Re: [PATCH 2.5] fixes for airo.c



On Mon, 21 Jul 2003, Daniel Ritz wrote:

> On Mon July 21 2003 13:00, Javier Achirica wrote:
> >
> > Daniel,
> >
> > Thank you for your patch. Some comments about it:
> >
> > - I'd rather fix whatever is broken in the current code than going back to
> > spinlocks, as they increase latency and reduce concurrency. In any case,
> > please check your code. I've seen a spinlock in the interrupt handler that
> > may lock the system.
>
> but we need to protect from interrupts while accessing the card and waiting for
> completion. semaphores don't protect you from that. spin_lock_irqsave does. the
> spin_lock in the interrupt handler is there to protect from interrupts from
> other processors in a SMP system (see Documentation/spinlocks.txt) and is btw.
> a no-op on UP. and semaphores are quite heavy....

Not really. You can still read the received packets from the card (as
you're not issuing any command and are using the other BAP) while a
command is in progress. There are some specific cases in which you need
to have protection, and that cases are avoided with the down_trylock.

AFAIK, interrupt serialization is assured by the interrupt handler, so you
don't need to do that.

> > - The fix for the transmit code you mention, is about fixing the returned
> > value in case of error? If not, please explain it to me as I don't see any
> > other changes.
>
> fixes:
> - return values
> - when to free the skb, when not
> - disabling the queues
> - netif_wake_queue called from the interrupt handler only (and on the right
> net_device)
> - i think the priv->xmit stuff and then the schedule_work is evil:
> if you return 0 from the dev->hard_start_xmit then the network layer assumes
> that the packet was kfree_skb()'ed (which does only frees the packet when the
> refcount drops to zero.) this is the cause for the keventd killing, for sure!
>
> if you return 0 you already kfree_skb()'ed the packet. and that's it.

This is where I have the biggest problems. As I've read in
Documentation/networking/driver.txt, looks like the packet needs to be
freed "soon", but doesn't require to be before returning 0 in
hard_start_xmit. Did I get it wrong?

Thanks for your help,
Javier Achirica


2003-07-21 20:46:38

by Daniel Ritz

[permalink] [raw]
Subject: Re: [PATCH 2.5] fixes for airo.c

On Mon July 21 2003 21:44, Javier Achirica wrote:
>
> On Mon, 21 Jul 2003, Daniel Ritz wrote:
>
> > On Mon July 21 2003 13:00, Javier Achirica wrote:
> > >
> > > Daniel,
> > >
> > > Thank you for your patch. Some comments about it:
> > >
> > > - I'd rather fix whatever is broken in the current code than going back to
> > > spinlocks, as they increase latency and reduce concurrency. In any case,
> > > please check your code. I've seen a spinlock in the interrupt handler that
> > > may lock the system.
> >
> > but we need to protect from interrupts while accessing the card and waiting for
> > completion. semaphores don't protect you from that. spin_lock_irqsave does. the
> > spin_lock in the interrupt handler is there to protect from interrupts from
> > other processors in a SMP system (see Documentation/spinlocks.txt) and is btw.
> > a no-op on UP. and semaphores are quite heavy....
>
> Not really. You can still read the received packets from the card (as
> you're not issuing any command and are using the other BAP) while a
> command is in progress. There are some specific cases in which you need
> to have protection, and that cases are avoided with the down_trylock.
>

ok, i think i have to look closer...if the card can handle that then we don't need
to irq-protect all the areas i did protect...but i do think that those down_trylock and
then the schedule_work should be replaced by a simple spinlock_irq_save...

i look closer at it tomorrow.
you happen to have the tech spec lying aroung?

> AFAIK, interrupt serialization is assured by the interrupt handler, so you
> don't need to do that.
>
> > > - The fix for the transmit code you mention, is about fixing the returned
> > > value in case of error? If not, please explain it to me as I don't see any
> > > other changes.
> >
> > fixes:
> > - return values
> > - when to free the skb, when not
> > - disabling the queues
> > - netif_wake_queue called from the interrupt handler only (and on the right
> > net_device)
> > - i think the priv->xmit stuff and then the schedule_work is evil:
> > if you return 0 from the dev->hard_start_xmit then the network layer assumes
> > that the packet was kfree_skb()'ed (which does only frees the packet when the
> > refcount drops to zero.) this is the cause for the keventd killing, for sure!
> >
> > if you return 0 you already kfree_skb()'ed the packet. and that's it.
>
> This is where I have the biggest problems. As I've read in
> Documentation/networking/driver.txt, looks like the packet needs to be
> freed "soon", but doesn't require to be before returning 0 in
> hard_start_xmit. Did I get it wrong?
>

no, i got it wrong. but still...it's the xmit where the oops comes from....

wait. isn't there a race in airo_do_xmit? at high xfer rates (when it oopses) the
queue can wake right after it is stopped in the down_trylock section. so you can
happen to loose an skb 'cos the write to priv->xmit is not protected at all and
there should be a check so that only one skb can be queue there. no?
(and then the irq-handler can wake the queue too)

ok, i think i got it now. i'll do a new patch tomorrow or so that tries:
- to fix the transmit not to oops
- to avoid disabling the irq's whenever possible
- using spinlocks instead of the heavier semaphores ('cos i think if it's done cleaner
than i did it now, it's faster than the semas, and to make hch happy :)


> Thanks for your help,
> Javier Achirica
>

rgds
-daniel

2003-07-21 21:10:18

by Javier Achirica

[permalink] [raw]
Subject: Re: [PATCH 2.5] fixes for airo.c



On Mon, 21 Jul 2003, Daniel Ritz wrote:

> On Mon July 21 2003 21:44, Javier Achirica wrote:
> >
> > On Mon, 21 Jul 2003, Daniel Ritz wrote:
> >
> > > On Mon July 21 2003 13:00, Javier Achirica wrote:
> > > >
> > > > Daniel,
> > > >
> > > > Thank you for your patch. Some comments about it:
> > > >
> > > > - I'd rather fix whatever is broken in the current code than going back to
> > > > spinlocks, as they increase latency and reduce concurrency. In any case,
> > > > please check your code. I've seen a spinlock in the interrupt handler that
> > > > may lock the system.
> > >
> > > but we need to protect from interrupts while accessing the card and waiting for
> > > completion. semaphores don't protect you from that. spin_lock_irqsave does. the
> > > spin_lock in the interrupt handler is there to protect from interrupts from
> > > other processors in a SMP system (see Documentation/spinlocks.txt) and is btw.
> > > a no-op on UP. and semaphores are quite heavy....
> >
> > Not really. You can still read the received packets from the card (as
> > you're not issuing any command and are using the other BAP) while a
> > command is in progress. There are some specific cases in which you need
> > to have protection, and that cases are avoided with the down_trylock.
> >
>
> ok, i think i have to look closer...if the card can handle that then we don't need
> to irq-protect all the areas i did protect...but i do think that those down_trylock and
> then the schedule_work should be replaced by a simple spinlock_irq_save...
>
> i look closer at it tomorrow.
> you happen to have the tech spec lying aroung?

I have an old one, but I don't think that I'm allowed (by Cisco) to pass
it around.

> > AFAIK, interrupt serialization is assured by the interrupt handler, so you
> > don't need to do that.
> >
> > > > - The fix for the transmit code you mention, is about fixing the returned
> > > > value in case of error? If not, please explain it to me as I don't see any
> > > > other changes.
> > >
> > > fixes:
> > > - return values
> > > - when to free the skb, when not
> > > - disabling the queues
> > > - netif_wake_queue called from the interrupt handler only (and on the right
> > > net_device)
> > > - i think the priv->xmit stuff and then the schedule_work is evil:
> > > if you return 0 from the dev->hard_start_xmit then the network layer assumes
> > > that the packet was kfree_skb()'ed (which does only frees the packet when the
> > > refcount drops to zero.) this is the cause for the keventd killing, for sure!
> > >
> > > if you return 0 you already kfree_skb()'ed the packet. and that's it.
> >
> > This is where I have the biggest problems. As I've read in
> > Documentation/networking/driver.txt, looks like the packet needs to be
> > freed "soon", but doesn't require to be before returning 0 in
> > hard_start_xmit. Did I get it wrong?
> >
>
> no, i got it wrong. but still...it's the xmit where the oops comes from....
>
> wait. isn't there a race in airo_do_xmit? at high xfer rates (when it oopses) the
> queue can wake right after it is stopped in the down_trylock section. so you can
> happen to loose an skb 'cos the write to priv->xmit is not protected at all and
> there should be a check so that only one skb can be queue there. no?
> (and then the irq-handler can wake the queue too)
>
> ok, i think i got it now. i'll do a new patch tomorrow or so that tries:
> - to fix the transmit not to oops
> - to avoid disabling the irq's whenever possible
> - using spinlocks instead of the heavier semaphores ('cos i think if it's done cleaner
> than i did it now, it's faster than the semas, and to make hch happy :)

Yes! This is the race that may be causing problems. I'll try to fix it and
we may compare semaphore vs. spinlock implementation.

Javier Achirica

2003-07-22 08:53:12

by Javier Achirica

[permalink] [raw]
Subject: Re: [PATCH 2.5] fixes for airo.c


Today I updated the CVS and Sourceforge (airo-linux.sf.net) with the
latest version (1.53) that (I hope) fixes the race problem. If everything
is fine, I'll commit the changes to the kernel tree.

Javier Achirica

On Mon, 21 Jul 2003, Daniel Ritz wrote:

> On Mon July 21 2003 21:44, Javier Achirica wrote:
> >
> > On Mon, 21 Jul 2003, Daniel Ritz wrote:
> >
> > > On Mon July 21 2003 13:00, Javier Achirica wrote:
> > > >
> > > > Daniel,
> > > >
> > > > Thank you for your patch. Some comments about it:
> > > >
> > > > - I'd rather fix whatever is broken in the current code than going back to
> > > > spinlocks, as they increase latency and reduce concurrency. In any case,
> > > > please check your code. I've seen a spinlock in the interrupt handler that
> > > > may lock the system.
> > >
> > > but we need to protect from interrupts while accessing the card and waiting for
> > > completion. semaphores don't protect you from that. spin_lock_irqsave does. the
> > > spin_lock in the interrupt handler is there to protect from interrupts from
> > > other processors in a SMP system (see Documentation/spinlocks.txt) and is btw.
> > > a no-op on UP. and semaphores are quite heavy....
> >
> > Not really. You can still read the received packets from the card (as
> > you're not issuing any command and are using the other BAP) while a
> > command is in progress. There are some specific cases in which you need
> > to have protection, and that cases are avoided with the down_trylock.
> >
>
> ok, i think i have to look closer...if the card can handle that then we don't need
> to irq-protect all the areas i did protect...but i do think that those down_trylock and
> then the schedule_work should be replaced by a simple spinlock_irq_save...
>
> i look closer at it tomorrow.
> you happen to have the tech spec lying aroung?
>
> > AFAIK, interrupt serialization is assured by the interrupt handler, so you
> > don't need to do that.
> >
> > > > - The fix for the transmit code you mention, is about fixing the returned
> > > > value in case of error? If not, please explain it to me as I don't see any
> > > > other changes.
> > >
> > > fixes:
> > > - return values
> > > - when to free the skb, when not
> > > - disabling the queues
> > > - netif_wake_queue called from the interrupt handler only (and on the right
> > > net_device)
> > > - i think the priv->xmit stuff and then the schedule_work is evil:
> > > if you return 0 from the dev->hard_start_xmit then the network layer assumes
> > > that the packet was kfree_skb()'ed (which does only frees the packet when the
> > > refcount drops to zero.) this is the cause for the keventd killing, for sure!
> > >
> > > if you return 0 you already kfree_skb()'ed the packet. and that's it.
> >
> > This is where I have the biggest problems. As I've read in
> > Documentation/networking/driver.txt, looks like the packet needs to be
> > freed "soon", but doesn't require to be before returning 0 in
> > hard_start_xmit. Did I get it wrong?
> >
>
> no, i got it wrong. but still...it's the xmit where the oops comes from....
>
> wait. isn't there a race in airo_do_xmit? at high xfer rates (when it oopses) the
> queue can wake right after it is stopped in the down_trylock section. so you can
> happen to loose an skb 'cos the write to priv->xmit is not protected at all and
> there should be a check so that only one skb can be queue there. no?
> (and then the irq-handler can wake the queue too)
>
> ok, i think i got it now. i'll do a new patch tomorrow or so that tries:
> - to fix the transmit not to oops
> - to avoid disabling the irq's whenever possible
> - using spinlocks instead of the heavier semaphores ('cos i think if it's done cleaner
> than i did it now, it's faster than the semas, and to make hch happy :)
>
>
> > Thanks for your help,
> > Javier Achirica
> >
>
> rgds
> -daniel
>
>
>