The netif_stop_queue() call in sonic_send_packet() races with the
netif_wake_queue() call in sonic_interrupt(). This causes issues
like "NETDEV WATCHDOG: eth0 (macsonic): transmit queue 0 timed out".
Fix this by disabling interrupts when accessing tx_skb[] and next_tx.
Update a comment to clarify the synchronization properties.
Fixes: efcce839360f ("[PATCH] macsonic/jazzsonic network drivers update")
Tested-by: Stan Johnson <[email protected]>
Signed-off-by: Finn Thain <[email protected]>
---
Changed since v1:
- Added spin lock, as requested by Geert.
---
drivers/net/ethernet/natsemi/sonic.c | 44 +++++++++++++++++++---------
drivers/net/ethernet/natsemi/sonic.h | 1 +
2 files changed, 31 insertions(+), 14 deletions(-)
diff --git a/drivers/net/ethernet/natsemi/sonic.c b/drivers/net/ethernet/natsemi/sonic.c
index b339125b2f09..9e99ba57f86a 100644
--- a/drivers/net/ethernet/natsemi/sonic.c
+++ b/drivers/net/ethernet/natsemi/sonic.c
@@ -64,6 +64,8 @@ static int sonic_open(struct net_device *dev)
netif_dbg(lp, ifup, dev, "%s: initializing sonic driver\n", __func__);
+ spin_lock_init(&lp->lock);
+
for (i = 0; i < SONIC_NUM_RRS; i++) {
struct sk_buff *skb = netdev_alloc_skb(dev, SONIC_RBSIZE + 2);
if (skb == NULL) {
@@ -206,8 +208,6 @@ static void sonic_tx_timeout(struct net_device *dev)
* wake the tx queue
* Concurrently with all of this, the SONIC is potentially writing to
* the status flags of the TDs.
- * Until some mutual exclusion is added, this code will not work with SMP. However,
- * MIPS Jazz machines and m68k Macs were all uni-processor machines.
*/
static int sonic_send_packet(struct sk_buff *skb, struct net_device *dev)
@@ -215,7 +215,8 @@ static int sonic_send_packet(struct sk_buff *skb, struct net_device *dev)
struct sonic_local *lp = netdev_priv(dev);
dma_addr_t laddr;
int length;
- int entry = lp->next_tx;
+ int entry;
+ unsigned long flags;
netif_dbg(lp, tx_queued, dev, "%s: skb=%p\n", __func__, skb);
@@ -237,6 +238,10 @@ static int sonic_send_packet(struct sk_buff *skb, struct net_device *dev)
return NETDEV_TX_OK;
}
+ spin_lock_irqsave(&lp->lock, flags);
+
+ entry = lp->next_tx;
+
sonic_tda_put(dev, entry, SONIC_TD_STATUS, 0); /* clear status */
sonic_tda_put(dev, entry, SONIC_TD_FRAG_COUNT, 1); /* single fragment */
sonic_tda_put(dev, entry, SONIC_TD_PKTSIZE, length); /* length of packet */
@@ -246,10 +251,6 @@ static int sonic_send_packet(struct sk_buff *skb, struct net_device *dev)
sonic_tda_put(dev, entry, SONIC_TD_LINK,
sonic_tda_get(dev, entry, SONIC_TD_LINK) | SONIC_EOL);
- /*
- * Must set tx_skb[entry] only after clearing status, and
- * before clearing EOL and before stopping queue
- */
wmb();
lp->tx_len[entry] = length;
lp->tx_laddr[entry] = laddr;
@@ -272,6 +273,8 @@ static int sonic_send_packet(struct sk_buff *skb, struct net_device *dev)
SONIC_WRITE(SONIC_CMD, SONIC_CR_TXP);
+ spin_unlock_irqrestore(&lp->lock, flags);
+
return NETDEV_TX_OK;
}
@@ -284,9 +287,16 @@ static irqreturn_t sonic_interrupt(int irq, void *dev_id)
struct net_device *dev = dev_id;
struct sonic_local *lp = netdev_priv(dev);
int status;
+ unsigned long flags;
+
+ spin_lock_irqsave(&lp->lock, flags);
+
+ status = SONIC_READ(SONIC_ISR) & SONIC_IMR_DEFAULT;
+ if (!status) {
+ spin_unlock_irqrestore(&lp->lock, flags);
- if (!(status = SONIC_READ(SONIC_ISR) & SONIC_IMR_DEFAULT))
return IRQ_NONE;
+ }
do {
if (status & SONIC_INT_PKTRX) {
@@ -300,11 +310,12 @@ static irqreturn_t sonic_interrupt(int irq, void *dev_id)
int td_status;
int freed_some = 0;
- /* At this point, cur_tx is the index of a TD that is one of:
- * unallocated/freed (status set & tx_skb[entry] clear)
- * allocated and sent (status set & tx_skb[entry] set )
- * allocated and not yet sent (status clear & tx_skb[entry] set )
- * still being allocated by sonic_send_packet (status clear & tx_skb[entry] clear)
+ /* The state of a Transmit Descriptor may be inferred
+ * from { tx_skb[entry], td_status } as follows.
+ * { clear, clear } => the TD has never been used
+ * { set, clear } => the TD was handed to SONIC
+ * { set, set } => the TD was handed back
+ * { clear, set } => the TD is available for re-use
*/
netif_dbg(lp, intr, dev, "%s: tx done\n", __func__);
@@ -406,7 +417,12 @@ static irqreturn_t sonic_interrupt(int irq, void *dev_id)
/* load CAM done */
if (status & SONIC_INT_LCD)
SONIC_WRITE(SONIC_ISR, SONIC_INT_LCD); /* clear the interrupt */
- } while((status = SONIC_READ(SONIC_ISR) & SONIC_IMR_DEFAULT));
+
+ status = SONIC_READ(SONIC_ISR) & SONIC_IMR_DEFAULT;
+ } while (status);
+
+ spin_unlock_irqrestore(&lp->lock, flags);
+
return IRQ_HANDLED;
}
diff --git a/drivers/net/ethernet/natsemi/sonic.h b/drivers/net/ethernet/natsemi/sonic.h
index 2b27f7049acb..f9506863e9d1 100644
--- a/drivers/net/ethernet/natsemi/sonic.h
+++ b/drivers/net/ethernet/natsemi/sonic.h
@@ -322,6 +322,7 @@ struct sonic_local {
int msg_enable;
struct device *device; /* generic device */
struct net_device_stats stats;
+ spinlock_t lock;
};
#define TX_TIMEOUT (3 * HZ)
--
2.24.1
On 1/21/20 1:22 PM, Finn Thain wrote:
> The netif_stop_queue() call in sonic_send_packet() races with the
> netif_wake_queue() call in sonic_interrupt(). This causes issues
> like "NETDEV WATCHDOG: eth0 (macsonic): transmit queue 0 timed out".
> Fix this by disabling interrupts when accessing tx_skb[] and next_tx.
> Update a comment to clarify the synchronization properties.
>
> Fixes: efcce839360f ("[PATCH] macsonic/jazzsonic network drivers update")
> Tested-by: Stan Johnson <[email protected]>
> Signed-off-by: Finn Thain <[email protected]>
> @@ -284,9 +287,16 @@ static irqreturn_t sonic_interrupt(int irq, void *dev_id)
> struct net_device *dev = dev_id;
> struct sonic_local *lp = netdev_priv(dev);
> int status;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&lp->lock, flags);
This is a hard irq handler, no need to block hard irqs.
spin_lock() here is enough.
> +
> + status = SONIC_READ(SONIC_ISR) & SONIC_IMR_DEFAULT;
> + if (!status) {
> + spin_unlock_irqrestore(&lp->lock, flags);
>
> - if (!(status = SONIC_READ(SONIC_ISR) & SONIC_IMR_DEFAULT))
> return IRQ_NONE;
> + }
>
> do {
> if (status & SONIC_INT_PKTRX) {
> @@ -300,11 +310,12 @@ static irqreturn_t sonic_interrupt(int irq, void *dev_id)
> int td_status;
> int freed_some = 0;
>
> - /* At this point, cur_tx is the index of a TD that is one of:
> - * unallocated/freed (status set & tx_skb[entry] clear)
> - * allocated and sent (status set & tx_skb[entry] set )
> - * allocated and not yet sent (status clear & tx_skb[entry] set )
> - * still being allocated by sonic_send_packet (status clear & tx_skb[entry] clear)
> + /* The state of a Transmit Descriptor may be inferred
> + * from { tx_skb[entry], td_status } as follows.
> + * { clear, clear } => the TD has never been used
> + * { set, clear } => the TD was handed to SONIC
> + * { set, set } => the TD was handed back
> + * { clear, set } => the TD is available for re-use
> */
>
> netif_dbg(lp, intr, dev, "%s: tx done\n", __func__);
> @@ -406,7 +417,12 @@ static irqreturn_t sonic_interrupt(int irq, void *dev_id)
> /* load CAM done */
> if (status & SONIC_INT_LCD)
> SONIC_WRITE(SONIC_ISR, SONIC_INT_LCD); /* clear the interrupt */
> - } while((status = SONIC_READ(SONIC_ISR) & SONIC_IMR_DEFAULT));
> +
> + status = SONIC_READ(SONIC_ISR) & SONIC_IMR_DEFAULT;
> + } while (status);
> +
> + spin_unlock_irqrestore(&lp->lock, flags);
> +
> return IRQ_HANDLED;
On Tue, 21 Jan 2020, Eric Dumazet wrote:
> On 1/21/20 1:22 PM, Finn Thain wrote:
> > The netif_stop_queue() call in sonic_send_packet() races with the
> > netif_wake_queue() call in sonic_interrupt(). This causes issues
> > like "NETDEV WATCHDOG: eth0 (macsonic): transmit queue 0 timed out".
> > Fix this by disabling interrupts when accessing tx_skb[] and next_tx.
> > Update a comment to clarify the synchronization properties.
> >
> > Fixes: efcce839360f ("[PATCH] macsonic/jazzsonic network drivers update")
> > Tested-by: Stan Johnson <[email protected]>
> > Signed-off-by: Finn Thain <[email protected]>
>
> > @@ -284,9 +287,16 @@ static irqreturn_t sonic_interrupt(int irq, void *dev_id)
> > struct net_device *dev = dev_id;
> > struct sonic_local *lp = netdev_priv(dev);
> > int status;
> > + unsigned long flags;
> > +
> > + spin_lock_irqsave(&lp->lock, flags);
>
>
> This is a hard irq handler, no need to block hard irqs.
>
> spin_lock() here is enough.
>
Well, yes, assuming we're dealing with SMP [1]. Probably just disabling
pre-emption is all that will ever be needed.
Anyway, the real problem solved by disabling irqs is that macsonic must
avoid re-entrance of sonic_interrupt(). [2]
[1]
https://lore.kernel.org/netdev/[email protected]/T/#m0523c8b2a26a410ed56889d9230c37ba1160d40a
[2]
https://lore.kernel.org/netdev/[email protected]/T/#m1c8ca580d2b45e61a628d17839978d0bd5aaf061
On 1/21/20 3:33 PM, Finn Thain wrote:
> On Tue, 21 Jan 2020, Eric Dumazet wrote:
>
>> On 1/21/20 1:22 PM, Finn Thain wrote:
>>> The netif_stop_queue() call in sonic_send_packet() races with the
>>> netif_wake_queue() call in sonic_interrupt(). This causes issues
>>> like "NETDEV WATCHDOG: eth0 (macsonic): transmit queue 0 timed out".
>>> Fix this by disabling interrupts when accessing tx_skb[] and next_tx.
>>> Update a comment to clarify the synchronization properties.
>>>
>>> Fixes: efcce839360f ("[PATCH] macsonic/jazzsonic network drivers update")
>>> Tested-by: Stan Johnson <[email protected]>
>>> Signed-off-by: Finn Thain <[email protected]>
>>
>>> @@ -284,9 +287,16 @@ static irqreturn_t sonic_interrupt(int irq, void *dev_id)
>>> struct net_device *dev = dev_id;
>>> struct sonic_local *lp = netdev_priv(dev);
>>> int status;
>>> + unsigned long flags;
>>> +
>>> + spin_lock_irqsave(&lp->lock, flags);
>>
>>
>> This is a hard irq handler, no need to block hard irqs.
>>
>> spin_lock() here is enough.
>>
>
> Well, yes, assuming we're dealing with SMP [1]. Probably just disabling
> pre-emption is all that will ever be needed.
>
> Anyway, the real problem solved by disabling irqs is that macsonic must
> avoid re-entrance of sonic_interrupt(). [2]
>
> [1]
> https://lore.kernel.org/netdev/[email protected]/T/#m0523c8b2a26a410ed56889d9230c37ba1160d40a
>
> [2]
> https://lore.kernel.org/netdev/[email protected]/T/#m1c8ca580d2b45e61a628d17839978d0bd5aaf061
>
Oh well...
I would rather keep the m68k specific wrapper...
Please add a fat comment of why spin_lock_irqsave() is used,
to avoid a future 'cleanup', since average network developer
wont be aware of m68k oddities.
On Tue, 21 Jan 2020, Eric Dumazet wrote:
> On 1/21/20 3:33 PM, Finn Thain wrote:
> > On Tue, 21 Jan 2020, Eric Dumazet wrote:
> >
> >> On 1/21/20 1:22 PM, Finn Thain wrote:
> >>> The netif_stop_queue() call in sonic_send_packet() races with the
> >>> netif_wake_queue() call in sonic_interrupt(). This causes issues
> >>> like "NETDEV WATCHDOG: eth0 (macsonic): transmit queue 0 timed out".
> >>> Fix this by disabling interrupts when accessing tx_skb[] and next_tx.
> >>> Update a comment to clarify the synchronization properties.
> >>>
> >>> Fixes: efcce839360f ("[PATCH] macsonic/jazzsonic network drivers update")
> >>> Tested-by: Stan Johnson <[email protected]>
> >>> Signed-off-by: Finn Thain <[email protected]>
> >>
> >>> @@ -284,9 +287,16 @@ static irqreturn_t sonic_interrupt(int irq, void *dev_id)
> >>> struct net_device *dev = dev_id;
> >>> struct sonic_local *lp = netdev_priv(dev);
> >>> int status;
> >>> + unsigned long flags;
> >>> +
> >>> + spin_lock_irqsave(&lp->lock, flags);
> >>
> >>
> >> This is a hard irq handler, no need to block hard irqs.
> >>
> >> spin_lock() here is enough.
> >>
> >
> > Well, yes, assuming we're dealing with SMP [1]. Probably just disabling
> > pre-emption is all that will ever be needed.
> >
> > Anyway, the real problem solved by disabling irqs is that macsonic must
> > avoid re-entrance of sonic_interrupt(). [2]
> >
> > [1]
> > https://lore.kernel.org/netdev/[email protected]/T/#m0523c8b2a26a410ed56889d9230c37ba1160d40a
> >
> > [2]
> > https://lore.kernel.org/netdev/[email protected]/T/#m1c8ca580d2b45e61a628d17839978d0bd5aaf061
> >
>
I was overlooking the fact that sonic_send_packet() really does have to
exclude sonic_interrupt(). So disabling irqs is mandatory here, right?
> Oh well...
>
> I would rather keep the m68k specific wrapper...
>
If disabling irqs is unavoidable, the wrapper is redundant.
> Please add a fat comment of why spin_lock_irqsave() is used, to avoid a
> future 'cleanup', since average network developer wont be aware of m68k
> oddities.
>
OK.