From: qianfan Zhao <[email protected]>
emac_timeout() callback acquires lock and so does emac_init_device(),
which called inside lock protected region. This hangs the system and
produces following warning:
WARNING: CPU: 2 PID: 0 at net/sched/sch_generic.c:478
dev_watchdog+0x2e4/0x2e8
NETDEV WATCHDOG: FE0 (sun4i-emac): transmit queue 0 timed out
Modules linked in:
CPU: 2 PID: 0 Comm: swapper/2 Tainted: G W 5.15.0-00047-g0848e4aeb313
Hardware name: Wisdom T3 based CCT Family
[<c010f740>] (unwind_backtrace) from [<c010b744>] (show_stack+0x10/0x14)
[<c010b744>] (show_stack) from [<c0a4d978>] (dump_stack_lvl+0x40/0x4c)
[<c0a4d978>] (dump_stack_lvl) from [<c0120734>] (__warn+0x104/0x108)
[<c0120734>] (__warn) from [<c01207b0>] (warn_slowpath_fmt+0x78/0xbc)
[<c01207b0>] (warn_slowpath_fmt) from [<c0898a54>]
(dev_watchdog+0x2e4/0x2e8)
[<c0898a54>] (dev_watchdog) from [<c019a888>] (call_timer_fn+0x3c/0x178)
[<c019a888>] (call_timer_fn) from [<c019bddc>]
(run_timer_softirq+0x540/0x624)
[<c019bddc>] (run_timer_softirq) from [<c0101298>]
(__do_softirq+0x130/0x3bc)
[<c0101298>] (__do_softirq) from [<c0127ea0>] (irq_exit+0xbc/0x100)
[<c0127ea0>] (irq_exit) from [<c017e514>] (handle_domain_irq+0x60/0x78)
[<c017e514>] (handle_domain_irq) from [<c05a6130>]
(gic_handle_irq+0x7c/0x90)
[<c05a6130>] (gic_handle_irq) from [<c0100afc>] (__irq_svc+0x5c/0x78)
Exception stack(0xc14f3f70 to 0xc14f3fb8)
3f60: 0003475c 00000000 00000001
c01188a0
3f80: c107b200 c0f06b4c c0f06b90 00000004 c1079ff8 c0c62774 00000000
00000000
3fa0: c10b2198 c14f3fc0 c0107fcc c0107fd0 60030013 ffffffff
[<c0100afc>] (__irq_svc) from [<c0107fd0>] (arch_cpu_idle+0x38/0x3c)
[<c0107fd0>] (arch_cpu_idle) from [<c0a57f38>]
(default_idle_call+0x3c/0xcc)
[<c0a57f38>] (default_idle_call) from [<c0157258>] (do_idle+0xdc/0x13c)
[<c0157258>] (do_idle) from [<c01575a4>] (cpu_startup_entry+0x18/0x1c)
[<c01575a4>] (cpu_startup_entry) from [<401015d0>] (0x401015d0)
---[ end trace a70942a1265338f1 ]---
rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
rcu: \x092-...0: (1 GPs behind) idle=75d/0/0x1 softirq=8288/8289 fqs=931
\x09(detected by 0, t=2102 jiffies, g=13485, q=1635)
Sending NMI from CPU 0 to CPUs 2:
spi_master spi2: spi2.1: timeout transferring 4 bytes@100000Hz for
110(100)ms
spidev spi2.1: SPI transfer failed: -110
Fixes: 492205050d77b (net: Add EMAC ethernet driver found on Allwinner A10 SoC's)
Signed-off-by: qianfan Zhao <[email protected]>
---
drivers/net/ethernet/allwinner/sun4i-emac.c | 15 ++++++++++-----
1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/allwinner/sun4i-emac.c b/drivers/net/ethernet/allwinner/sun4i-emac.c
index 49759deeed8e..034989b396f7 100644
--- a/drivers/net/ethernet/allwinner/sun4i-emac.c
+++ b/drivers/net/ethernet/allwinner/sun4i-emac.c
@@ -378,14 +378,11 @@ static int emac_set_mac_address(struct net_device *dev, void *p)
}
/* Initialize emac board */
-static void emac_init_device(struct net_device *dev)
+static void emac_init_device_no_lock(struct net_device *dev)
{
struct emac_board_info *db = netdev_priv(dev);
- unsigned long flags;
unsigned int reg_val;
- spin_lock_irqsave(&db->lock, flags);
-
emac_update_speed(dev);
emac_update_duplex(dev);
@@ -398,7 +395,15 @@ static void emac_init_device(struct net_device *dev)
reg_val = readl(db->membase + EMAC_INT_CTL_REG);
reg_val |= (0xf << 0) | (0x01 << 8);
writel(reg_val, db->membase + EMAC_INT_CTL_REG);
+}
+
+static void emac_init_device(struct net_device *dev)
+{
+ struct emac_board_info *db = netdev_priv(dev);
+ unsigned long flags;
+ spin_lock_irqsave(&db->lock, flags);
+ emac_init_device_no_lock(dev);
spin_unlock_irqrestore(&db->lock, flags);
}
@@ -416,7 +421,7 @@ static void emac_timeout(struct net_device *dev, unsigned int txqueue)
netif_stop_queue(dev);
emac_reset(db);
- emac_init_device(dev);
+ emac_init_device_no_lock(dev);
/* We can accept TX packets again */
netif_trans_update(dev);
netif_wake_queue(dev);
--
2.25.1
Dne ponedeljek, 12. september 2022 ob 08:33:31 CEST je [email protected]
napisal(a):
> From: qianfan Zhao <[email protected]>
>
> emac_timeout() callback acquires lock and so does emac_init_device(),
> which called inside lock protected region. This hangs the system and
> produces following warning:
>
> WARNING: CPU: 2 PID: 0 at net/sched/sch_generic.c:478
> dev_watchdog+0x2e4/0x2e8
> NETDEV WATCHDOG: FE0 (sun4i-emac): transmit queue 0 timed out
> Modules linked in:
> CPU: 2 PID: 0 Comm: swapper/2 Tainted: G W 5.15.0-00047-g0848e4aeb313
> Hardware name: Wisdom T3 based CCT Family
> [<c010f740>] (unwind_backtrace) from [<c010b744>] (show_stack+0x10/0x14)
> [<c010b744>] (show_stack) from [<c0a4d978>] (dump_stack_lvl+0x40/0x4c)
> [<c0a4d978>] (dump_stack_lvl) from [<c0120734>] (__warn+0x104/0x108)
> [<c0120734>] (__warn) from [<c01207b0>] (warn_slowpath_fmt+0x78/0xbc)
> [<c01207b0>] (warn_slowpath_fmt) from [<c0898a54>]
> (dev_watchdog+0x2e4/0x2e8)
> [<c0898a54>] (dev_watchdog) from [<c019a888>] (call_timer_fn+0x3c/0x178)
> [<c019a888>] (call_timer_fn) from [<c019bddc>]
> (run_timer_softirq+0x540/0x624)
> [<c019bddc>] (run_timer_softirq) from [<c0101298>]
> (__do_softirq+0x130/0x3bc)
> [<c0101298>] (__do_softirq) from [<c0127ea0>] (irq_exit+0xbc/0x100)
> [<c0127ea0>] (irq_exit) from [<c017e514>] (handle_domain_irq+0x60/0x78)
> [<c017e514>] (handle_domain_irq) from [<c05a6130>]
> (gic_handle_irq+0x7c/0x90)
> [<c05a6130>] (gic_handle_irq) from [<c0100afc>] (__irq_svc+0x5c/0x78)
> Exception stack(0xc14f3f70 to 0xc14f3fb8)
> 3f60: 0003475c 00000000 00000001
> c01188a0
> 3f80: c107b200 c0f06b4c c0f06b90 00000004 c1079ff8 c0c62774 00000000
> 00000000
> 3fa0: c10b2198 c14f3fc0 c0107fcc c0107fd0 60030013 ffffffff
> [<c0100afc>] (__irq_svc) from [<c0107fd0>] (arch_cpu_idle+0x38/0x3c)
> [<c0107fd0>] (arch_cpu_idle) from [<c0a57f38>]
> (default_idle_call+0x3c/0xcc)
> [<c0a57f38>] (default_idle_call) from [<c0157258>] (do_idle+0xdc/0x13c)
> [<c0157258>] (do_idle) from [<c01575a4>] (cpu_startup_entry+0x18/0x1c)
> [<c01575a4>] (cpu_startup_entry) from [<401015d0>] (0x401015d0)
> ---[ end trace a70942a1265338f1 ]---
> rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
> rcu: \x092-...0: (1 GPs behind) idle=75d/0/0x1 softirq=8288/8289 fqs=931
> \x09(detected by 0, t=2102 jiffies, g=13485, q=1635)
> Sending NMI from CPU 0 to CPUs 2:
> spi_master spi2: spi2.1: timeout transferring 4 bytes@100000Hz for
> 110(100)ms
> spidev spi2.1: SPI transfer failed: -110
>
> Fixes: 492205050d77b (net: Add EMAC ethernet driver found on Allwinner A10
> SoC's) Signed-off-by: qianfan Zhao <[email protected]>
Reviewed-by: Jernej Skrabec <[email protected]>
Best regards,
Jernej
From: qianfan Zhao <[email protected]>
History:
2022-09-12:
Introduce the first patch and can read it from:
https://lkml.kernel.org/lkml/[email protected]/
That was reviewed by Jernej Skrabec <[email protected]> but have not
marged.
2023-04-27:
Apply the first patch and I found the bug was not fully fixed.
I also get those error messages sometimes:
[ 108.581230] spi_master spi2: spi2.1: timeout transferring 1025 bytes@100000Hz for 190(164)ms
[ 108.590337] spidev spi2.1: SPI transfer failed: -110
[ 108.595443] spi_master spi2: failed to transfer one message from queue
...
I had tried `kdump` and `crash` tools but noting is useful.
Few days later I found `softirq` takes about 100% cpu of a cpu core, listen
softirq_entry, softirq_exit, net_dev_xmit events and I got those flood
messages:
289.902631: softirq_entry: vec=2 [action=NET_TX]
289.902651: net_dev_xmit: dev=eth0 skbaddr=(ptrval) len=98 rc=16
289.902656: softirq_exit: vec=2 [action=NET_TX]
289.902659: softirq_entry: vec=2 [action=NET_TX]
289.902664: net_dev_xmit: dev=eth0 skbaddr=(ptrval) len=98 rc=16
289.902668: softirq_exit: vec=2 [action=NET_TX]
...
And then I debug the linux kernel under qemu, make the emac-driver in qemu
drop some tx packages by this way:
diff --git a/hw/net/allwinner_emac.c b/hw/net/allwinner_emac.c
index 372e5b66da..28dfb1116b 100644
--- a/hw/net/allwinner_emac.c
+++ b/hw/net/allwinner_emac.c
@@ -349,9 +349,14 @@ static void aw_emac_write(void *opaque, hwaddr offset, uint64_t value,
"allwinner_emac: TX length > fifo data length\n");
}
if (len > 0) {
+ int ignore = random() % 10 < 1;
data = fifo8_pop_buf(fifo, len, &ret);
- qemu_send_packet(nc, data, ret);
+ if (!ignore)
+ qemu_send_packet(nc, data, ret);
aw_emac_tx_reset(s, chan);
+
+ if (ignore)
+ break;
/* Raise TX interrupt */
s->int_sta |= EMAC_INT_TX_CHAN(chan);
aw_emac_update_irq(s);
It's very easy to reproduce this bug now.
Next is the backtrace of gdb when softirq was raise again:
#0 __raise_softirq_irqoff (nr=nr@entry=2) at kernel/softirq.c:699
#1 raise_softirq_irqoff (nr=nr@entry=2) at kernel/softirq.c:671
#2 0xc0855a34 in __netif_reschedule (q=0xc2027c00) at net/core/dev.c:3041
#3 __netif_schedule (q=q@entry=0xc2027c00) at net/core/dev.c:3048
#4 0xc085b0ec in qdisc_run_end (qdisc=0xc2027c00) at ./include/net/sch_generic.h:227
#5 qdisc_run (q=0xc2027c00) at ./include/net/pkt_sched.h:133
#6 net_tx_action (h=<optimized out>) at net/core/dev.c:5046
#7 0xc0101298 in __do_softirq () at kernel/softirq.c:558
#8 0xc0127cd0 in run_ksoftirqd (cpu=<optimized out>) at kernel/softirq.c:920
#9 0xc01487d0 in smpboot_thread_fn (data=0xc14a2780) at kernel/smpboot.c:164
#10 0xc0144b58 in kthread (_create=0xc14a2800) at kernel/kthread.c:319
#11 0xc0100130 in ret_from_fork () at arch/arm/kernel/entry-common.S:146
Backtrace stopped: previous frame identical to this frame (corrupt stack?)
`net_tx_action` is running in `__do_softirq` and it will send package when
`qdisc_run`. But the emac driver in linux alway return NETDEV_TX_BUSY(16)
after emac_timeout due to we forget reset `db->tx_fifo_stat`,
that will make `__netif_schedule` raise softirq again and again.
qianfan Zhao (2):
drivers: net: sun4i-emac: Fix double spinlock in emac_timeout
drivers: net: sun4i-emac: Fix emac_timeout
drivers/net/ethernet/allwinner/sun4i-emac.c | 18 +++++++++++++-----
1 file changed, 13 insertions(+), 5 deletions(-)
--
2.25.1
From: qianfan Zhao <[email protected]>
emac_timeout() callback acquires lock and so does emac_init_device(),
which called inside lock protected region. This hangs the system and
produces following warning:
WARNING: CPU: 2 PID: 0 at net/sched/sch_generic.c:478
dev_watchdog+0x2e4/0x2e8
NETDEV WATCHDOG: FE0 (sun4i-emac): transmit queue 0 timed out
Modules linked in:
CPU: 2 PID: 0 Comm: swapper/2 Tainted: G W 5.15.0-00047-g0848e4aeb313
Hardware name: Wisdom T3 based CCT Family
[<c010f740>] (unwind_backtrace) from [<c010b744>] (show_stack+0x10/0x14)
[<c010b744>] (show_stack) from [<c0a4d978>] (dump_stack_lvl+0x40/0x4c)
[<c0a4d978>] (dump_stack_lvl) from [<c0120734>] (__warn+0x104/0x108)
[<c0120734>] (__warn) from [<c01207b0>] (warn_slowpath_fmt+0x78/0xbc)
[<c01207b0>] (warn_slowpath_fmt) from [<c0898a54>]
(dev_watchdog+0x2e4/0x2e8)
[<c0898a54>] (dev_watchdog) from [<c019a888>] (call_timer_fn+0x3c/0x178)
[<c019a888>] (call_timer_fn) from [<c019bddc>]
(run_timer_softirq+0x540/0x624)
[<c019bddc>] (run_timer_softirq) from [<c0101298>]
(__do_softirq+0x130/0x3bc)
[<c0101298>] (__do_softirq) from [<c0127ea0>] (irq_exit+0xbc/0x100)
[<c0127ea0>] (irq_exit) from [<c017e514>] (handle_domain_irq+0x60/0x78)
[<c017e514>] (handle_domain_irq) from [<c05a6130>]
(gic_handle_irq+0x7c/0x90)
[<c05a6130>] (gic_handle_irq) from [<c0100afc>] (__irq_svc+0x5c/0x78)
Exception stack(0xc14f3f70 to 0xc14f3fb8)
3f60: 0003475c 00000000 00000001
c01188a0
3f80: c107b200 c0f06b4c c0f06b90 00000004 c1079ff8 c0c62774 00000000
00000000
3fa0: c10b2198 c14f3fc0 c0107fcc c0107fd0 60030013 ffffffff
[<c0100afc>] (__irq_svc) from [<c0107fd0>] (arch_cpu_idle+0x38/0x3c)
[<c0107fd0>] (arch_cpu_idle) from [<c0a57f38>]
(default_idle_call+0x3c/0xcc)
[<c0a57f38>] (default_idle_call) from [<c0157258>] (do_idle+0xdc/0x13c)
[<c0157258>] (do_idle) from [<c01575a4>] (cpu_startup_entry+0x18/0x1c)
[<c01575a4>] (cpu_startup_entry) from [<401015d0>] (0x401015d0)
---[ end trace a70942a1265338f1 ]---
rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
rcu: \x092-...0: (1 GPs behind) idle=75d/0/0x1 softirq=8288/8289 fqs=931
\x09(detected by 0, t=2102 jiffies, g=13485, q=1635)
Sending NMI from CPU 0 to CPUs 2:
spi_master spi2: spi2.1: timeout transferring 4 bytes@100000Hz for
110(100)ms
spidev spi2.1: SPI transfer failed: -110
Fixes: 492205050d77b (net: Add EMAC ethernet driver found on Allwinner A10 SoC's)
Signed-off-by: qianfan Zhao <[email protected]>
---
drivers/net/ethernet/allwinner/sun4i-emac.c | 15 ++++++++++-----
1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/allwinner/sun4i-emac.c b/drivers/net/ethernet/allwinner/sun4i-emac.c
index a94c62956eed..8093d6d2101c 100644
--- a/drivers/net/ethernet/allwinner/sun4i-emac.c
+++ b/drivers/net/ethernet/allwinner/sun4i-emac.c
@@ -488,14 +488,11 @@ static int emac_set_mac_address(struct net_device *dev, void *p)
}
/* Initialize emac board */
-static void emac_init_device(struct net_device *dev)
+static void emac_init_device_no_lock(struct net_device *dev)
{
struct emac_board_info *db = netdev_priv(dev);
- unsigned long flags;
unsigned int reg_val;
- spin_lock_irqsave(&db->lock, flags);
-
emac_update_speed(dev);
emac_update_duplex(dev);
@@ -508,7 +505,15 @@ static void emac_init_device(struct net_device *dev)
reg_val = readl(db->membase + EMAC_INT_CTL_REG);
reg_val |= (EMAC_INT_CTL_TX_EN | EMAC_INT_CTL_TX_ABRT_EN | EMAC_INT_CTL_RX_EN);
writel(reg_val, db->membase + EMAC_INT_CTL_REG);
+}
+
+static void emac_init_device(struct net_device *dev)
+{
+ struct emac_board_info *db = netdev_priv(dev);
+ unsigned long flags;
+ spin_lock_irqsave(&db->lock, flags);
+ emac_init_device_no_lock(dev);
spin_unlock_irqrestore(&db->lock, flags);
}
@@ -526,7 +531,7 @@ static void emac_timeout(struct net_device *dev, unsigned int txqueue)
netif_stop_queue(dev);
emac_reset(db);
- emac_init_device(dev);
+ emac_init_device_no_lock(dev);
/* We can accept TX packets again */
netif_trans_update(dev);
netif_wake_queue(dev);
--
2.25.1
From: qianfan Zhao <[email protected]>
The system will hang in softirq after emac_timeout happens, next is the
trace logs:
289.902631: softirq_entry: vec=2 [action=NET_TX]
289.902651: net_dev_xmit: dev=eth0 skbaddr=(ptrval) len=98 rc=16
289.902656: softirq_exit: vec=2 [action=NET_TX]
289.902659: softirq_entry: vec=2 [action=NET_TX]
289.902664: net_dev_xmit: dev=eth0 skbaddr=(ptrval) len=98 rc=16
289.902668: softirq_exit: vec=2 [action=NET_TX]
...
Clear tx_fifo_stat to free all tx channels after emac reset done to fix.
Fixes: 492205050d77b (net: Add EMAC ethernet driver found on Allwinner A10 SoC's)
Signed-off-by: qianfan Zhao <[email protected]>
---
drivers/net/ethernet/allwinner/sun4i-emac.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/net/ethernet/allwinner/sun4i-emac.c b/drivers/net/ethernet/allwinner/sun4i-emac.c
index 8093d6d2101c..964c5e7188f8 100644
--- a/drivers/net/ethernet/allwinner/sun4i-emac.c
+++ b/drivers/net/ethernet/allwinner/sun4i-emac.c
@@ -536,6 +536,9 @@ static void emac_timeout(struct net_device *dev, unsigned int txqueue)
netif_trans_update(dev);
netif_wake_queue(dev);
+ /* Clear tx_fifo_stat to free all tx channels */
+ db->tx_fifo_stat = 0;
+
/* Restore previous register address */
spin_unlock_irqrestore(&db->lock, flags);
}
--
2.25.1