Also reported here a few times, I've been seeing this bug every time I
try to load up the ath5k module in my kernel:
http://www.kerneloops.org/search.php?search=ieee80211_register_hw&btnG=Function+Search
If you guys have any suggestions, I'd love to hear them. I
disassembled the code in question but am not very good with these
things. It looks to be somewhere between lines 804-825 in
net/mac80211/main.c.
-Dan
ath5k_pci 0000:01:00.0: PCI INT A -> GSI 18 (level, low) -> IRQ 18
ath5k_pci 0000:01:00.0: setting latency timer to 64
ath5k_pci 0000:01:00.0: registered as ''
BUG: unable to handle kernel NULL pointer dereference at 00000000
IP: [<b80c1252>] :mac80211:ieee80211_register_hw+0x10f/0x2d6
*pde = 00000000
Oops: 0000 [#1] PREEMPT
Modules linked in: ath5k(+) mac80211
Pid: 818, comm: modprobe Not tainted (2.6.27.6eee #3)
EIP: 0060:[<b80c1252>] EFLAGS: 00010286 CPU: 0
EIP is at ieee80211_register_hw+0x10f/0x2d6 [mac80211]
EAX: 00000000 EBX: b7345000 ECX: 00000001 EDX: 00000001
ESI: b681c180 EDI: 00000000 EBP: b7183000 ESP: b7365e00
DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068
Process modprobe (pid: 818, ti=b7364000 task=b71f7130 task.ti=b7364000)
Stack: 00000000 00000000 b681cd00 b681cd00 b681fe06 b80ea639 b7365e70 b681cd00
00000005 b7064458 b7365e70 b7064400 781689c9 b7007000 b8060000 b681cd00
b681c180 00000000 b681fde4 b7183000 0c0c45e9 0000001a b681cde8 b7183000
Call Trace:
[<b80ea639>] ath5k_pci_probe+0xc27/0x1150 [ath5k]
[<781689c9>] find_inode+0x1b/0x56
[<781e0b0e>] pci_device_probe+0x36/0x55
[<78252342>] driver_probe_device+0xa1/0x132
[<7825240a>] __driver_attach+0x37/0x55
[<78251d86>] bus_for_each_dev+0x35/0x5c
[<782521f1>] driver_attach+0x11/0x13
[<782523d3>] __driver_attach+0x0/0x55
[<7825184b>] bus_add_driver+0x91/0x1a7
[<b802d000>] init_ath5k_pci+0x0/0x2f [ath5k]
[<78252571>] driver_register+0x7d/0xd6
[<b802d000>] init_ath5k_pci+0x0/0x2f [ath5k]
[<781e0d11>] __pci_register_driver+0x35/0x60
[<b802d01a>] init_ath5k_pci+0x1a/0x2f [ath5k]
[<7810111f>] _stext+0x37/0xfb
[<78137d76>] sys_init_module+0x87/0x175
[<78102de9>] sysenter_do_call+0x12/0x25
=======================
Code: 83 c8 00 00 00 21 03 c7 83 b4 00 00 00 1c 49 0d b8 c7 83 0c 02
00 00 ee 17 0c b8 8b 46 1c 8b 40 7c 8b 80 bc 00 00 00 6a 00 6a 00 <8b>
00 e8 75 64 06 c0 5f bf f4 ff ff ff 85 c0 89 46 20 5a 0f 84
EIP: [<b80c1252>] ieee80211_register_hw+0x10f/0x2d6 [mac80211] SS:ESP
0068:b7365e00
---[ end trace 0b0fad82f83782b1 ]---
GDB disassembly:
0x0000028f <ieee80211_register_hw+296>: cmpl $0xd,0x30(%esi)
0x00000293 <ieee80211_register_hw+300>: mov $0xd,%eax
0x00000298 <ieee80211_register_hw+305>: cmovae 0x30(%esi),%eax
0x0000029c <ieee80211_register_hw+309>: cmpl $0x9,(%esi)
0x0000029f <ieee80211_register_hw+312>: mov %eax,0x9c(%esi)
0x000002a5 <ieee80211_register_hw+318>: jg 0x2ad <ieee80211_register_hw+326>
0x000002a7 <ieee80211_register_hw+320>: movl $0x64,(%esi)
0x000002ad <ieee80211_register_hw+326>: cmpw $0x0,0x44(%esi)
0x000002b2 <ieee80211_register_hw+331>: jne 0x2ba <ieee80211_register_hw+339>
0x000002b4 <ieee80211_register_hw+333>: movw $0x1,0x44(%esi)
0x000002ba <ieee80211_register_hw+339>: mov 0x2c(%esi),%ecx
0x000002bd <ieee80211_register_hw+342>: mov 0x44(%esi),%eax
0x000002c0 <ieee80211_register_hw+345>: mov %ecx,%edx
0x000002c2 <ieee80211_register_hw+347>: mov %ax,0xc(%esi)
---Type <return> to continue, or q <return> to quit---
0x000002c6 <ieee80211_register_hw+351>: mov %cl,%al
0x000002c8 <ieee80211_register_hw+353>: and $0xe0,%al
0x000002ca <ieee80211_register_hw+355>: cmp $0x1,%al
0x000002cc <ieee80211_register_hw+357>: sbb %eax,%eax
0x000002ce <ieee80211_register_hw+359>: and $0x100,%edx
0x000002d4 <ieee80211_register_hw+365>: and $0xf,%al
0x000002d6 <ieee80211_register_hw+367>: inc %al
^^^ HERE ^^^
0x000002d8 <ieee80211_register_hw+369>: or 0x98(%esi),%al
0x000002de <ieee80211_register_hw+375>: cmp $0x1,%edx
0x000002e1 <ieee80211_register_hw+378>: sbb %edx,%edx
0x000002e3 <ieee80211_register_hw+380>: and $0x3c,%dl
0x000002e6 <ieee80211_register_hw+383>: add $0x4,%dl
0x000002e9 <ieee80211_register_hw+386>: or %al,%dl
0x000002eb <ieee80211_register_hw+388>: test %cl,%cl
0x000002ed <ieee80211_register_hw+390>: mov %dl,0x98(%esi)
0x000002f3 <ieee80211_register_hw+396>: jns 0x2fe <ieee80211_register_hw+407>
0x000002f5 <ieee80211_register_hw+398>: or $0x8,%dl
0x000002f8 <ieee80211_register_hw+401>: mov %dl,0x98(%esi)
0x000002fe <ieee80211_register_hw+407>: mov %esi,%eax
0x00000300 <ieee80211_register_hw+409>: call 0x301 <ieee80211_register_hw+410>
0x00000305 <ieee80211_register_hw+414>: test %eax,%eax
0x00000307 <ieee80211_register_hw+416>: mov %eax,%edi
0x00000309 <ieee80211_register_hw+418>: js 0x41b <ieee80211_register_hw+692>
0x0000030f <ieee80211_register_hw+424>: call 0x310 <ieee80211_register_hw+425>
0x00000314 <ieee80211_register_hw+429>: mov 0x54(%esi),%eax
0x00000317 <ieee80211_register_hw+432>: mov %eax,%edx
GCC assembly compile:
call ieee80211_rx_bss_list_init
movl $1, %edx
movl $1, %ecx
movl $ieee80211_master_start_xmit, 412(%ebx)
movl $ieee80211_master_open, 508(%ebx)
movl $ieee80211_master_stop, 512(%ebx)
movw $801, 200(%ebx)
movl $ieee80211_header_ops, 180(%ebx)
movl $ieee80211_master_set_multicast_list, 524(%ebx)
movl 28(%esi), %eax
movl 124(%eax), %eax
movl 188(%eax), %eax
pushl $0
pushl $0
movl (%eax), %eax
call __create_workqueue_key
popl %edi
movl $-12, %edi
testl %eax, %eax
movl %eax, 32(%esi)
popl %edx
je .L34
cmpl $13, 48(%esi)
movl $13, %eax
cmovae 48(%esi), %eax
cmpl $9, (%esi)
movl %eax, 156(%esi)
jg .L35
movl $100, (%esi)
.L35:
cmpw $0, 68(%esi)
jne .L36
movw $1, 68(%esi)
.L36:
movl 44(%esi), %ecx
movl 68(%esi), %eax
movl %ecx, %edx
movw %ax, 12(%esi)
movb %cl, %al
andb $-32, %al
cmpb $1, %al
sbbl %eax, %eax
andl $256, %edx
andb $15, %al
incb %al
^^^ HERE ^^^
orb 152(%esi), %al
cmpl $1, %edx
sbbl %edx, %edx
andb $60, %dl
addb $4, %dl
orb %al, %dl
testb %cl, %cl
movb %dl, 152(%esi)
jns .L41
orb $8, %dl
movb %dl, 152(%esi)
.L41:
movl %esi, %eax
call sta_info_start
testl %eax, %eax
movl %eax, %edi
js .L42
call rtnl_lock
movl 84(%esi), %eax
movl %eax, %edx
call dev_alloc_name
testl %eax, %eax
movl %eax, %edi
js .L43
movl 28(%esi), %edx
movl 84(%esi), %ecx
movl (%edx), %eax
movl %eax, 308(%ecx)
movw 4(%edx), %ax
movw %ax, 312(%ecx)
movl 28(%esi), %eax
movl 84(%esi), %edx
movl 124(%eax), %eax
movl %eax, 688(%edx)
movl 84(%esi), %eax
call register_netdevice
testl %eax, %eax
movl %eax, %edi
js .L43
movl 36(%esi), %edx
movl %esi, %eax
call ieee80211_init_rate_ctrl_alg
On Sat, Nov 15, 2008 at 08:38:32PM -0600, Dan McGee wrote:
> I should clarify here- the real bug was when trying to access
> '->driver', as we got the 00000001 poison pointer returned (this is a
> poison value, right?).
Not sure it's a poison, but it's definitely a bad value. The good news
is that with your config and compat-wireless I can easily reproduce this
now. So hopefully I'll be able to pin something down soon.
--
Bob Copeland %% http://www.bobcopeland.com
On Fri, Nov 14, 2008 at 1:17 AM, Luis R. Rodriguez <[email protected]> wrote:
> If our offsets are the same then its probably on line 791:
[...]
> 790 name = wiphy_dev(local->hw.wiphy)->driver->name;
> 791 local->hw.workqueue = create_freezeable_workqueue(name);
I agree, having looked at the objdump output. Hmm, maybe ->driver pointer
is bad even though I can't see that happening. Dan, can you try adding a
printk before line 790 to see if any of the pointers are null?
Also, do you have CONFIG_LOCKDEP enabled or no?
--
Bob Copeland %% http://www.bobcopeland.com
On Sun, Nov 16, 2008 at 10:20 AM, Bob Copeland <[email protected]> wrote:
> As you can see, the modules think "phy0" should be at offset 164, but
> it's really at 160.
> Also I noticed the original config had CONFIG_DYNAMIC_FTRACE=y, unfortunately
> turning that off didn't seem to change anything (hmm, I didn't get a build
> bug, was the breakage fixed in 2.6.27.y?)
Note that HAVE is set if the architecture supports it; CONFIG_FTRACE
itself was turned off (and CONFIG_DYNAMIC_FTRACE is not even present):
$ grep FTRACE .config
CONFIG_HAVE_FTRACE=y
CONFIG_HAVE_DYNAMIC_FTRACE=y
# CONFIG_FTRACE is not set
-Dan
On Fri, Nov 14, 2008 at 10:33 AM, Sujith <[email protected]> wrote:
> Luis R. Rodriguez wrote:
>> > From the original log:
>> > ath5k_pci 0000:01:00.0: registered as ''
>>
>> That comes from
>>
>> dev_info(&pdev->dev, "registered as '%s'\n", wiphy_name(hw->wiphy));
>>
>> which means that may be buggy too as wiphy_name() gets
>>
>> return wiphy->dev.bus_id;
>>
>> and we only have set at that point wiphy->dev.parent IIRC.
>>
>
> Nope, parent is set after that call in SET_IEEE80211_DEV()
> phy[x] gets assigned in alloc_hw().
>
> So why is it NULL even if alloc_hw() succeeds ?
Parent is wiphy->dev.parent not wiphy->dev though
Luis
On Fri, Nov 14, 2008 at 1:06 PM, Luis R. Rodriguez <[email protected]> wrote:
> Wow.. ok, yeah I see that, this is a mac8021 bug then unless mac80211
> wants us to not callieee80211_register_hw() from probe or its children
> which would be weird.
>
> So we need to fix that.
Err, wait, I'm wrong. That's pci_dev->driver, not pci_dev->dev->driver.
--
Bob Copeland %% http://www.bobcopeland.com
On Fri, Nov 14, 2008 at 6:36 PM, Luis R. Rodriguez
<[email protected]> wrote:
> On Fri, Nov 14, 2008 at 04:25:53PM -0800, Dan McGee wrote:
>> On Fri, Nov 14, 2008 at 3:13 PM, Luis R. Rodriguez <[email protected]> wrote:
>> > On Fri, Nov 14, 2008 at 12:41 PM, Luis R. Rodriguez <[email protected]> wrote:
>> >> On Fri, Nov 14, 2008 at 12:33 PM, Bob Copeland <[email protected]> wrote:
>> >>> On Fri, Nov 14, 2008 at 2:33 PM, Dan McGee <[email protected]> wrote:
>> >>>> On Fri, Nov 14, 2008 at 12:37 PM, Luis R. Rodriguez <[email protected]> wrote:
>> >>>>> On Fri, Nov 14, 2008 at 10:33 AM, Sujith <[email protected]> wrote:
>> >>>>>> Luis R. Rodriguez wrote:
>> >>>>>>> > From the original log:
>> >>>>>>> > ath5k_pci 0000:01:00.0: registered as ''
>> >>>>>>>
>> >>>>>>> That comes from
>> >>>>>>>
>> >>>>>>> dev_info(&pdev->dev, "registered as '%s'\n", wiphy_name(hw->wiphy));
>> >>>>>>>
>> >>>>>>> which means that may be buggy too as wiphy_name() gets
>> >>>>>>>
>> >>>>>>> return wiphy->dev.bus_id;
>> >>>>>>>
>> >>>>>>> and we only have set at that point wiphy->dev.parent IIRC.
>> >>>>>>>
>> >>>>>>
>> >>>>>> Nope, parent is set after that call in SET_IEEE80211_DEV()
>> >>>>>> phy[x] gets assigned in alloc_hw().
>> >>>>>>
>> >>>>>> So why is it NULL even if alloc_hw() succeeds ?
>> >>>>>
>> >>>>> Parent is wiphy->dev.parent not wiphy->dev though
>> >>>
>> >>> Sujith is right, wiphy->dev.bus_id should be set by the time that's printed,
>> >>> since wiphy_new in alloc_hw fills it in.
>> >>
>> >> Ah, I see, yes, oh man. Well this is good :)
>> >
>> > OK then try the patch in the other thread. If that doesn't fix it then
>> > it has to be another issue.
>>
>> Can someone point me to the "other thread"? I don't think I was CCed
>> on it since I came late to the party.
>>
>> Someone also inquired about versions. I'm using stable kernel 2.6.27.6
>> and compat-wireless-2008-11-13 (10-28 was doing the same thing
>> however).
>
> Bounced you a copy. Please give it a try.
I gave it a shot. It worked (with some conditions attached). At first
I built with just the patch, not enabling kernel debugging or
anything, and I got a different NPE in some sysfs registration code
deeper down in what I think was still the register_hw method. I can't
fully remember, because it wasn't an easily savable stack trace and I
immediately decided I would just rebuild everything with debug
symbols.
Of course, now that I have a kernel with full debug symbols, I can't
get it to oops. Figures. I'll keep you updated if I see this error
again.
-Dan
On Sat, Nov 15, 2008 at 10:15:31PM -0800, Luis R. Rodriguez wrote:
> The answer is drivers/base/dd.c really_probe(). Curious enough guess
> what, dev->driver = drv; is assigned *before* the device driver probe
> :) (bus probe in this case first, so dev->bus->probe) contrary to the
> other way around. So you're absolutely right the patch is not
> necessary then. Good catch.
Cool, so that's where that is.
I got some more info, this looks like a miscompilation of some kind,
maybe a mismatch between kernel config and compat-wireless, but I didn't
see anything obvious.
Here's what I did:
--- drivers/net/wireless/ath5k/base.c 2008-11-14 00:11:09.000000000 -0500
+++ drivers/net/wireless/ath5k/my-base.c 2008-11-16 11:01:42.000000000 -0500
@@ -490,6 +490,9 @@
goto err_map;
}
+ print_hex_dump(KERN_INFO, "", DUMP_PREFIX_OFFSET, 16, 1,
+ hw->wiphy, sizeof(struct wiphy), 1);
+ printk(KERN_INFO "offset: %d\n", offsetof(struct wiphy, dev.bus_id));
dev_info(&pdev->dev, "registered as '%s'\n", wiphy_name(hw->wiphy));
/* Initialize driver private data */
Which produced:
ath5k_pci 0000:02:00.0: PCI INT A -> GSI 17 (level, low) -> IRQ 17
ath5k_pci 0000:02:00.0: setting latency timer to 64
00000000: 00 00 00 00 00 00 00 00 b4 ea 07 b7 00 00 00 00 ................
00000010: 00 00 00 00 54 c0 03 b3 54 c0 03 b3 9a c0 24 78 ....T...T.....$x
00000020: 58 c0 24 78 00 00 00 00 00 00 00 00 00 00 00 00 X.$x............
00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
00000040: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
00000050: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
00000060: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
00000070: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
00000080: c0 c0 03 b3 c0 c0 03 b3 00 00 00 00 40 b2 c0 b5 ............@...
00000090: 28 cf 42 78 00 00 00 00 01 00 00 00 01 00 00 00 (.Bx............
000000a0: 70 68 79 30 00 00 00 00 00 00 00 00 00 00 00 00 phy0............
000000b0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
000000c0: 01 00 00 00 04 c1 03 b3 04 c1 03 b3 00 00 00 00 ................
000000d0: 00 00 00 00 00 00 00 00 00 c0 03 b3 00 00 00 00 ................
000000e0: 00 00 00 00 01 00 00 00 00 00 00 00 00 00 00 00 ................
000000f0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
00000100: 40 c1 03 b3 40 c1 03 b3 00 00 00 00 00 00 00 00 @...@...........
00000110: 50 c1 03 b3 50 c1 03 b3 58 c1 03 b3 58 c1 03 b3 P...P...X...X...
00000120: 2c 0f 45 78 00 00 00 00 00 00 00 00 00 00 00 00 ,.Ex............
00000130: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
offset: 164
ath5k_pci 0000:02:00.0: registered as ''
BUG: unable to handle kernel NULL pointer dereference at 00000000
As you can see, the modules think "phy0" should be at offset 164, but
it's really at 160.
Also I noticed the original config had CONFIG_DYNAMIC_FTRACE=y, unfortunately
turning that off didn't seem to change anything (hmm, I didn't get a build
bug, was the breakage fixed in 2.6.27.y?)
--
Bob Copeland %% http://www.bobcopeland.com
On Fri, Nov 14, 2008 at 2:33 PM, Dan McGee <[email protected]> wrote:
> On Fri, Nov 14, 2008 at 12:37 PM, Luis R. Rodriguez <[email protected]> wrote:
>> On Fri, Nov 14, 2008 at 10:33 AM, Sujith <[email protected]> wrote:
>>> Luis R. Rodriguez wrote:
>>>> > From the original log:
>>>> > ath5k_pci 0000:01:00.0: registered as ''
>>>>
>>>> That comes from
>>>>
>>>> dev_info(&pdev->dev, "registered as '%s'\n", wiphy_name(hw->wiphy));
>>>>
>>>> which means that may be buggy too as wiphy_name() gets
>>>>
>>>> return wiphy->dev.bus_id;
>>>>
>>>> and we only have set at that point wiphy->dev.parent IIRC.
>>>>
>>>
>>> Nope, parent is set after that call in SET_IEEE80211_DEV()
>>> phy[x] gets assigned in alloc_hw().
>>>
>>> So why is it NULL even if alloc_hw() succeeds ?
>>
>> Parent is wiphy->dev.parent not wiphy->dev though
Sujith is right, wiphy->dev.bus_id should be set by the time that's printed,
since wiphy_new in alloc_hw fills it in. Curiouser and curiouser. I look
forward to this being an obvious and embarrassing bug :)
> So I'm at work for another 4 or 5 hours, but I should have a chance to
> hack at anything tonight you guys want me to. Let me know if the
> earlier suggestions in this thread still apply.
I think the printks will still be useful, if only to verify we're on
the right track. Which compat-wireless version are you using?
BTW from kerneloops, several people are having this with other drivers than
ath5k.
--
Bob Copeland %% http://www.bobcopeland.com
On Sun, Nov 16, 2008 at 10:52 AM, Bob Copeland <[email protected]> wrote:
> On Sun, Nov 16, 2008 at 10:38:39AM -0600, Dan McGee wrote:
>> On Sun, Nov 16, 2008 at 10:20 AM, Bob Copeland <[email protected]> wrote:
>> > As you can see, the modules think "phy0" should be at offset 164, but
>> > it's really at 160.
>>
>> > Also I noticed the original config had CONFIG_DYNAMIC_FTRACE=y, unfortunately
>> > turning that off didn't seem to change anything (hmm, I didn't get a build
>> > bug, was the breakage fixed in 2.6.27.y?)
>>
>> Note that HAVE is set if the architecture supports it; CONFIG_FTRACE
>> itself was turned off (and CONFIG_DYNAMIC_FTRACE is not even present):
>> $ grep FTRACE .config
>> CONFIG_HAVE_FTRACE=y
>> CONFIG_HAVE_DYNAMIC_FTRACE=y
>> # CONFIG_FTRACE is not set
>
> Oh, you are right. Thanks. I probably won't be able to do any more
> testing today but I would suggest trying to turn off wireless-related
> options in the main kernel config and see if you can get it working
> (start with cfg80211 and its dependencies...)
The fun continues. If I rebuild the kernel with all wireless options
turned off except WIRELESS_EXT, then build compat-wireless, our module
appears to load without issues.
.config:
#
# Wireless
#
# CONFIG_CFG80211 is not set
CONFIG_WIRELESS_EXT=y
CONFIG_WIRELESS_EXT_SYSFS=y
# CONFIG_MAC80211 is not set
# CONFIG_IEEE80211 is not set
# CONFIG_RFKILL is not set
# CONFIG_NET_9P is not set
dmesg:
atl2: eth0 NIC Link is Up<100 Mbps Full Duplex>
evdev.c(EVIOCGBIT): Suspicious buffer size 511, limiting output to 64
bytes. See http://userweb.kernel.org/~dtor/eviocgbit-bug.html
<<< boot stopped here, then I modprobed ath5k >>>
cfg80211: Using static regulatory domain info
cfg80211: Regulatory domain: US
(start_freq - end_freq @ bandwidth), (max_antenna_gain, max_eirp)
(2402000 KHz - 2472000 KHz @ 40000 KHz), (600 mBi, 2700 mBm)
(5170000 KHz - 5190000 KHz @ 40000 KHz), (600 mBi, 2300 mBm)
(5190000 KHz - 5210000 KHz @ 40000 KHz), (600 mBi, 2300 mBm)
(5210000 KHz - 5230000 KHz @ 40000 KHz), (600 mBi, 2300 mBm)
(5230000 KHz - 5330000 KHz @ 40000 KHz), (600 mBi, 2300 mBm)
(5735000 KHz - 5835000 KHz @ 40000 KHz), (600 mBi, 3000 mBm)
cfg80211: Calling CRDA for country: US
ath5k_pci 0000:01:00.0: PCI INT A -> GSI 18 (level, low) -> IRQ 18
ath5k_pci 0000:01:00.0: setting latency timer to 64
ath5k_pci 0000:01:00.0: registered as 'phy0'
phy0: Selected rate control algorithm 'pid'
ath5k phy0: Atheros AR2425 chip found (MAC: 0xe2, PHY: 0x70)
wlan0: authenticate with AP 00:1c:10:21:f5:f4
wlan0: authenticated
wlan0: associate with AP 00:1c:10:21:f5:f4
wlan0: RX AssocResp from 00:1c:10:21:f5:f4 (capab=0x411 status=0 aid=3)
wlan0: associated
I had not seen this cfg80211 stuff before when modprobing. I also see
that cfg80211 is now loaded as a module- was the fact that it was
built into the kernel before causing all these problems? That would be
my strong suspicion, in which case this entire thread is really a bum
bug report, except that compat-wireless should enforce that cfg80211
is a module just like it currently enforces that for mac80211, saving
someone else this headache.
-Dan
On Fri, Nov 14, 2008 at 10:13 AM, Sujith <[email protected]> wrote:
> Luis R. Rodriguez wrote:
>> On Fri, Nov 14, 2008 at 9:49 AM, Bob Copeland <[email protected]> wrote:
>> > On Fri, Nov 14, 2008 at 12:37 PM, Luis R. Rodriguez <[email protected]> wrote:
>> >> Anyway we do set the parent before calling ieee80211_register_hw() by
>> >> using SET_IEEE80211_DEV(hw, &pdev->dev) (oh ok here is the parent). so
>> >> when we try to get the name with parent->driver I am not sure if
>> >> driver has been set yet because ath5k_pci_probe() hasn't finished yet
>> >> as probe called ath5k_hw_attach(). When does ->driver get set and why
>> >> would it fail only for ath5k?
>> >
>> > Ahh, I believe that would be in pci-driver.c __pci_device_probe, after driver
>> > probe is run successfully. Yes, we do ieee80211_register_hw from within
>> > probe... So, how did this ever work again? /me scratches head.
>>
>> Wow.. ok, yeah I see that, this is a mac8021 bug then unless mac80211
>> wants us to not callieee80211_register_hw() from probe or its children
>> which would be weird.
>
> From the original log:
> ath5k_pci 0000:01:00.0: registered as ''
That comes from
dev_info(&pdev->dev, "registered as '%s'\n", wiphy_name(hw->wiphy));
which means that may be buggy too as wiphy_name() gets
return wiphy->dev.bus_id;
and we only have set at that point wiphy->dev.parent IIRC.
Luis
Luis R. Rodriguez wrote:
> On Fri, Nov 14, 2008 at 9:49 AM, Bob Copeland <[email protected]> wrote:
> > On Fri, Nov 14, 2008 at 12:37 PM, Luis R. Rodriguez <[email protected]> wrote:
> >> Anyway we do set the parent before calling ieee80211_register_hw() by
> >> using SET_IEEE80211_DEV(hw, &pdev->dev) (oh ok here is the parent). so
> >> when we try to get the name with parent->driver I am not sure if
> >> driver has been set yet because ath5k_pci_probe() hasn't finished yet
> >> as probe called ath5k_hw_attach(). When does ->driver get set and why
> >> would it fail only for ath5k?
> >
> > Ahh, I believe that would be in pci-driver.c __pci_device_probe, after driver
> > probe is run successfully. Yes, we do ieee80211_register_hw from within
> > probe... So, how did this ever work again? /me scratches head.
>
> Wow.. ok, yeah I see that, this is a mac8021 bug then unless mac80211
> wants us to not callieee80211_register_hw() from probe or its children
> which would be weird.
>From the original log:
ath5k_pci 0000:01:00.0: registered as ''
Sujith
On Fri, Nov 14, 2008 at 12:37 PM, Luis R. Rodriguez <[email protected]> wrote:
> Anyway we do set the parent before calling ieee80211_register_hw() by
> using SET_IEEE80211_DEV(hw, &pdev->dev) (oh ok here is the parent). so
> when we try to get the name with parent->driver I am not sure if
> driver has been set yet because ath5k_pci_probe() hasn't finished yet
> as probe called ath5k_hw_attach(). When does ->driver get set and why
> would it fail only for ath5k?
Ahh, I believe that would be in pci-driver.c __pci_device_probe, after driver
probe is run successfully. Yes, we do ieee80211_register_hw from within
probe... So, how did this ever work again? /me scratches head.
--
Bob Copeland %% http://www.bobcopeland.com
On Sun, 2008-11-16 at 12:01 -0600, Dan McGee wrote:
> I had not seen this cfg80211 stuff before when modprobing. I also see
> that cfg80211 is now loaded as a module- was the fact that it was
> built into the kernel before causing all these problems? That would be
> my strong suspicion, in which case this entire thread is really a bum
> bug report, except that compat-wireless should enforce that cfg80211
> is a module just like it currently enforces that for mac80211, saving
> someone else this headache.
Let me guess, symbol versioning was off?
johannes
On Fri, Nov 14, 2008 at 11:02 AM, Bob Copeland <[email protected]> wrote:
> On Fri, Nov 14, 2008 at 1:17 AM, Luis R. Rodriguez <[email protected]> wrote:
>> If our offsets are the same then its probably on line 791:
> [...]
>> 790 name = wiphy_dev(local->hw.wiphy)->driver->name;
>> 791 local->hw.workqueue = create_freezeable_workqueue(name);
>
> I agree, having looked at the objdump output. Hmm, maybe ->driver pointer
> is bad even though I can't see that happening. Dan, can you try adding a
> printk before line 790 to see if any of the pointers are null?
So I went back and added a few things to the original unpatched code
to see what was NULL pointering, just to be sure we were thinking
right. Here is the relevant code:
printk(KERN_DEBUG "wiphy_dev() : %p\n", wiphy_dev(local->hw.wiphy));
printk(KERN_DEBUG "driver : %p\n",
wiphy_dev(local->hw.wiphy)->driver);
printk(KERN_DEBUG "driver->name: %p\n",
wiphy_dev(local->hw.wiphy)->driver->name);
name = wiphy_dev(local->hw.wiphy)->driver->name;
local->hw.workqueue = create_freezeable_workqueue(name);
And the dmesg output:
ath5k_pci xxx: registered as ''
wiphy_dev() : b730b408
driver : 00000001
BUG: unalbe to handle kernel NULL pointer dereference at 00000001
So we bugged out on trying to print driver->name, which is the same
problem we would have hit in the 'name =' line.
-Dan
Luis R. Rodriguez wrote:
> > From the original log:
> > ath5k_pci 0000:01:00.0: registered as ''
>
> That comes from
>
> dev_info(&pdev->dev, "registered as '%s'\n", wiphy_name(hw->wiphy));
>
> which means that may be buggy too as wiphy_name() gets
>
> return wiphy->dev.bus_id;
>
> and we only have set at that point wiphy->dev.parent IIRC.
>
Nope, parent is set after that call in SET_IEEE80211_DEV()
phy[x] gets assigned in alloc_hw().
So why is it NULL even if alloc_hw() succeeds ?
Sujith
On Sat, Nov 15, 2008 at 8:11 PM, Dan McGee <[email protected]> wrote:
> On Fri, Nov 14, 2008 at 11:02 AM, Bob Copeland <[email protected]> wrote:
>> On Fri, Nov 14, 2008 at 1:17 AM, Luis R. Rodriguez <[email protected]> wrote:
>>> If our offsets are the same then its probably on line 791:
>> [...]
>>> 790 name = wiphy_dev(local->hw.wiphy)->driver->name;
>>> 791 local->hw.workqueue = create_freezeable_workqueue(name);
>>
>> I agree, having looked at the objdump output. Hmm, maybe ->driver pointer
>> is bad even though I can't see that happening. Dan, can you try adding a
>> printk before line 790 to see if any of the pointers are null?
>
> So I went back and added a few things to the original unpatched code
> to see what was NULL pointering, just to be sure we were thinking
> right. Here is the relevant code:
> printk(KERN_DEBUG "wiphy_dev() : %p\n", wiphy_dev(local->hw.wiphy));
> printk(KERN_DEBUG "driver : %p\n",
> wiphy_dev(local->hw.wiphy)->driver);
> printk(KERN_DEBUG "driver->name: %p\n",
> wiphy_dev(local->hw.wiphy)->driver->name);
> name = wiphy_dev(local->hw.wiphy)->driver->name;
> local->hw.workqueue = create_freezeable_workqueue(name);
>
> And the dmesg output:
> ath5k_pci xxx: registered as ''
> wiphy_dev() : b730b408
> driver : 00000001
> BUG: unalbe to handle kernel NULL pointer dereference at 00000001
>
> So we bugged out on trying to print driver->name, which is the same
> problem we would have hit in the 'name =' line.
I should clarify here- the real bug was when trying to access
'->driver', as we got the 00000001 poison pointer returned (this is a
poison value, right?).
The above sequence of events was what took place when trying to load
the module on startup. To see if other things had an effect, I
disabled module autoloading during the boot sequence and got slightly
different results but it looks to be the same type of problem:
registered as ''
wiphy_dev: b730d740
driver: 7fffffff
driver->name: ffffffff
BUG: unable to handle kernel paging request at ffffffff
One more note- booting with the 2.6.27.6 shipped wireless modules
(mac80211 and ath5k) has always been working fine. It is only when I
try to run compat-wireless on top of this kernel that we are seeing
issues. Theoretically that means this should be bisectable if we
really can't figure it out, but I'm not sure how practical that is.
-Dan
On Fri, Nov 14, 2008 at 8:57 PM, Dan McGee <[email protected]> wrote:
>
> BUG: unable to handle kernel NULL pointer dereference at 00000082
> IP: [<7818ca71>] sysfs_find_dirent+0x9/0x23
> Oops: 0000 [#1] PREEMPT
> Modules linked in: ath5k(+) mac80211
>
> Pid: 818 comm: modprobe Not tainted (2.6.27.6eee #1)
> EIP: 0060:[<7818ca71>] EFLAGS: 00010206 CPU: 0
> EIP is at sysfs_find_dirent+0x9/0x23
> EAX: 00000001 EBX: 00000072 ECX: 00000001 EDX: b730b4f0
> ESI: b730b4f0 EDI: fffffff4 EBP: b7311490 ESP: b73ffd34
> DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068
> Stack: <<<i'm too lazy>>>>
> Call Trace:
> [<7818cb39>]: __sysfs_add_one+0x10/0x69
> [<7818cba2>]: sysfs_add_one+0x9/0x2f
> [<7818d0bc>]: create_dir+0x3c/0x62
> sysfs_create_dir+0x29/0x3b
> kobject_get
> kobject_add_internal
> kobject_add
> device_add+0x75/0x456
> apic_timer_interrupt+0x28/0x30
> strlcpy+0x11/0x3e
> register_netdevice+0x1f1/0x31a
> ieee80211_register_hw+0x1eb/0x2d0 [mac80211]
> ath5k_pci_probe+0xc27/0x1150 [ath5k]
> find_inode
> pci_device_probe
> driver_probe_device
> .......
> init_ath5k_pci [ath5k]
> .......
> ==============
> Code: dd 85 c0 53 89 c3 74 16 83 38 00 75 0f ba 82 00 00 00 b8 f7 75
> 3c 78 e8 70 da f8 ff ff 03 89 d8 5b c3 56 89 d6 53 8b 58 18 eb 11 <8b>
> 43 10 89 f2 e8 34 cc 04 00 85 c0 74 07 8b 5b 0c 85 db 75 eb
>
> $ AFLAGS=--32 scripts/decodecode < /tmp/oops.txt
> Code: dd 85 c0 53 89 c3 74 16 83 38 00 75 0f ba 82 00 00 00 b8 f7 75
> 3c 78 e8 70 da f8 ff ff 03 89 d8 5b c3 56 89 d6 53 8b 58 18 eb 11 <8b>
> 43 10 89 f2 e8 34 cc 04 00 85 c0 74 07 8b 5b 0c 85 db 75 eb
>
> /tmp/tmp.xJNdgiQwSL.o: file format elf32-i386
>
> Disassembly of section .text:
>
> 00000000 <.text>:
> 0: dd 85 c0 53 89 c3 fldl -0x3c76ac40(%ebp)
> 6: 74 16 je 0x1e
> 8: 83 38 00 cmpl $0x0,(%eax)
> b: 75 0f jne 0x1c
> d: ba 82 00 00 00 mov $0x82,%edx
> 12: b8 f7 75 3c 78 mov $0x783c75f7,%eax
> 17: e8 70 da f8 ff call 0xfff8da8c
> 1c: ff 03 incl (%ebx)
> 1e: 89 d8 mov %ebx,%eax
> 20: 5b pop %ebx
> 21: c3 ret
> 22: 56 push %esi
> 23: 89 d6 mov %edx,%esi
> 25: 53 push %ebx
> 26: 8b 58 18 mov 0x18(%eax),%ebx
> 29: eb 11 jmp 0x3c
>
> /tmp/tmp.xJNdgiQwSL.o: file format elf32-i386
>
> Disassembly of section .text:
>
> 00000000 <.text>:
> 0: 8b 43 10 mov 0x10(%ebx),%eax
> 3: 89 f2 mov %esi,%edx
> 5: e8 34 cc 04 00 call 0x4cc3e
> a: 85 c0 test %eax,%eax
> c: 74 07 je 0x15
> e: 8b 5b 0c mov 0xc(%ebx),%ebx
> 11: 85 db test %ebx,%ebx
> 13: 75 eb jne 0x0
And I had the code completely wrong, oops. Looks like we are bailing
on the strcmp call in this function or something along those lines? I
wish I could be a bigger help with debugging this stuff.
-Dan
000001bc <sysfs_find_dirent>:
* RETURNS:
* Pointer to sysfs_dirent if found, NULL if not.
*/
struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
const unsigned char *name)
{
1bc: 56 push %esi
1bd: 89 d6 mov %edx,%esi
1bf: 53 push %ebx
struct sysfs_dirent *sd;
for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling)
1c0: 8b 58 18 mov 0x18(%eax),%ebx
1c3: eb 11 jmp 1d6 <sysfs_find_dirent+0x1a>
if (!strcmp(sd->s_name, name))
1c5: 8b 43 10 mov 0x10(%ebx),%eax
1c8: 89 f2 mov %esi,%edx
1ca: e8 fc ff ff ff call 1cb <sysfs_find_dirent+0xf>
1cf: 85 c0 test %eax,%eax
1d1: 74 07 je 1da <sysfs_find_dirent+0x1e>
struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
const unsigned char *name)
{
struct sysfs_dirent *sd;
for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling)
1d3: 8b 5b 0c mov 0xc(%ebx),%ebx
1d6: 85 db test %ebx,%ebx
1d8: 75 eb jne 1c5 <sysfs_find_dirent+0x9>
if (!strcmp(sd->s_name, name))
return sd;
return NULL;
}
1da: 89 d8 mov %ebx,%eax
1dc: 5b pop %ebx
1dd: 5e pop %esi
1de: c3 ret
On Sat, Nov 15, 2008 at 09:48:13PM -0800, Luis R. Rodriguez wrote:
> Not sure why its 00000001, nor do I know if its poison. One thing I am
> fairly positive about is that the reason why this was wrong all along
> was because we were trying to get the device's ->driver structure to
> get driver->name but the device won't get its ->driver pointer
> assigned until *after* a successful probe. Lets review the PCI probe:
No, as I later corrected myself, that is pci_dev->driver, but what
mac80211 is using is pci_dev->dev.driver. Which actually happens to be
set up fine. (I verified this by inserting a printk into the ath5k
driver and it all works fine.)
> Yeah don't bother, the issue on this e-mail was fixed, another issue
> has come up though so that is separate.
I think the second issue is just the first issue, but the fix papered
over the first bug.
--
Bob Copeland %% http://www.bobcopeland.com
On Sun, Nov 16, 2008 at 10:05 AM, Johannes Berg
<[email protected]> wrote:
> On Sun, 2008-11-16 at 12:01 -0600, Dan McGee wrote:
>
>> I had not seen this cfg80211 stuff before when modprobing. I also see
>> that cfg80211 is now loaded as a module- was the fact that it was
>> built into the kernel before causing all these problems? That would be
>> my strong suspicion, in which case this entire thread is really a bum
>> bug report, except that compat-wireless should enforce that cfg80211
>> is a module just like it currently enforces that for mac80211, saving
>> someone else this headache.
>
> Let me guess, symbol versioning was off?
/me adds complaint on compat-wireless if cfg80211 is built in
On Fri, Nov 14, 2008 at 9:49 AM, Bob Copeland <[email protected]> wrote:
> On Fri, Nov 14, 2008 at 12:37 PM, Luis R. Rodriguez <[email protected]> wrote:
>> Anyway we do set the parent before calling ieee80211_register_hw() by
>> using SET_IEEE80211_DEV(hw, &pdev->dev) (oh ok here is the parent). so
>> when we try to get the name with parent->driver I am not sure if
>> driver has been set yet because ath5k_pci_probe() hasn't finished yet
>> as probe called ath5k_hw_attach(). When does ->driver get set and why
>> would it fail only for ath5k?
>
> Ahh, I believe that would be in pci-driver.c __pci_device_probe, after driver
> probe is run successfully. Yes, we do ieee80211_register_hw from within
> probe... So, how did this ever work again? /me scratches head.
Wow.. ok, yeah I see that, this is a mac8021 bug then unless mac80211
wants us to not callieee80211_register_hw() from probe or its children
which would be weird.
So we need to fix that.
Luis
On Sat, Nov 15, 2008 at 10:05 PM, Luis R. Rodriguez <[email protected]> wrote:
> On Sat, Nov 15, 2008 at 9:53 PM, Bob Copeland <[email protected]> wrote:
>> On Sat, Nov 15, 2008 at 09:48:13PM -0800, Luis R. Rodriguez wrote:
>>> Not sure why its 00000001, nor do I know if its poison. One thing I am
>>> fairly positive about is that the reason why this was wrong all along
>>> was because we were trying to get the device's ->driver structure to
>>> get driver->name but the device won't get its ->driver pointer
>>> assigned until *after* a successful probe. Lets review the PCI probe:
>>
>> No, as I later corrected myself, that is pci_dev->driver, but what
>> mac80211 is using is pci_dev->dev.driver.
>
> Right, the dev for the pci_dev which hasn't yet had a driver assigned
> yet, no? Who assigns the pci_dev's dev's driver pointer and when? Or
> am I still missing something?
Oh nevermind yeah pci_dev->driver = drv; but still who then assigns
the dev's driver?
Luis
On Fri, Nov 14, 2008 at 8:29 PM, Bob Copeland <[email protected]> wrote:
> On Fri, Nov 14, 2008 at 08:05:09PM -0600, Dan McGee wrote:
>> On Fri, Nov 14, 2008 at 6:36 PM, Luis R. Rodriguez
>> Of course, now that I have a kernel with full debug symbols, I can't
>> get it to oops. Figures. I'll keep you updated if I see this error
>> again.
>
> Grasping at straws, is there any chance your kernel didn't match up
> with the compiled modules from compat-wireless? For example, you
> reconfigured the kernel without rebuilding it, then compat-wireless
> picked up your .config? I suppose that could lead to structures being
> defined with wrong offsets...
I doubt it; I'm compiling the whole enchilada in one go so it should
definitely match up. I just reproduced it with a brand new build after
going back to a no-symbols kernel. Here is a transcribed OOPS dump
since this one leaves the machine in a state where it doesn't even
finish booting up. I can *only* get this if I compile without
debugging symbols, and looking at the call trace, you can see the
apic_timer_interrupt call which leads me to believe it is some sort of
timing/race issue that a debug kernel doesn't run into.
-Dan
BUG: unable to handle kernel NULL pointer dereference at 00000082
IP: [<7818ca71>] sysfs_find_dirent+0x9/0x23
Oops: 0000 [#1] PREEMPT
Modules linked in: ath5k(+) mac80211
Pid: 818 comm: modprobe Not tainted (2.6.27.6eee #1)
EIP: 0060:[<7818ca71>] EFLAGS: 00010206 CPU: 0
EIP is at sysfs_find_dirent+0x9/0x23
EAX: 00000001 EBX: 00000072 ECX: 00000001 EDX: b730b4f0
ESI: b730b4f0 EDI: fffffff4 EBP: b7311490 ESP: b73ffd34
DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068
Stack: <<<i'm too lazy>>>>
Call Trace:
[<7818cb39>]: __sysfs_add_one+0x10/0x69
[<7818cba2>]: sysfs_add_one+0x9/0x2f
[<7818d0bc>]: create_dir+0x3c/0x62
sysfs_create_dir+0x29/0x3b
kobject_get
kobject_add_internal
kobject_add
device_add+0x75/0x456
apic_timer_interrupt+0x28/0x30
strlcpy+0x11/0x3e
register_netdevice+0x1f1/0x31a
ieee80211_register_hw+0x1eb/0x2d0 [mac80211]
ath5k_pci_probe+0xc27/0x1150 [ath5k]
find_inode
pci_device_probe
driver_probe_device
.......
init_ath5k_pci [ath5k]
.......
==============
Code: dd 85 c0 53 89 c3 74 16 83 38 00 75 0f ba 82 00 00 00 b8 f7 75
3c 78 e8 70 da f8 ff ff 03 89 d8 5b c3 56 89 d6 53 8b 58 18 eb 11 <8b>
43 10 89 f2 e8 34 cc 04 00 85 c0 74 07 8b 5b 0c 85 db 75 eb
$ AFLAGS=--32 scripts/decodecode < /tmp/oops.txt
Code: dd 85 c0 53 89 c3 74 16 83 38 00 75 0f ba 82 00 00 00 b8 f7 75
3c 78 e8 70 da f8 ff ff 03 89 d8 5b c3 56 89 d6 53 8b 58 18 eb 11 <8b>
43 10 89 f2 e8 34 cc 04 00 85 c0 74 07 8b 5b 0c 85 db 75 eb
/tmp/tmp.xJNdgiQwSL.o: file format elf32-i386
Disassembly of section .text:
00000000 <.text>:
0: dd 85 c0 53 89 c3 fldl -0x3c76ac40(%ebp)
6: 74 16 je 0x1e
8: 83 38 00 cmpl $0x0,(%eax)
b: 75 0f jne 0x1c
d: ba 82 00 00 00 mov $0x82,%edx
12: b8 f7 75 3c 78 mov $0x783c75f7,%eax
17: e8 70 da f8 ff call 0xfff8da8c
1c: ff 03 incl (%ebx)
1e: 89 d8 mov %ebx,%eax
20: 5b pop %ebx
21: c3 ret
22: 56 push %esi
23: 89 d6 mov %edx,%esi
25: 53 push %ebx
26: 8b 58 18 mov 0x18(%eax),%ebx
29: eb 11 jmp 0x3c
/tmp/tmp.xJNdgiQwSL.o: file format elf32-i386
Disassembly of section .text:
00000000 <.text>:
0: 8b 43 10 mov 0x10(%ebx),%eax
3: 89 f2 mov %esi,%edx
5: e8 34 cc 04 00 call 0x4cc3e
a: 85 c0 test %eax,%eax
c: 74 07 je 0x15
e: 8b 5b 0c mov 0xc(%ebx),%ebx
11: 85 db test %ebx,%ebx
13: 75 eb jne 0x0
And the objdump output from that function, but my novice eyes can't
pick out where the above code corresponds to:
0000027d <__sysfs_add_one>:
27d: 56 push %esi
27e: 89 c6 mov %eax,%esi
280: 53 push %ebx
281: 89 d3 mov %edx,%ebx
283: 8b 00 mov (%eax),%eax
285: 8b 52 10 mov 0x10(%edx),%edx
288: e8 fc ff ff ff call 289 <__sysfs_add_one+0xc>
28d: ba ef ff ff ff mov $0xffffffef,%edx
292: 85 c0 test %eax,%eax
294: 75 4b jne 2e1 <__sysfs_add_one+0x64>
296: 8b 06 mov (%esi),%eax
298: e8 fe fe ff ff call 19b <sysfs_get>
29d: 80 7b 1c 01 cmpb $0x1,0x1c(%ebx)
2a1: 89 43 08 mov %eax,0x8(%ebx)
2a4: 75 0a jne 2b0 <__sysfs_add_one+0x33>
2a6: 8b 46 04 mov 0x4(%esi),%eax
2a9: 85 c0 test %eax,%eax
2ab: 74 03 je 2b0 <__sysfs_add_one+0x33>
2ad: ff 40 28 incl 0x28(%eax)
2b0: ff 46 0c incl 0xc(%esi)
2b3: 83 7b 0c 00 cmpl $0x0,0xc(%ebx)
2b7: 8b 43 08 mov 0x8(%ebx),%eax
2ba: 74 04 je 2c0 <__sysfs_add_one+0x43>
2bc: 0f 0b ud2a
2be: eb fe jmp 2be <__sysfs_add_one+0x41>
2c0: 8d 48 18 lea 0x18(%eax),%ecx
2c3: 8b 50 18 mov 0x18(%eax),%edx
2c6: eb 0e jmp 2d6 <__sysfs_add_one+0x59>
2c8: 8b 43 20 mov 0x20(%ebx),%eax
2cb: 3b 42 20 cmp 0x20(%edx),%eax
2ce: 72 0a jb 2da <__sysfs_add_one+0x5d>
2d0: 8d 4a 0c lea 0xc(%edx),%ecx
2d3: 8b 52 0c mov 0xc(%edx),%edx
2d6: 85 d2 test %edx,%edx
2d8: 75 ee jne 2c8 <__sysfs_add_one+0x4b>
2da: 89 53 0c mov %edx,0xc(%ebx)
2dd: 31 d2 xor %edx,%edx
2df: 89 19 mov %ebx,(%ecx)
2e1: 5b pop %ebx
2e2: 89 d0 mov %edx,%eax
2e4: 5e pop %esi
2e5: c3 ret
On Fri, Nov 14, 2008 at 12:33 PM, Bob Copeland <[email protected]> wrote:
> On Fri, Nov 14, 2008 at 2:33 PM, Dan McGee <[email protected]> wrote:
>> On Fri, Nov 14, 2008 at 12:37 PM, Luis R. Rodriguez <[email protected]> wrote:
>>> On Fri, Nov 14, 2008 at 10:33 AM, Sujith <[email protected]> wrote:
>>>> Luis R. Rodriguez wrote:
>>>>> > From the original log:
>>>>> > ath5k_pci 0000:01:00.0: registered as ''
>>>>>
>>>>> That comes from
>>>>>
>>>>> dev_info(&pdev->dev, "registered as '%s'\n", wiphy_name(hw->wiphy));
>>>>>
>>>>> which means that may be buggy too as wiphy_name() gets
>>>>>
>>>>> return wiphy->dev.bus_id;
>>>>>
>>>>> and we only have set at that point wiphy->dev.parent IIRC.
>>>>>
>>>>
>>>> Nope, parent is set after that call in SET_IEEE80211_DEV()
>>>> phy[x] gets assigned in alloc_hw().
>>>>
>>>> So why is it NULL even if alloc_hw() succeeds ?
>>>
>>> Parent is wiphy->dev.parent not wiphy->dev though
>
> Sujith is right, wiphy->dev.bus_id should be set by the time that's printed,
> since wiphy_new in alloc_hw fills it in.
Ah, I see, yes, oh man. Well this is good :)
> Curiouser and curiouser. I look
> forward to this being an obvious and embarrassing bug :)
I sure hope so.
>> So I'm at work for another 4 or 5 hours, but I should have a chance to
>> hack at anything tonight you guys want me to. Let me know if the
>> earlier suggestions in this thread still apply.
>
> I think the printks will still be useful, if only to verify we're on
> the right track. Which compat-wireless version are you using?
>
> BTW from kerneloops, several people are having this with other drivers than
> ath5k.
Yeah I ran into it with iwlagn a few days ago but never found the root
cause, it was sporadic.
Luis
On Fri, Nov 14, 2008 at 06:25:53PM -0600, Dan McGee wrote:
> Can someone point me to the "other thread"? I don't think I was CCed
> on it since I came late to the party.
>
> Someone also inquired about versions. I'm using stable kernel 2.6.27.6
> and compat-wireless-2008-11-13 (10-28 was doing the same thing
> however).
The thread is over there:
http://marc.info/?l=linux-wireless&m=122670266931930&w=2
--
Bob Copeland %% http://www.bobcopeland.com
On Fri, Nov 14, 2008 at 12:41 PM, Luis R. Rodriguez <[email protected]> wrote:
> On Fri, Nov 14, 2008 at 12:33 PM, Bob Copeland <[email protected]> wrote:
>> On Fri, Nov 14, 2008 at 2:33 PM, Dan McGee <[email protected]> wrote:
>>> On Fri, Nov 14, 2008 at 12:37 PM, Luis R. Rodriguez <[email protected]> wrote:
>>>> On Fri, Nov 14, 2008 at 10:33 AM, Sujith <[email protected]> wrote:
>>>>> Luis R. Rodriguez wrote:
>>>>>> > From the original log:
>>>>>> > ath5k_pci 0000:01:00.0: registered as ''
>>>>>>
>>>>>> That comes from
>>>>>>
>>>>>> dev_info(&pdev->dev, "registered as '%s'\n", wiphy_name(hw->wiphy));
>>>>>>
>>>>>> which means that may be buggy too as wiphy_name() gets
>>>>>>
>>>>>> return wiphy->dev.bus_id;
>>>>>>
>>>>>> and we only have set at that point wiphy->dev.parent IIRC.
>>>>>>
>>>>>
>>>>> Nope, parent is set after that call in SET_IEEE80211_DEV()
>>>>> phy[x] gets assigned in alloc_hw().
>>>>>
>>>>> So why is it NULL even if alloc_hw() succeeds ?
>>>>
>>>> Parent is wiphy->dev.parent not wiphy->dev though
>>
>> Sujith is right, wiphy->dev.bus_id should be set by the time that's printed,
>> since wiphy_new in alloc_hw fills it in.
>
> Ah, I see, yes, oh man. Well this is good :)
OK then try the patch in the other thread. If that doesn't fix it then
it has to be another issue.
Luis
On Thu, Nov 13, 2008 at 11:46:06PM -0600, Dan McGee wrote:
>
> GDB disassembly:
>
> 0x000002cc <ieee80211_register_hw+357>: sbb %eax,%eax
> 0x000002ce <ieee80211_register_hw+359>: and $0x100,%edx
> 0x000002d4 <ieee80211_register_hw+365>: and $0xf,%al
> 0x000002d6 <ieee80211_register_hw+367>: inc %al
> ^^^ HERE ^^^
I don't think this disassembly is right. inc %al is not a very likely
faulting instruction, especially if we just did a successful AND on the
register. Luis' suggestion to use gdb to directly list the C code is
good; here's another way:
Run scripts/decodecode on the "Code: ...." portion (output below).
There you can see that the faulting instruction is actually
"mov (%eax), %eax" which makes sense because %eax in the register dump
is zero, so it's clearly a null pointer dereference. We're
dereferencing some pointer at offset 0xbc in some structure.
Now you can do objdump -S net/mac80211/main.o to see mixed C and
assembly, and look for a segment of code that matches your disassembly.
Sometimes that can be a bit confusing with inlines and code scheduling,
but that's the nature of the beast.
(I haven't had a chance to look at the actual code yet.)
Disassembly of section .text:
00000000 <.text>:
0: 83 c8 00 or $0x0,%eax
3: 00 00 add %al,(%eax)
5: 21 03 and %eax,(%ebx)
7: c7 83 b4 00 00 00 1c movl $0xb80d491c,0xb4(%ebx)
e: 49 0d b8
11: c7 83 0c 02 00 00 ee movl $0xb80c17ee,0x20c(%ebx)
18: 17 0c b8
1b: 8b 46 1c mov 0x1c(%esi),%eax
1e: 8b 40 7c mov 0x7c(%eax),%eax
21: 8b 80 bc 00 00 00 mov 0xbc(%eax),%eax
27: 6a 00 push $0x0
29: 6a 00 push $0x0
0: 8b 00 mov (%eax),%eax <--- HERE
2: e8 75 64 06 c0 call 0xc006647c
7: 5f pop %edi
8: bf f4 ff ff ff mov $0xfffffff4,%edi
d: 85 c0 test %eax,%eax
f: 89 46 20 mov %eax,0x20(%esi)
12: 5a pop %edx
13: 0f .byte 0xf
14: 84 .byte 0x84
--
Bob Copeland %% http://www.bobcopeland.com
On Fri, Nov 14, 2008 at 08:05:09PM -0600, Dan McGee wrote:
> On Fri, Nov 14, 2008 at 6:36 PM, Luis R. Rodriguez
> Of course, now that I have a kernel with full debug symbols, I can't
> get it to oops. Figures. I'll keep you updated if I see this error
> again.
Grasping at straws, is there any chance your kernel didn't match up
with the compiled modules from compat-wireless? For example, you
reconfigured the kernel without rebuilding it, then compat-wireless
picked up your .config? I suppose that could lead to structures being
defined with wrong offsets...
--
Bob Copeland %% http://www.bobcopeland.com
On Fri, Nov 14, 2008 at 9:02 AM, Bob Copeland <[email protected]> wrote:
> On Fri, Nov 14, 2008 at 1:17 AM, Luis R. Rodriguez <[email protected]> wrote:
>> If our offsets are the same then its probably on line 791:
> [...]
>> 790 name = wiphy_dev(local->hw.wiphy)->driver->name;
>> 791 local->hw.workqueue = create_freezeable_workqueue(name);
>
> I agree, having looked at the objdump output. Hmm, maybe ->driver pointer
> is bad even though I can't see that happening.
Yeah I tried digging through the work queue logic thing and don't see
why name would be NULL. I believe the name comes from the driver's
stack memory when it sets up the pci device driver structure. Only
puzzling thing for me is wiphy_dev() gets the wiphy->dev.parent, that
still hasn't sunken in yet for me, and how that gets you
->driver->name or how this can possibly be NULL.
Anyway we do set the parent before calling ieee80211_register_hw() by
using SET_IEEE80211_DEV(hw, &pdev->dev) (oh ok here is the parent). so
when we try to get the name with parent->driver I am not sure if
driver has been set yet because ath5k_pci_probe() hasn't finished yet
as probe called ath5k_hw_attach(). When does ->driver get set and why
would it fail only for ath5k?
Luis
On Fri, Nov 14, 2008 at 3:13 PM, Luis R. Rodriguez <[email protected]> wrote:
> On Fri, Nov 14, 2008 at 12:41 PM, Luis R. Rodriguez <[email protected]> wrote:
>> On Fri, Nov 14, 2008 at 12:33 PM, Bob Copeland <[email protected]> wrote:
>>> On Fri, Nov 14, 2008 at 2:33 PM, Dan McGee <[email protected]> wrote:
>>>> On Fri, Nov 14, 2008 at 12:37 PM, Luis R. Rodriguez <[email protected]> wrote:
>>>>> On Fri, Nov 14, 2008 at 10:33 AM, Sujith <[email protected]> wrote:
>>>>>> Luis R. Rodriguez wrote:
>>>>>>> > From the original log:
>>>>>>> > ath5k_pci 0000:01:00.0: registered as ''
>>>>>>>
>>>>>>> That comes from
>>>>>>>
>>>>>>> dev_info(&pdev->dev, "registered as '%s'\n", wiphy_name(hw->wiphy));
>>>>>>>
>>>>>>> which means that may be buggy too as wiphy_name() gets
>>>>>>>
>>>>>>> return wiphy->dev.bus_id;
>>>>>>>
>>>>>>> and we only have set at that point wiphy->dev.parent IIRC.
>>>>>>>
>>>>>>
>>>>>> Nope, parent is set after that call in SET_IEEE80211_DEV()
>>>>>> phy[x] gets assigned in alloc_hw().
>>>>>>
>>>>>> So why is it NULL even if alloc_hw() succeeds ?
>>>>>
>>>>> Parent is wiphy->dev.parent not wiphy->dev though
>>>
>>> Sujith is right, wiphy->dev.bus_id should be set by the time that's printed,
>>> since wiphy_new in alloc_hw fills it in.
>>
>> Ah, I see, yes, oh man. Well this is good :)
>
> OK then try the patch in the other thread. If that doesn't fix it then
> it has to be another issue.
Can someone point me to the "other thread"? I don't think I was CCed
on it since I came late to the party.
Someone also inquired about versions. I'm using stable kernel 2.6.27.6
and compat-wireless-2008-11-13 (10-28 was doing the same thing
however).
-Dan
On Sat, Nov 15, 2008 at 10:06 PM, Luis R. Rodriguez <[email protected]> wrote:
> On Sat, Nov 15, 2008 at 10:05 PM, Luis R. Rodriguez <[email protected]> wrote:
>> On Sat, Nov 15, 2008 at 9:53 PM, Bob Copeland <[email protected]> wrote:
>>> On Sat, Nov 15, 2008 at 09:48:13PM -0800, Luis R. Rodriguez wrote:
>>>> Not sure why its 00000001, nor do I know if its poison. One thing I am
>>>> fairly positive about is that the reason why this was wrong all along
>>>> was because we were trying to get the device's ->driver structure to
>>>> get driver->name but the device won't get its ->driver pointer
>>>> assigned until *after* a successful probe. Lets review the PCI probe:
>>>
>>> No, as I later corrected myself, that is pci_dev->driver, but what
>>> mac80211 is using is pci_dev->dev.driver.
>>
>> Right, the dev for the pci_dev which hasn't yet had a driver assigned
>> yet, no? Who assigns the pci_dev's dev's driver pointer and when? Or
>> am I still missing something?
>
> Oh nevermind yeah pci_dev->driver = drv; but still who then assigns
> the dev's driver?
The answer is drivers/base/dd.c really_probe(). Curious enough guess
what, dev->driver = drv; is assigned *before* the device driver probe
:) (bus probe in this case first, so dev->bus->probe) contrary to the
other way around. So you're absolutely right the patch is not
necessary then. Good catch.
Luis
On Sat, Nov 15, 2008 at 9:53 PM, Bob Copeland <[email protected]> wrote:
> On Sat, Nov 15, 2008 at 09:48:13PM -0800, Luis R. Rodriguez wrote:
>> Not sure why its 00000001, nor do I know if its poison. One thing I am
>> fairly positive about is that the reason why this was wrong all along
>> was because we were trying to get the device's ->driver structure to
>> get driver->name but the device won't get its ->driver pointer
>> assigned until *after* a successful probe. Lets review the PCI probe:
>
> No, as I later corrected myself, that is pci_dev->driver, but what
> mac80211 is using is pci_dev->dev.driver.
Right, the dev for the pci_dev which hasn't yet had a driver assigned
yet, no? Who assigns the pci_dev's dev's driver pointer and when? Or
am I still missing something?
Luis
On Fri, Nov 14, 2008 at 12:37 PM, Luis R. Rodriguez <[email protected]> wrote:
> On Fri, Nov 14, 2008 at 10:33 AM, Sujith <[email protected]> wrote:
>> Luis R. Rodriguez wrote:
>>> > From the original log:
>>> > ath5k_pci 0000:01:00.0: registered as ''
>>>
>>> That comes from
>>>
>>> dev_info(&pdev->dev, "registered as '%s'\n", wiphy_name(hw->wiphy));
>>>
>>> which means that may be buggy too as wiphy_name() gets
>>>
>>> return wiphy->dev.bus_id;
>>>
>>> and we only have set at that point wiphy->dev.parent IIRC.
>>>
>>
>> Nope, parent is set after that call in SET_IEEE80211_DEV()
>> phy[x] gets assigned in alloc_hw().
>>
>> So why is it NULL even if alloc_hw() succeeds ?
>
> Parent is wiphy->dev.parent not wiphy->dev though
So I'm at work for another 4 or 5 hours, but I should have a chance to
hack at anything tonight you guys want me to. Let me know if the
earlier suggestions in this thread still apply.
-Dan
On Sun, Nov 16, 2008 at 10:38:39AM -0600, Dan McGee wrote:
> On Sun, Nov 16, 2008 at 10:20 AM, Bob Copeland <[email protected]> wrote:
> > As you can see, the modules think "phy0" should be at offset 164, but
> > it's really at 160.
>
> > Also I noticed the original config had CONFIG_DYNAMIC_FTRACE=y, unfortunately
> > turning that off didn't seem to change anything (hmm, I didn't get a build
> > bug, was the breakage fixed in 2.6.27.y?)
>
> Note that HAVE is set if the architecture supports it; CONFIG_FTRACE
> itself was turned off (and CONFIG_DYNAMIC_FTRACE is not even present):
> $ grep FTRACE .config
> CONFIG_HAVE_FTRACE=y
> CONFIG_HAVE_DYNAMIC_FTRACE=y
> # CONFIG_FTRACE is not set
Oh, you are right. Thanks. I probably won't be able to do any more
testing today but I would suggest trying to turn off wireless-related
options in the main kernel config and see if you can get it working
(start with cfg80211 and its dependencies...)
--
Bob Copeland %% http://www.bobcopeland.com
On Sat, Nov 15, 2008 at 6:38 PM, Dan McGee <[email protected]> wrote:
> On Sat, Nov 15, 2008 at 8:11 PM, Dan McGee <[email protected]> wrote:
>> On Fri, Nov 14, 2008 at 11:02 AM, Bob Copeland <[email protected]> wrote:
>>> On Fri, Nov 14, 2008 at 1:17 AM, Luis R. Rodriguez <[email protected]> wrote:
>>>> If our offsets are the same then its probably on line 791:
>>> [...]
>>>> 790 name = wiphy_dev(local->hw.wiphy)->driver->name;
>>>> 791 local->hw.workqueue = create_freezeable_workqueue(name);
>>>
>>> I agree, having looked at the objdump output. Hmm, maybe ->driver pointer
>>> is bad even though I can't see that happening. Dan, can you try adding a
>>> printk before line 790 to see if any of the pointers are null?
>>
>> So I went back and added a few things to the original unpatched code
>> to see what was NULL pointering, just to be sure we were thinking
>> right. Here is the relevant code:
>> printk(KERN_DEBUG "wiphy_dev() : %p\n", wiphy_dev(local->hw.wiphy));
>> printk(KERN_DEBUG "driver : %p\n",
>> wiphy_dev(local->hw.wiphy)->driver);
>> printk(KERN_DEBUG "driver->name: %p\n",
>> wiphy_dev(local->hw.wiphy)->driver->name);
>> name = wiphy_dev(local->hw.wiphy)->driver->name;
>> local->hw.workqueue = create_freezeable_workqueue(name);
>>
>> And the dmesg output:
>> ath5k_pci xxx: registered as ''
>> wiphy_dev() : b730b408
>> driver : 00000001
>> BUG: unalbe to handle kernel NULL pointer dereference at 00000001
>>
>> So we bugged out on trying to print driver->name, which is the same
>> problem we would have hit in the 'name =' line.
>
> I should clarify here- the real bug was when trying to access
> '->driver', as we got the 00000001 poison pointer returned (this is a
> poison value, right?).
Not sure why its 00000001, nor do I know if its poison. One thing I am
fairly positive about is that the reason why this was wrong all along
was because we were trying to get the device's ->driver structure to
get driver->name but the device won't get its ->driver pointer
assigned until *after* a successful probe. Lets review the PCI probe:
/**
* __pci_device_probe()
* @drv: driver to call to check if it wants the PCI device
* @pci_dev: PCI device being probed
*
* returns 0 on success, else error.
* side-effect: pci_dev->driver is set to drv when drv claims pci_dev.
*/
static int
__pci_device_probe(struct pci_driver *drv, struct pci_dev *pci_dev)
{
const struct pci_device_id *id;
int error = 0;
if (!pci_dev->driver && drv->probe) {
error = -ENODEV;
id = pci_match_device(drv, pci_dev);
if (id)
error = pci_call_probe(drv, pci_dev, id);
if (error >= 0) {
pci_dev->driver = drv;
error = 0;
}
}
return error;
}
So unless probe was successful (pci_call_probe which calls
drv->probe()) we don't update pci_dev->driver pointer.
> The above sequence of events was what took place when trying to load
> the module on startup. To see if other things had an effect, I
> disabled module autoloading during the boot sequence and got slightly
> different results but it looks to be the same type of problem:
>
> registered as ''
> wiphy_dev: b730d740
> driver: 7fffffff
> driver->name: ffffffff
> BUG: unable to handle kernel paging request at ffffffff
>
> One more note- booting with the 2.6.27.6 shipped wireless modules
> (mac80211 and ath5k) has always been working fine. It is only when I
> try to run compat-wireless on top of this kernel that we are seeing
> issues.
This is interesting, but then again the fact that it was working *all
along* for other devices is interesting too as it shouldn't have.
> Theoretically that means this should be bisectable if we
> really can't figure it out, but I'm not sure how practical that is.
Yeah don't bother, the issue on this e-mail was fixed, another issue
has come up though so that is separate.
Luis
On Thu, Nov 13, 2008 at 9:46 PM, Dan McGee <[email protected]> wrote:
> Also reported here a few times, I've been seeing this bug every time I
> try to load up the ath5k module in my kernel:
> http://www.kerneloops.org/search.php?search=ieee80211_register_hw&btnG=Function+Search
>
> If you guys have any suggestions, I'd love to hear them. I
> disassembled the code in question but am not very good with these
> things. It looks to be somewhere between lines 804-825 in
> net/mac80211/main.c.
>
> -Dan
>
> ath5k_pci 0000:01:00.0: PCI INT A -> GSI 18 (level, low) -> IRQ 18
> ath5k_pci 0000:01:00.0: setting latency timer to 64
> ath5k_pci 0000:01:00.0: registered as ''
> BUG: unable to handle kernel NULL pointer dereference at 00000000
> IP: [<b80c1252>] :mac80211:ieee80211_register_hw+0x10f/0x2d6
> *pde = 00000000
> Oops: 0000 [#1] PREEMPT
> Modules linked in: ath5k(+) mac80211
>
> Pid: 818, comm: modprobe Not tainted (2.6.27.6eee #3)
> EIP: 0060:[<b80c1252>] EFLAGS: 00010286 CPU: 0
> EIP is at ieee80211_register_hw+0x10f/0x2d6 [mac80211]
I don't know assembly but if you compile with debugging symbols you
can pin point the line at which your EIP points to with gdb.
Run
gdb /lib/modules/`uname -r`/kernel/net/mac80211.ko
Then
l *(ieee80211_register_hw+0x10f)
If our offsets are the same then its probably on line 791:
(gdb) l *(ieee80211_register_hw+0x10f)
0x2df is in ieee80211_register_hw (net/mac80211/main.c:791).
786 mdev->type = ARPHRD_IEEE80211;
787 mdev->header_ops = &ieee80211_header_ops;
788 mdev->set_multicast_list = ieee80211_master_set_multicast_list;
789
790 name = wiphy_dev(local->hw.wiphy)->driver->name;
791 local->hw.workqueue = create_freezeable_workqueue(name);
792 if (!local->hw.workqueue) {
793 result = -ENOMEM;
794 goto fail_workqueue;
795 }
I have run into this before but on a very rare occasion and it was
with iwlagn, and I wasn't able to find the culprit. Do you run into
this all the time?
Luis
On Fri, Nov 14, 2008 at 04:25:53PM -0800, Dan McGee wrote:
> On Fri, Nov 14, 2008 at 3:13 PM, Luis R. Rodriguez <[email protected]> wrote:
> > On Fri, Nov 14, 2008 at 12:41 PM, Luis R. Rodriguez <[email protected]> wrote:
> >> On Fri, Nov 14, 2008 at 12:33 PM, Bob Copeland <[email protected]> wrote:
> >>> On Fri, Nov 14, 2008 at 2:33 PM, Dan McGee <[email protected]> wrote:
> >>>> On Fri, Nov 14, 2008 at 12:37 PM, Luis R. Rodriguez <[email protected]> wrote:
> >>>>> On Fri, Nov 14, 2008 at 10:33 AM, Sujith <[email protected]> wrote:
> >>>>>> Luis R. Rodriguez wrote:
> >>>>>>> > From the original log:
> >>>>>>> > ath5k_pci 0000:01:00.0: registered as ''
> >>>>>>>
> >>>>>>> That comes from
> >>>>>>>
> >>>>>>> dev_info(&pdev->dev, "registered as '%s'\n", wiphy_name(hw->wiphy));
> >>>>>>>
> >>>>>>> which means that may be buggy too as wiphy_name() gets
> >>>>>>>
> >>>>>>> return wiphy->dev.bus_id;
> >>>>>>>
> >>>>>>> and we only have set at that point wiphy->dev.parent IIRC.
> >>>>>>>
> >>>>>>
> >>>>>> Nope, parent is set after that call in SET_IEEE80211_DEV()
> >>>>>> phy[x] gets assigned in alloc_hw().
> >>>>>>
> >>>>>> So why is it NULL even if alloc_hw() succeeds ?
> >>>>>
> >>>>> Parent is wiphy->dev.parent not wiphy->dev though
> >>>
> >>> Sujith is right, wiphy->dev.bus_id should be set by the time that's printed,
> >>> since wiphy_new in alloc_hw fills it in.
> >>
> >> Ah, I see, yes, oh man. Well this is good :)
> >
> > OK then try the patch in the other thread. If that doesn't fix it then
> > it has to be another issue.
>
> Can someone point me to the "other thread"? I don't think I was CCed
> on it since I came late to the party.
>
> Someone also inquired about versions. I'm using stable kernel 2.6.27.6
> and compat-wireless-2008-11-13 (10-28 was doing the same thing
> however).
Bounced you a copy. Please give it a try.
Luis