2003-05-29 12:55:02

by William Lee Irwin III

[permalink] [raw]
Subject: list_head debugging patch

This appears to get the kernel to crap its pants in very, very short
order. Given the number of things going wrong, I almost wonder if I
did something wrong. Things get real ugly, really, really fast.

Some review could be helpful, especially since some hard problems come
up right from the start and I can't get a clean boot with this.

Enjoy.


-- wli

Some oopses with the patch applied (virgin 2.5.70):

Unable to handle kernel paging request at virtual address afafafaf
printing eip:
c01796d6
*pde = 00000000
Oops: 0002 [#1]
CPU: 0
EIP: 0060:[<c01796d6>] Not tainted
EFLAGS: 00010282
EIP is at dput+0x5a6/0x5e0
eax: 9e9e9e9e ebx: c019e7f0 ecx: c16e7dbc edx: afafafaf
esi: c16e7d4c edi: 0000000f ebp: cf851ef8 esp: cf851ed8
ds: 007b es: 007b ss: 0068
Process rcS (pid: 51, threadinfo=cf850000 task=cfec8080)
Stack: c16e7d4c c0422ad0 c16e7d4c c16e7d4c cf851ef8 cf850000 cfec8c80 0000000f
cf851f20 c01218a8 c16e7d4c cfec8624 cf99a3e8 cf9e106c c16e7d4c cfec8c80
bfffee1c 00000000 cf851f4c c01236b3 cfec8c80 00040001 00030002 00000000
Call Trace:
[<c01218a8>] release_task+0x278/0x300
[<c01236b3>] wait_task_zombie+0x143/0x1a0
[<c0123c4a>] sys_wait4+0x21a/0x270
[<c011bb20>] default_wake_function+0x0/0x30
[<c012c980>] sys_rt_sigprocmask+0x90/0x120
[<c011bb20>] default_wake_function+0x0/0x30
[<c010b3ab>] syscall_call+0x7/0xb

Code: 89 02 74 03 89 50 04 c7 46 70 9e 9e 9e 9e c7 41 04 af af af

elem = c03a77b4, elem->prev = c0420d80, elem->prev->next = c03a73b4
------------[ cut here ]------------
kernel BUG at include/linux/list.h:39!
invalid operand: 0000 [#1]
CPU: 0
EIP: 0060:[<c0279bbc>] Not tainted
EFLAGS: 00010286
EIP is at ide_register_subdriver+0xdc/0x200
eax: 00000047 ebx: c03a77b4 ecx: 00000002 edx: c0363d98
esi: c14e6000 edi: c04213f4 ebp: c14e7ed4 esp: c14e7eb4
ds: 007b es: 007b ss: 0068
Process swapper (pid: 1, threadinfo=c14e6000 task=cfef5a80)
Stack: c0320c20 c03a77b4 c0420d80 c03a73b4 c0421628 c04213f4 c0421348 00000000
c14e7eec c027aa64 c04213f4 c03a7700 00000001 c14e6000 c14e7f10 c0279008
c04213f4 00000202 00000002 00000000 00000000 c0421348 00000000 c14e7f24
Call Trace:
[<c027aa64>] idedefault_attach+0x24/0x50
[<c0279008>] ata_attach+0xa8/0x190
[<c02716ab>] probe_hwif_init+0x7b/0x80
[<c0285d4a>] ide_setup_pci_device+0x7a/0x90
[<c026e89b>] piix_init_one+0x3b/0x50
[<c03f03bd>] ide_scan_pcidev+0x5d/0x70
[<c03f0408>] ide_scan_pcibus+0x38/0x120
[<c03f013e>] probe_for_hwifs+0x8e/0x90
[<c03ef769>] init_ide_data+0x39/0x50
[<c03f0148>] ide_init_builtin_drivers+0x8/0x20
[<c03f018f>] ide_init+0x2f/0x60
[<c03da85b>] do_initcalls+0x2b/0xa0
[<c012efd2>] init_workqueues+0x12/0x30
[<c01050a6>] init+0x36/0x1c0
[<c0105070>] init+0x0/0x1c0
[<c010917d>] kernel_thread_helper+0x5/0x18

Code: 0f 0b 27 00 72 f3 31 c0 8b 55 0c 8b 82 b4 00 00 00 8b 50 04


--- linux-2.5.70/include/linux/list.h 2003-05-26 18:00:41.000000000 -0700
+++ pgcl-2.5.70-2/include/linux/list.h 2003-05-29 05:32:43.000000000 -0700
@@ -30,6 +30,22 @@
(ptr)->next = (ptr); (ptr)->prev = (ptr); \
} while (0)

+static inline void __list_head_check(const struct list_head *elem)
+{
+ if (elem->prev->next != elem) {
+ printk(KERN_CRIT "elem = %p, elem->prev = %p, "
+ "elem->prev->next = %p\n",
+ elem, elem->prev, elem->prev->next);
+ BUG();
+ }
+ if (elem->next->prev != elem) {
+ printk(KERN_CRIT "elem = %p, elem->next = %p, "
+ "elem->next->prev = %p\n",
+ elem, elem->next, elem->next->prev);
+ BUG();
+ }
+}
+
/*
* Insert a new entry between two known consecutive entries.
*
@@ -56,6 +72,7 @@
*/
static inline void list_add(struct list_head *new, struct list_head *head)
{
+ __list_head_check(head);
__list_add(new, head, head->next);
}

@@ -69,6 +86,7 @@
*/
static inline void list_add_tail(struct list_head *new, struct list_head *head)
{
+ __list_head_check(head);
__list_add(new, head->prev, head);
}

@@ -136,7 +154,10 @@
*/
static inline void list_del(struct list_head *entry)
{
+ __list_head_check(entry);
__list_del(entry->prev, entry->next);
+ entry->prev = (void *)0x7c7c7c7c;
+ entry->next = (void *)0x8d8d8d8d;
}
/**
* list_del_rcu - deletes entry from list without re-initialization
@@ -156,6 +177,7 @@
*/
static inline void list_del_init(struct list_head *entry)
{
+ __list_head_check(entry);
__list_del(entry->prev, entry->next);
INIT_LIST_HEAD(entry);
}
@@ -167,6 +189,8 @@
*/
static inline void list_move(struct list_head *list, struct list_head *head)
{
+ __list_head_check(list);
+ __list_head_check(head);
__list_del(list->prev, list->next);
list_add(list, head);
}
@@ -179,6 +203,8 @@
static inline void list_move_tail(struct list_head *list,
struct list_head *head)
{
+ __list_head_check(list);
+ __list_head_check(head);
__list_del(list->prev, list->next);
list_add_tail(list, head);
}
@@ -189,6 +215,7 @@
*/
static inline int list_empty(struct list_head *head)
{
+ __list_head_check(head);
return head->next == head;
}

@@ -199,6 +226,9 @@
struct list_head *last = list->prev;
struct list_head *at = head->next;

+ __list_head_check(head);
+ __list_head_check(list);
+
first->prev = head;
head->next = first;

@@ -262,7 +292,11 @@
* or 1 entry) most of the time.
*/
#define __list_for_each(pos, head) \
- for (pos = (head)->next; pos != (head); pos = pos->next)
+ for (pos = (head)->next; \
+ pos != (head); \
+ __list_head_check(pos), \
+ __list_head_check(head), \
+ pos = pos->next)

/**
* list_for_each_prev - iterate over a list backwards
@@ -290,11 +324,13 @@
* @member: the name of the list_struct within the struct.
*/
#define list_for_each_entry(pos, head, member) \
- for (pos = list_entry((head)->next, typeof(*pos), member), \
- prefetch(pos->member.next); \
- &pos->member != (head); \
- pos = list_entry(pos->member.next, typeof(*pos), member), \
- prefetch(pos->member.next))
+ for (pos = list_entry((head)->next, typeof(*(pos)), member), \
+ prefetch((pos)->member.next); \
+ &(pos)->member != (head); \
+ pos = list_entry((pos)->member.next, typeof(*(pos)), member),\
+ __list_head_check(head), \
+ __list_head_check(&(pos)->member), \
+ prefetch((pos)->member.next))

/**
* list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
@@ -304,10 +340,10 @@
* @member: the name of the list_struct within the struct.
*/
#define list_for_each_entry_safe(pos, n, head, member) \
- for (pos = list_entry((head)->next, typeof(*pos), member), \
- n = list_entry(pos->member.next, typeof(*pos), member); \
- &pos->member != (head); \
- pos = n, n = list_entry(n->member.next, typeof(*n), member))
+ for (pos = list_entry((head)->next, typeof(*(pos)), member), \
+ n = list_entry((pos)->member.next, typeof(*(pos)), member);\
+ &(pos)->member != (head); \
+ pos = n, n = list_entry((n)->member.next, typeof(*(n)), member))

/**
* list_for_each_rcu - iterate over an rcu-protected list
@@ -401,9 +437,15 @@
{
if (n->pprev)
__hlist_del(n);
+ n->next = (void *)0x9e9e9e9e;
+ n->pprev = (void *)0xafafafaf;
}

-#define hlist_del_rcu hlist_del /* list_del_rcu is identical too? */
+static __inline__ void hlist_del_rcu(struct hlist_node *n)
+{
+ if (n->pprev)
+ __hlist_del(n);
+}

static __inline__ void hlist_del_init(struct hlist_node *n)
{


2003-05-29 19:10:01

by Morten Helgesen

[permalink] [raw]
Subject: Re: list_head debugging patch

Hey,

On Thursday 29 May 2003 15:08, William Lee Irwin III wrote:
> This appears to get the kernel to crap its pants in very, very
> short order. Given the number of things going wrong, I almost
> wonder if I did something wrong. Things get real ugly, really,
> really fast.
>

[snip]

I gave this a go - booted without problems. I did some
untarring/copying/deleting and didn't see anything unusual, but a
'dbench 8' died right away.

elem = c0509b18, elem->next = c166d9c4, elem->next->prev = c3f71ea8
kernel BUG at include/linux/list.h:45!
invalid operand: 0000 [#1]
CPU: 1
EIP: 0060:[<c02c5178>] Not tainted
EFLAGS: 00010096
EIP is at clear_queue_congested+0x78/0xb0
eax: 00000047 ebx: c0509b18 ecx: c04b8d24 edx: c043a8a0
esi: c0509b10 edi: c050db9c ebp: c3f75c00 esp: c3f75be8
ds: 007b es: 007b ss: 0068
Process pdflush (pid: 10, threadinfo=c3f74000 task=c3f73940)
Stack: c03d5f60 c0509b18 c166d9c4 c3f71ea8 c0661524 c0695c64 c3f75c20
c02c7c35
c050db9c 00000001 c0695c64 c04b9440 c050db9c c0661524 c3f75c68
c02c8210
c050db9c c0661524 c0695c64 00000200 00000000 014d98bc 00000008
00000008
Call Trace:
[<c02c7c35>] attempt_merge+0xc5/0x100
[<c02c8210>] __make_request+0x490/0x610
[<c02c84c7>] generic_make_request+0x137/0x1d0
[<c01704ba>] bio_alloc+0xda/0x1c0
[<c02c85b3>] submit_bio+0x53/0xa0
[<c016e1b8>] __block_write_full_page+0x228/0x420
[<c016f965>] block_write_full_page+0xd5/0xe0
[<c01b6390>] ext3_get_block+0x0/0xa0
[<c01b736a>] ext3_writepage+0x23a/0x440
[<c01b6390>] ext3_get_block+0x0/0xa0
[<c01b7110>] bget_one+0x0/0x10
[<c0197f8c>] mpage_writepages+0x32c/0x5f0
[<c01b7130>] ext3_writepage+0x0/0x440
[<c011ed02>] schedule+0x2f2/0x6f0
[<c014b627>] do_writepages+0x37/0x40
[<c0195600>] __sync_single_inode+0x2c0/0x8f0
[<c0196192>] sync_sb_inodes+0x312/0x7f0
[<c02c7b64>] blk_congestion_wait+0x94/0xa0
[<c01967f2>] writeback_inodes+0x182/0x2b0
[<c014b359>] background_writeout+0xa9/0xe0
[<c014bf57>] __pdflush+0x2f7/0x5f0
[<c011f136>] preempt_schedule+0x36/0x50
[<c011d870>] schedule_tail+0xc0/0xe0
[<c014c250>] pdflush+0x0/0x20
[<c014c261>] pdflush+0x11/0x20
[<c014b2b0>] background_writeout+0x0/0xe0
[<c01074e5>] kernel_thread_helper+0x5/0x10

Code: 0f 0b 2d 00 95 42 3d c0 8b 46 08 39 d8 75 09 83 c4 10 5b 5e


wli also requested dmesg, so here goes:

Linux version 2.5.70list.h (morten@marge) (gcc version 3.2.2) #2 SMP
Thu May 29 21:00:25 CEST 2003
Video mode to be used for restore is f00
BIOS-provided physical RAM map:
BIOS-e820: 0000000000000000 - 00000000000a0000 (usable)
BIOS-e820: 00000000000f0000 - 0000000000100000 (reserved)
BIOS-e820: 0000000000100000 - 0000000004000000 (usable)
BIOS-e820: 00000000fec00000 - 00000000fec01000 (reserved)
BIOS-e820: 00000000fee00000 - 00000000fee01000 (reserved)
BIOS-e820: 00000000ffff0000 - 0000000100000000 (reserved)
64MB LOWMEM available.
found SMP MP-table at 000f5810
hm, page 000f5000 reserved twice.
hm, page 000f6000 reserved twice.
hm, page 000f1000 reserved twice.
hm, page 000f2000 reserved twice.
On node 0 totalpages: 16384
DMA zone: 4096 pages, LIFO batch:1
Normal zone: 12288 pages, LIFO batch:3
HighMem zone: 0 pages, LIFO batch:1
ACPI: Unable to locate RSDP
Intel MultiProcessor Specification v1.1
Virtual Wire compatibility mode.
OEM ID: OEM00000 Product ID: PROD00000000 APIC at: 0xFEE00000
Processor #0 6:5 APIC version 17
Processor #1 6:5 APIC version 17
I/O APIC #2 Version 17 at 0xFEC00000.
Enabling APIC mode: Flat. Using 1 I/O APICs
Processors: 2
Building zonelist for node : 0
Kernel command line: BOOT_IMAGE=2.5.70-list.h ro root=302
console=ttyS0,115200 console=tty0
Initializing CPU#0
PID hash table entries: 512 (order 9: 4096 bytes)
Detected 342.640 MHz processor.
Console: colour VGA+ 80x25
Calibrating delay loop... 673.79 BogoMIPS
Memory: 60132k/65536k available (2846k kernel code, 4924k reserved,
968k data, 180k init, 0k highmem)
Security Scaffold v1.0.0 initialized
Dentry cache hash table entries: 8192 (order: 3, 32768 bytes)
Inode-cache hash table entries: 4096 (order: 2, 16384 bytes)
Mount-cache hash table entries: 512 (order: 0, 4096 bytes)
-> /dev
-> /dev/console
-> /root
CPU: L1 I cache: 16K, L1 D cache: 16K
CPU: L2 cache: 512K
Enabling fast FPU save and restore... done.
Checking 'hlt' instruction... OK.
POSIX conformance testing by UNIFIX
CPU0: Intel Pentium II (Deschutes) stepping 02
per-CPU timeslice cutoff: 1462.92 usecs.
task migration cache decay timeout: 2 msecs.
enabled ExtINT on CPU#0
ESR value before enabling vector: 00000000
ESR value after enabling vector: 00000000
Booting processor 1/1 eip 2000
Initializing CPU#1
masked ExtINT on CPU#1
ESR value before enabling vector: 00000000
ESR value after enabling vector: 00000000
Calibrating delay loop... 684.03 BogoMIPS
CPU: L1 I cache: 16K, L1 D cache: 16K
CPU: L2 cache: 512K
CPU1: Intel Pentium II (Deschutes) stepping 02
Total of 2 processors activated (1357.82 BogoMIPS).
ENABLING IO-APIC IRQs
Setting 2 in the phys_id_present_map
...changing IO-APIC physical APIC ID to 2 ... ok.
..TIMER: vector=0x31 pin1=2 pin2=0
testing the IO APIC.......................

.................................... done.
Using local APIC timer interrupts.
calibrating APIC timer ...
..... CPU clock speed is 342.0548 MHz.
..... host bus clock speed is 68.0509 MHz.
checking TSC synchronization across 2 CPUs: passed.
Starting migration thread for cpu 0
Bringing up 1
CPU 1 IS NOW UP!
Starting migration thread for cpu 1
CPUS done 2
mtrr: v2.0 (20020519)
mtrr: your CPUs had inconsistent fixed MTRR settings
mtrr: probably your BIOS does not setup all CPUs.
mtrr: corrected configuration.
Initializing RT netlink socket
PCI: PCI BIOS revision 2.10 entry at 0xfb3c0, last bus=1
PCI: Using configuration type 1
BIO: pool of 256 setup, 14Kb (56 bytes/bio)
biovec pool[0]: 1 bvecs: 116 entries (12 bytes)
biovec pool[1]: 4 bvecs: 116 entries (48 bytes)
biovec pool[2]: 16 bvecs: 58 entries (192 bytes)
biovec pool[3]: 64 bvecs: 29 entries (768 bytes)
biovec pool[4]: 128 bvecs: 14 entries (1536 bytes)
biovec pool[5]: 256 bvecs: 7 entries (3072 bytes)
ACPI: Subsystem revision 20030522
tbxfroot-0324 [04] acpi_find_root_pointer: RSDP structure not found,
AE_NOT_FOUND Flags=8
ACPI: System description tables not found
tbxface-0084: *** Error: acpi_load_tables: Could not get RSDP,
AE_NOT_FOUND
tbxface-0134: *** Error: acpi_load_tables: Could not load tables:
AE_NOT_FOUND
ACPI: Unable to load the System Description Tables
Linux Plug and Play Support v0.96 (c) Adam Belay
PnPBIOS: Scanning system for PnP BIOS support...
PnPBIOS: Found PnP BIOS installation structure at 0xc00fbff0
PnPBIOS: PnP BIOS version 1.0, entry 0xf0000:0xc018, dseg 0xf0000
PnPBIOS: 14 nodes reported by PnP BIOS; 14 recorded by driver
block request queues:
4/128 requests per read queue
4/128 requests per write queue
enter congestion at 15
exit congestion at 17
Linux Kernel Card Services 3.1.22
options: [pci] [cardbus]
drivers/usb/core/usb.c: registered new driver usbfs
drivers/usb/core/usb.c: registered new driver hub
ACPI: ACPI tables contain no PCI IRQ routing entries
PCI: Invalid ACPI-PCI IRQ routing table
PCI: Probing PCI hardware
PCI: Probing PCI hardware (bus 00)
PCI: Using IRQ router PIIX [8086/7110] at 00:07.0
PCI->APIC IRQ transform: (B0,I7,P3) -> 19
PCI->APIC IRQ transform: (B0,I10,P0) -> 18
PCI->APIC IRQ transform: (B1,I0,P0) -> 16
IA-32 Microcode Update Driver: v1.11 <[email protected]>
Enabling SEP on CPU 1
Enabling SEP on CPU 0
Journalled Block Device driver loaded
Installing knfsd (copyright (C) 1996 [email protected]).
udf: registering filesystem
Capability LSM initialized
Initializing Cryptographic API
Limiting direct PCI/PCI transfers.
isapnp: Scanning for PnP cards...
isapnp: No Plug & Play device found
aty128fb: Rage128 BIOS located at e8000000
aty128fb: Rage128 Pro PF (AGP) [chip rev 0x1] 32M 128-bit SDR SGRAM
(1:1)
fb0: ATY Rage128 frame buffer device on Rage128 Pro PF (AGP)
aty128fb: Rage128 MTRR set to ON
Console: switching to colour frame buffer device 80x30
pty: 256 Unix98 ptys configured
Linux agpgart interface v0.100 (c) Dave Jones
agpgart: Detected an Intel 440BX Chipset.
agpgart: Maximum main memory to use for agp memory: 28M
agpgart: AGP aperture is 64M @ 0xe0000000
Serial: 8250/16550 driver $Revision: 1.90 $ IRQ sharing disabled
ttyS0 at I/O 0x3f8 (irq = 4) is a 16550A
ttyS1 at I/O 0x2f8 (irq = 3) is a 16550A
FDC 0 is a post-1991 82077
loop: loaded (max 8 devices)
3c59x: Donald Becker and others. http://www.scyld.com/network/vortex.html
00:0a.0: 3Com PCI 3c905B Cyclone 100baseTx at 0xe400. Vers LK1.1.19
Uniform Multi-Platform E-IDE driver Revision: 7.00alpha2
ide: Assuming 33MHz system bus speed for PIO modes; override with
idebus=xx
PIIX4: IDE controller at PCI slot 00:07.1
PIIX4: chipset revision 1
PIIX4: not 100% native mode: will probe irqs later
ide0: BM-DMA at 0xf000-0xf007, BIOS settings: hda:DMA, hdb:pio
ide1: BM-DMA at 0xf008-0xf00f, BIOS settings: hdc:pio, hdd:pio
hda: WDC WD273BA, ATA DISK drive
ide0 at 0x1f0-0x1f7,0x3f6 on irq 14
hda: max request size: 128KiB
hda: host protected area => 1
hda: 53464320 sectors (27374 MB) w/1961KiB Cache, CHS=53040/16/63,
UDMA(33)
hda: hda1 hda2 hda3 hda4
Console: switching to colour frame buffer device 80x30
drivers/usb/host/uhci-hcd.c: USB Universal Host Controller Interface
driver v2.1
uhci-hcd 00:07.2: Intel Corp. 82371AB/EB/MB PIIX4
uhci-hcd 00:07.2: irq 19, io base 0000e000
Please use the 'usbfs' filetype instead, the 'usbdevfs' name is
deprecated.
uhci-hcd 00:07.2: new USB bus registered, assigned bus number 1
hub 1-0:0: USB hub found
hub 1-0:0: 2 ports detected
mice: PS/2 mouse device common for all mice
serio: i8042 AUX port at 0x60,0x64 irq 12
input: AT Set 2 keyboard on isa0060/serio0
serio: i8042 KBD port at 0x60,0x64 irq 1
oprofile: using NMI interrupt.
NET4: Linux TCP/IP 1.0 for NET4.0
IP: routing cache hash table of 256 buckets, 4Kbytes
TCP: Hash tables configured (established 2048 bind 2730)
NET4: Unix domain sockets 1.0/SMP for Linux NET4.0.
BIOS EDD facility v0.09 2003-Jan-22, 1 devices found
kjournald starting. Commit interval 5 seconds
EXT3-fs: mounted filesystem with ordered data mode.
VFS: Mounted root (ext3 filesystem) readonly.
Freeing unused kernel memory: 180k freed
Adding 265064k swap on /dev/hda3. Priority:-1 extents:1
EXT3 FS 2.4-0.9.16, 02 Dec 2001 on hda2, internal journal
kjournald starting. Commit interval 5 seconds
EXT3 FS 2.4-0.9.16, 02 Dec 2001 on hda1, internal journal
EXT3-fs: mounted filesystem with ordered data mode.
kjournald starting. Commit interval 5 seconds
EXT3 FS 2.4-0.9.16, 02 Dec 2001 on hda4, internal journal
EXT3-fs: mounted filesystem with ordered data mode.

== Morten

--

"Livet er ikke for nybegynnere" - sitat fra en klok person.

Morten Helgesen
UNIX System Administrator & C Developer
Nextframe AS
[email protected] / 93445641
http://www.nextframe.net

2003-05-29 19:25:40

by William Lee Irwin III

[permalink] [raw]
Subject: Re: list_head debugging patch

On Thursday 29 May 2003 15:08, William Lee Irwin III wrote:
>> This appears to get the kernel to crap its pants in very, very
>> short order. Given the number of things going wrong, I almost
>> wonder if I did something wrong. Things get real ugly, really,
>> really fast.

On Thu, May 29, 2003 at 09:22:43PM +0200, Morten Helgesen wrote:
> [snip]
> I gave this a go - booted without problems. I did some
> untaring/copying/deleting and didn`t see anything unusual, but a
> 'dbench 8' died right away.
[...]
> EIP is at clear_queue_congested+0x78/0xb0

clear_queue_congested() is doing an opportunistic check for list_empty()
without taking a lock. The patch basically changes list_empty() to look
at elements of the list instead of just pieces of the head. As opposed
to auditing for this, could you remove the __list_head_check() from
list_empty() and try again?

Thanks.

-- wli

2003-05-29 19:46:21

by Morten Helgesen

[permalink] [raw]
Subject: Re: list_head debugging patch


one more ...

elem = c3a6464c, elem->prev = c11d59e8, elem->prev->next = c28cc1ec
------------[ cut here ]------------
kernel BUG at include/linux/list.h:39!
invalid operand: 0000 [#1]
CPU: 0
EIP: 0060:[<c016b21c>] Not tainted
EFLAGS: 00010286
EIP is at file_kill+0x2c/0x150
eax: 00000047 ebx: c3a6464c ecx: c39f8d20 edx: c340a000
esi: c01b3ef0 edi: c3ff7d64 ebp: c340bf54 esp: c340bf3c
ds: 007b es: 007b ss: 0068
Process fixdep (pid: 2404, threadinfo=c340a000 task=c1166710)
Stack: c03d5f20 c3a6464c c11d59e8 c28cc1ec c3a6464c c01b3ef0 c340bf78
c016adf2
c3a6464c c3a6464c c099c490 c09995e4 c3a6464c 00000000 00000000
c340bf98
c0169223 c3a6464c c21841f4 c001d364 c21841f4 00000004 c340a000
c340bfbc
Call Trace:
[<c01b3ef0>] ext3_release_file+0x0/0x60
[<c016adf2>] __fput+0xc2/0x140
[<c0169223>] filp_close+0xd3/0x130
[<c0169310>] sys_close+0x90/0x110
[<c0109bdf>] syscall_call+0x7/0xb

Code: 0f 0b 27 00 95 42 3d c0 8b 13 8b 42 04 39 d8 74 22 89 44 24

== Morten

--

"Livet er ikke for nybegynnere" - sitat fra en klok person.

Morten Helgesen
UNIX System Administrator & C Developer
Nextframe AS
[email protected] / 93445641
http://www.nextframe.net

2003-05-29 20:00:38

by William Lee Irwin III

[permalink] [raw]
Subject: Re: list_head debugging patch

On Thu, May 29, 2003 at 09:58:52PM +0200, Morten Helgesen wrote:
> one more ...
> elem = c3a6464c, elem->prev = c11d59e8, elem->prev->next = c28cc1ec
> ------------[ cut here ]------------
> kernel BUG at include/linux/list.h:39!
> invalid operand: 0000 [#1]
> CPU: 0
> EIP: 0060:[<c016b21c>] Not tainted
> EFLAGS: 00010286
> EIP is at file_kill+0x2c/0x150

Same thing; nuke the __list_head_check() check in list_empty() please.


-- wli

2003-05-29 20:50:12

by Morten Helgesen

[permalink] [raw]
Subject: Re: list_head debugging patch

On Thursday 29 May 2003 22:13, William Lee Irwin III wrote:
> On Thu, May 29, 2003 at 09:58:52PM +0200, Morten Helgesen wrote:
> > one more ...
> > elem = c3a6464c, elem->prev = c11d59e8, elem->prev->next =
> > c28cc1ec ------------[ cut here ]------------
> > kernel BUG at include/linux/list.h:39!
> > invalid operand: 0000 [#1]
> > CPU: 0
> > EIP: 0060:[<c016b21c>] Not tainted
> > EFLAGS: 00010286
> > EIP is at file_kill+0x2c/0x150
>
> Same thing; nuke the __list_head_check() check in list_empty()
> please.

Ok, after having nuked __list_head_check() in list_empty() I can't
seem to trigger any more list corruption on this box.

--

"Livet er ikke for nybegynnere" - sitat fra en klok person.

Morten Helgesen
UNIX System Administrator & C Developer
Nextframe AS
[email protected] / 93445641
http://www.nextframe.net

2003-05-29 20:56:04

by William Lee Irwin III

[permalink] [raw]
Subject: Re: list_head debugging patch

On Thursday 29 May 2003 22:13, William Lee Irwin III wrote:
>> Same thing; nuke the __list_head_check() check in list_empty()
>> please.

On Thu, May 29, 2003 at 11:03:19PM +0200, Morten Helgesen wrote:
> Ok, after having nuked __list_head_check() in list_empty() I can`t
> seem to trigger any more list corruption on this box.

Well, that's a hopeful sign; at some point maybe IDE will stop oopsing
on me with it.


-- wli

2003-05-29 23:44:12

by William Lee Irwin III

[permalink] [raw]
Subject: Re: list_head debugging patch

On Thursday 29 May 2003 22:13, William Lee Irwin III wrote:
>>> Same thing; nuke the __list_head_check() check in list_empty()
>>> please.

On Thu, May 29, 2003 at 11:03:19PM +0200, Morten Helgesen wrote:
>> Ok, after having nuked __list_head_check() in list_empty() I can`t
>> seem to trigger any more list corruption on this box.

On Thu, May 29, 2003 at 02:09:08PM -0700, William Lee Irwin III wrote:
> Well, that's a hopeful sign; at some point maybe IDE will stop oopsing
> on me with it.

This time fixed up for list_empty() and list_for_each*_safe()


--- linux-2.5.70/include/linux/list.h 2003-05-26 18:00:41.000000000 -0700
+++ pgcl-2.5.70-2/include/linux/list.h 2003-05-29 14:26:39.000000000 -0700
@@ -30,6 +30,22 @@
(ptr)->next = (ptr); (ptr)->prev = (ptr); \
} while (0)

+static inline void __list_head_check(const struct list_head *elem)
+{
+ if (elem->prev->next != elem) {
+ printk(KERN_CRIT "elem = %p, elem->prev = %p, "
+ "elem->prev->next = %p\n",
+ elem, elem->prev, elem->prev->next);
+ BUG();
+ }
+ if (elem->next->prev != elem) {
+ printk(KERN_CRIT "elem = %p, elem->next = %p, "
+ "elem->next->prev = %p\n",
+ elem, elem->next, elem->next->prev);
+ BUG();
+ }
+}
+
/*
* Insert a new entry between two known consecutive entries.
*
@@ -56,6 +72,7 @@
*/
static inline void list_add(struct list_head *new, struct list_head *head)
{
+ __list_head_check(head);
__list_add(new, head, head->next);
}

@@ -69,6 +86,7 @@
*/
static inline void list_add_tail(struct list_head *new, struct list_head *head)
{
+ __list_head_check(head);
__list_add(new, head->prev, head);
}

@@ -136,7 +154,10 @@
*/
static inline void list_del(struct list_head *entry)
{
+ __list_head_check(entry);
__list_del(entry->prev, entry->next);
+ entry->prev = (void *)0x7c7c7c7c;
+ entry->next = (void *)0x8d8d8d8d;
}
/**
* list_del_rcu - deletes entry from list without re-initialization
@@ -156,6 +177,7 @@
*/
static inline void list_del_init(struct list_head *entry)
{
+ __list_head_check(entry);
__list_del(entry->prev, entry->next);
INIT_LIST_HEAD(entry);
}
@@ -167,6 +189,8 @@
*/
static inline void list_move(struct list_head *list, struct list_head *head)
{
+ __list_head_check(list);
+ __list_head_check(head);
__list_del(list->prev, list->next);
list_add(list, head);
}
@@ -179,6 +203,8 @@
static inline void list_move_tail(struct list_head *list,
struct list_head *head)
{
+ __list_head_check(list);
+ __list_head_check(head);
__list_del(list->prev, list->next);
list_add_tail(list, head);
}
@@ -199,6 +225,9 @@
struct list_head *last = list->prev;
struct list_head *at = head->next;

+ __list_head_check(head);
+ __list_head_check(list);
+
first->prev = head;
head->next = first;

@@ -213,6 +242,8 @@
*/
static inline void list_splice(struct list_head *list, struct list_head *head)
{
+ __list_head_check(list);
+ __list_head_check(head);
if (!list_empty(list))
__list_splice(list, head);
}
@@ -227,6 +258,8 @@
static inline void list_splice_init(struct list_head *list,
struct list_head *head)
{
+ __list_head_check(list);
+ __list_head_check(head);
if (!list_empty(list)) {
__list_splice(list, head);
INIT_LIST_HEAD(list);
@@ -248,8 +281,9 @@
* @head: the head for your list.
*/
#define list_for_each(pos, head) \
- for (pos = (head)->next, prefetch(pos->next); pos != (head); \
- pos = pos->next, prefetch(pos->next))
+ for (pos = (head)->next, prefetch((pos)->next); pos != (head); \
+ __list_head_check(pos), __list_head_check(head), \
+ pos = (pos)->next, prefetch((pos)->next))

/**
* __list_for_each - iterate over a list
@@ -262,7 +296,11 @@
* or 1 entry) most of the time.
*/
#define __list_for_each(pos, head) \
- for (pos = (head)->next; pos != (head); pos = pos->next)
+ for (pos = (head)->next; \
+ pos != (head); \
+ __list_head_check(pos), \
+ __list_head_check(head), \
+ pos = pos->next)

/**
* list_for_each_prev - iterate over a list backwards
@@ -270,8 +308,9 @@
* @head: the head for your list.
*/
#define list_for_each_prev(pos, head) \
- for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \
- pos = pos->prev, prefetch(pos->prev))
+ for (pos = (head)->prev, prefetch((pos)->prev); pos != (head); \
+ __list_head_check(pos), __list_head_check(head), \
+ pos = (pos)->prev, prefetch((pos)->prev))

/**
* list_for_each_safe - iterate over a list safe against removal of list entry
@@ -280,8 +319,9 @@
* @head: the head for your list.
*/
#define list_for_each_safe(pos, n, head) \
- for (pos = (head)->next, n = pos->next; pos != (head); \
- pos = n, n = pos->next)
+ for (pos = (head)->next, n = (pos)->next; pos != (head); \
+ __list_head_check(n), __list_head_check(head), \
+ pos = n, n = (pos)->next)

/**
* list_for_each_entry - iterate over list of given type
@@ -290,11 +330,28 @@
* @member: the name of the list_struct within the struct.
*/
#define list_for_each_entry(pos, head, member) \
- for (pos = list_entry((head)->next, typeof(*pos), member), \
- prefetch(pos->member.next); \
- &pos->member != (head); \
- pos = list_entry(pos->member.next, typeof(*pos), member), \
- prefetch(pos->member.next))
+ for (pos = list_entry((head)->next, typeof(*(pos)), member), \
+ prefetch((pos)->member.next); \
+ &(pos)->member != (head); \
+ pos = list_entry((pos)->member.next, typeof(*(pos)), member),\
+ __list_head_check(head), \
+ __list_head_check(&(pos)->member), \
+ prefetch((pos)->member.next))
+
+/**
+ * list_for_each_entry_prev - iterate over a list backwards
+ * @pos: the type * to use as a loop counter.
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_prev(pos, head, member) \
+ for (pos = list_entry((head)->prev, typeof(*(pos)), member), \
+ prefetch((pos)->member.prev); \
+ &(pos)->member != (head); \
+ pos = list_entry((pos)->member.prev, typeof(*(pos)), member),\
+ __list_head_check(head), \
+ __list_head_check(&(pos)->member), \
+ prefetch((pos)->member.prev))

/**
* list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
@@ -304,10 +361,11 @@
* @member: the name of the list_struct within the struct.
*/
#define list_for_each_entry_safe(pos, n, head, member) \
- for (pos = list_entry((head)->next, typeof(*pos), member), \
- n = list_entry(pos->member.next, typeof(*pos), member); \
- &pos->member != (head); \
- pos = n, n = list_entry(n->member.next, typeof(*n), member))
+ for (pos = list_entry((head)->next, typeof(*(pos)), member), \
+ n = list_entry((pos)->member.next, typeof(*(pos)), member);\
+ &(pos)->member != (head); \
+ __list_head_check(head), __list_head_check(&(n)->member), \
+ pos = n, n = list_entry((n)->member.next, typeof(*(n)), member))

/**
* list_for_each_rcu - iterate over an rcu-protected list
@@ -401,9 +459,15 @@
{
if (n->pprev)
__hlist_del(n);
+ n->next = (void *)0x9e9e9e9e;
+ n->pprev = (void *)0xafafafaf;
}

-#define hlist_del_rcu hlist_del /* list_del_rcu is identical too? */
+static __inline__ void hlist_del_rcu(struct hlist_node *n)
+{
+ if (n->pprev)
+ __hlist_del(n);
+}

static __inline__ void hlist_del_init(struct hlist_node *n)
{

2003-05-30 11:39:25

by Alan

[permalink] [raw]
Subject: Re: list_head debugging patch

On Iau, 2003-05-29 at 22:09, William Lee Irwin III wrote:
> On Thursday 29 May 2003 22:13, William Lee Irwin III wrote:
> >> Same thing; nuke the __list_head_check() check in list_empty()
> >> please.
>
> On Thu, May 29, 2003 at 11:03:19PM +0200, Morten Helgesen wrote:
> > Ok, after having nuked __list_head_check() in list_empty() I can`t
> > seem to trigger any more list corruption on this box.
>
> Well, that's a hopeful sign; at some point maybe IDE will stop oopsing
> on me with it.

The IDE code has real list mangling bugs at probe. They are fixed in -ac
but I'm still waiting for the taskfile stuff to get sorted so I can do
a sane merge of the stuff pending.


Subject: Re: list_head debugging patch


On 30 May 2003, Alan Cox wrote:

> On Iau, 2003-05-29 at 22:09, William Lee Irwin III wrote:
> > On Thursday 29 May 2003 22:13, William Lee Irwin III wrote:
> > >> Same thing; nuke the __list_head_check() check in list_empty()
> > >> please.
> >
> > On Thu, May 29, 2003 at 11:03:19PM +0200, Morten Helgesen wrote:
> > > Ok, after having nuked __list_head_check() in list_empty() I can`t
> > > seem to trigger any more list corruption on this box.
> >
> > Well, that's a hopeful sign; at some point maybe IDE will stop oopsing
> > on me with it.
>
> The IDE code has real list mangling bugs at probe. They are fixed in -ac
> but I'm still waiting for the taskfile stuff to get sorted so I can do
> a sane merge of the stuff pending.

List mangling at probe is fixed in 2.5.69-ac1, but there are more bugs
with different triggerability.
--
Bartlomiej

2003-05-31 02:09:37

by William Lee Irwin III

[permalink] [raw]
Subject: Re: list_head debugging patch

On 30 May 2003, Alan Cox wrote:
>> The IDE code has real list mangling bugs at probe. They are fixed in -ac
>> but I'm still waiting for the taskfile stuff to get sorted so I can do
>> a sane merge of the stuff pending.

On Fri, May 30, 2003 at 02:17:02PM +0200, Bartlomiej Zolnierkiewicz wrote:
> List mangling at probe is fixed in 2.5.69-ac1, but there are more bugs
> with different triggerability.

Bartlomiej fixed up everything I could see during boot, things have
other, probably unrelated issues for me now (and mainline runs fine),
and he's finding something I can't see myself.


-- wli