> diff --git a/block/blk-flush.c b/block/blk-flush.c
> index 54b123d..c0a07aa 100644
> --- a/block/blk-flush.c
> +++ b/block/blk-flush.c
> @@ -59,7 +59,6 @@ static struct request *blk_flush_complete_seq(struct request_queue *q,
>  static void blk_flush_complete_seq_end_io(struct request_queue *q,
>                                            unsigned seq, int error)
>  {
> -       bool was_empty = elv_queue_empty(q);
>         struct request *next_rq;
>
>         next_rq = blk_flush_complete_seq(q, seq, error);
> @@ -68,7 +67,7 @@ static void blk_flush_complete_seq_end_io(struct request_queue *q,
>          * Moving a request silently to empty queue_head may stall the
>          * queue.  Kick the queue in those cases.
>          */
> -       if (was_empty && next_rq)
> +       if (next_rq)
>                 __blk_run_queue(q);
>  }
>
...
> diff --git a/block/elevator.c b/block/elevator.c
> index a9fe237..d5d17a4 100644
> --- a/block/elevator.c
> +++ b/block/elevator.c
> @@ -619,8 +619,6 @@ void elv_quiesce_end(struct request_queue *q)
...
> -int elv_queue_empty(struct request_queue *q)
> -{
> -       struct elevator_queue *e = q->elevator;
> -
> -       if (!list_empty(&q->queue_head))
> -               return 0;
> -
> -       if (e->ops->elevator_queue_empty_fn)
> -               return e->ops->elevator_queue_empty_fn(q);
> -
> -       return 1;
> -}
> -EXPORT_SYMBOL(elv_queue_empty);
> -
Your latest 'for-2.6.39/stack-unplug' rebase (commit 7703acb01e)
misses removing a call to elv_queue_empty() in
block/blk-flush.c:flush_data_end_io()
CC block/blk-flush.o
block/blk-flush.c: In function ‘flush_data_end_io’:
block/blk-flush.c:266: error: implicit declaration of function ‘elv_queue_empty’
On Thu, Mar 03 2011 at 4:23pm -0500,
Mike Snitzer <[email protected]> wrote:
> > diff --git a/block/blk-flush.c b/block/blk-flush.c
> > index 54b123d..c0a07aa 100644
> > --- a/block/blk-flush.c
> > +++ b/block/blk-flush.c
> > @@ -59,7 +59,6 @@ static struct request *blk_flush_complete_seq(struct request_queue *q,
> > static void blk_flush_complete_seq_end_io(struct request_queue *q,
> > unsigned seq, int error)
> > {
> > - bool was_empty = elv_queue_empty(q);
> > struct request *next_rq;
> >
> > next_rq = blk_flush_complete_seq(q, seq, error);
> > @@ -68,7 +67,7 @@ static void blk_flush_complete_seq_end_io(struct request_queue *q,
> > * Moving a request silently to empty queue_head may stall the
> > * queue. Kick the queue in those cases.
> > */
> > - if (was_empty && next_rq)
> > + if (next_rq)
> > __blk_run_queue(q);
> > }
> >
> ...
> > diff --git a/block/elevator.c b/block/elevator.c
> > index a9fe237..d5d17a4 100644
> > --- a/block/elevator.c
> > +++ b/block/elevator.c
> > @@ -619,8 +619,6 @@ void elv_quiesce_end(struct request_queue *q)
> ...
> > -int elv_queue_empty(struct request_queue *q)
> > -{
> > - struct elevator_queue *e = q->elevator;
> > -
> > - if (!list_empty(&q->queue_head))
> > - return 0;
> > -
> > - if (e->ops->elevator_queue_empty_fn)
> > - return e->ops->elevator_queue_empty_fn(q);
> > -
> > - return 1;
> > -}
> > -EXPORT_SYMBOL(elv_queue_empty);
> > -
>
> Your latest 'for-2.6.39/stack-unplug' rebase (commit 7703acb01e)
> misses removing a call to elv_queue_empty() in
> block/blk-flush.c:flush_data_end_io()
>
> CC block/blk-flush.o
> block/blk-flush.c: In function ‘flush_data_end_io’:
> block/blk-flush.c:266: error: implicit declaration of function ‘elv_queue_empty’
This allows me to compile:
diff --git a/block/blk-flush.c b/block/blk-flush.c
index de5ae6e..671fa9d 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -263,10 +263,9 @@ static bool blk_kick_flush(struct request_queue *q)
 static void flush_data_end_io(struct request *rq, int error)
 {
        struct request_queue *q = rq->q;
-       bool was_empty = elv_queue_empty(q);

        /* after populating an empty queue, kick it to avoid stall */
-       if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error) && was_empty)
+       if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error))
                __blk_run_queue(q);
 }
I'm now hitting a lockdep issue, while running a 'for-2.6.39/stack-plug'
kernel, when I try an fsync heavy workload to a request-based mpath
device (the kernel ultimately goes down in flames, I've yet to look at
the crashdump I took)
Initializing cgroup subsys cpuset
Initializing cgroup subsys cpu
Linux version 2.6.38-rc6-snitm+ (root@rhel6) (gcc version 4.4.5 20110116 (Red Hat 4.4.5-5) (GCC) ) #2 SMP Thu Mar 3 16:32:23 EST 2011
Command line: ro root=UUID=e0236db2-5a38-4d48-8bf5-55675671dee6 console=ttyS0 rhgb quiet SYSFONT=latarcyrheb-sun16 LANG=en_US.UTF-8 KEYTABLE=us rd_plytheme=charge crashkernel=auto
BIOS-provided physical RAM map:
BIOS-e820: 0000000000000000 - 000000000009f400 (usable)
BIOS-e820: 000000000009f400 - 00000000000a0000 (reserved)
BIOS-e820: 00000000000f0000 - 0000000000100000 (reserved)
BIOS-e820: 0000000000100000 - 000000007fffd000 (usable)
BIOS-e820: 000000007fffd000 - 0000000080000000 (reserved)
BIOS-e820: 00000000fffbc000 - 0000000100000000 (reserved)
NX (Execute Disable) protection: active
DMI 2.4 present.
DMI: Bochs Bochs, BIOS Bochs 01/01/2007
e820 update range: 0000000000000000 - 0000000000010000 (usable) ==> (reserved)
e820 remove range: 00000000000a0000 - 0000000000100000 (usable)
No AGP bridge found
last_pfn = 0x7fffd max_arch_pfn = 0x400000000
MTRR default type: write-back
MTRR fixed ranges enabled:
00000-9FFFF write-back
A0000-BFFFF uncachable
C0000-FFFFF write-protect
MTRR variable ranges enabled:
0 base 00E0000000 mask FFE0000000 uncachable
1 disabled
2 disabled
3 disabled
4 disabled
5 disabled
6 disabled
7 disabled
PAT not supported by CPU.
found SMP MP-table at [ffff8800000f7fd0] f7fd0
initial memory mapped : 0 - 20000000
init_memory_mapping: 0000000000000000-000000007fffd000
0000000000 - 007fe00000 page 2M
007fe00000 - 007fffd000 page 4k
kernel direct mapping tables up to 7fffd000 @ 1fffc000-20000000
RAMDISK: 37b50000 - 37ff0000
crashkernel: memory value expected
ACPI: RSDP 00000000000f7f80 00014 (v00 BOCHS )
ACPI: RSDT 000000007fffde10 00034 (v01 BOCHS BXPCRSDT 00000001 BXPC 00000001)
ACPI: FACP 000000007ffffe40 00074 (v01 BOCHS BXPCFACP 00000001 BXPC 00000001)
ACPI: DSDT 000000007fffdfd0 01E22 (v01 BXPC BXDSDT 00000001 INTL 20090123)
ACPI: FACS 000000007ffffe00 00040
ACPI: SSDT 000000007fffdf80 00044 (v01 BOCHS BXPCSSDT 00000001 BXPC 00000001)
ACPI: APIC 000000007fffde90 0007A (v01 BOCHS BXPCAPIC 00000001 BXPC 00000001)
ACPI: HPET 000000007fffde50 00038 (v01 BOCHS BXPCHPET 00000001 BXPC 00000001)
ACPI: Local APIC address 0xfee00000
No NUMA configuration found
Faking a node at 0000000000000000-000000007fffd000
Initmem setup node 0 0000000000000000-000000007fffd000
NODE_DATA [000000007ffe9000 - 000000007fffcfff]
kvm-clock: Using msrs 12 and 11
kvm-clock: cpu 0, msr 0:1875141, boot clock
[ffffea0000000000-ffffea0001bfffff] PMD -> [ffff88007d600000-ffff88007f1fffff] on node 0
Zone PFN ranges:
DMA 0x00000010 -> 0x00001000
DMA32 0x00001000 -> 0x00100000
Normal empty
Movable zone start PFN for each node
early_node_map[2] active PFN ranges
0: 0x00000010 -> 0x0000009f
0: 0x00000100 -> 0x0007fffd
On node 0 totalpages: 524172
DMA zone: 56 pages used for memmap
DMA zone: 2 pages reserved
DMA zone: 3925 pages, LIFO batch:0
DMA32 zone: 7112 pages used for memmap
DMA32 zone: 513077 pages, LIFO batch:31
ACPI: PM-Timer IO Port: 0xb008
ACPI: Local APIC address 0xfee00000
ACPI: LAPIC (acpi_id[0x00] lapic_id[0x00] enabled)
ACPI: LAPIC (acpi_id[0x01] lapic_id[0x01] enabled)
ACPI: IOAPIC (id[0x02] address[0xfec00000] gsi_base[0])
IOAPIC[0]: apic_id 2, version 17, address 0xfec00000, GSI 0-23
ACPI: INT_SRC_OVR (bus 0 bus_irq 0 global_irq 2 dfl dfl)
ACPI: INT_SRC_OVR (bus 0 bus_irq 5 global_irq 5 high level)
ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 high level)
ACPI: INT_SRC_OVR (bus 0 bus_irq 10 global_irq 10 high level)
ACPI: INT_SRC_OVR (bus 0 bus_irq 11 global_irq 11 high level)
ACPI: IRQ0 used by override.
ACPI: IRQ2 used by override.
ACPI: IRQ5 used by override.
ACPI: IRQ9 used by override.
ACPI: IRQ10 used by override.
ACPI: IRQ11 used by override.
Using ACPI (MADT) for SMP configuration information
ACPI: HPET id: 0x8086a201 base: 0xfed00000
SMP: Allowing 2 CPUs, 0 hotplug CPUs
nr_irqs_gsi: 40
Allocating PCI resources starting at 80000000 (gap: 80000000:7ffbc000)
Booting paravirtualized kernel on KVM
setup_percpu: NR_CPUS:4 nr_cpumask_bits:4 nr_cpu_ids:2 nr_node_ids:1
PERCPU: Embedded 474 pages/cpu @ffff88007f200000 s1912768 r8192 d20544 u2097152
pcpu-alloc: s1912768 r8192 d20544 u2097152 alloc=1*2097152
pcpu-alloc: [0] 0 [0] 1
kvm-clock: cpu 0, msr 0:7f3d2141, primary cpu clock
Built 1 zonelists in Node order, mobility grouping on. Total pages: 517002
Policy zone: DMA32
Kernel command line: ro root=UUID=e0236db2-5a38-4d48-8bf5-55675671dee6 console=ttyS0 rhgb quiet SYSFONT=latarcyrheb-sun16 LANG=en_US.UTF-8 KEYTABLE=us rd_plytheme=charge crashkernel=auto
PID hash table entries: 4096 (order: 3, 32768 bytes)
Checking aperture...
No AGP bridge found
Memory: 2037496k/2097140k available (3571k kernel code, 452k absent, 59192k reserved, 3219k data, 3504k init)
Hierarchical RCU implementation.
RCU-based detection of stalled CPUs is disabled.
NR_IRQS:4352 nr_irqs:512 16
Console: colour VGA+ 80x25
console [ttyS0] enabled
Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar
... MAX_LOCKDEP_SUBCLASSES: 8
... MAX_LOCK_DEPTH: 48
... MAX_LOCKDEP_KEYS: 8191
... CLASSHASH_SIZE: 4096
... MAX_LOCKDEP_ENTRIES: 16384
... MAX_LOCKDEP_CHAINS: 32768
... CHAINHASH_SIZE: 16384
memory used by lock dependency info: 6367 kB
per task-struct memory footprint: 2688 bytes
ODEBUG: 11 of 11 active objects replaced
ODEBUG: selftest passed
hpet clockevent registered
Detected 1995.090 MHz processor.
Calibrating delay loop (skipped) preset value.. 3990.18 BogoMIPS (lpj=1995090)
pid_max: default: 32768 minimum: 301
Security Framework initialized
SELinux: Initializing.
SELinux: Starting in permissive mode
Dentry cache hash table entries: 262144 (order: 9, 2097152 bytes)
Inode-cache hash table entries: 131072 (order: 8, 1048576 bytes)
Mount-cache hash table entries: 256
Initializing cgroup subsys ns
ns_cgroup deprecated: consider using the 'clone_children' flag without the ns_cgroup.
Initializing cgroup subsys cpuacct
Initializing cgroup subsys devices
Initializing cgroup subsys freezer
Initializing cgroup subsys net_cls
mce: CPU supports 10 MCE banks
ACPI: Core revision 20110112
ftrace: allocating 16994 entries in 67 pages
Setting APIC routing to flat
..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1
CPU0: Intel QEMU Virtual CPU version 0.12.5 stepping 03
Performance Events: unsupported p6 CPU model 2 no PMU driver, software events only.
lockdep: fixing up alternatives.
Booting Node 0, Processors #1 Ok.
kvm-clock: cpu 1, msr 0:7f5d2141, secondary cpu clock
Brought up 2 CPUs
Total of 2 processors activated (7980.36 BogoMIPS).
NET: Registered protocol family 16
ACPI: bus type pci registered
PCI: Using configuration type 1 for base access
mtrr: your CPUs had inconsistent variable MTRR settings
mtrr: your CPUs had inconsistent MTRRdefType settings
mtrr: probably your BIOS does not setup all CPUs.
mtrr: corrected configuration.
bio: create slab <bio-0> at 0
ACPI: EC: Look up EC in DSDT
ACPI: Interpreter enabled
ACPI: (supports S0 S5)
ACPI: Using IOAPIC for interrupt routing
ACPI: No dock devices found.
PCI: Ignoring host bridge windows from ACPI; if necessary, use "pci=use_crs" and report a bug
ACPI: PCI Root Bridge [PCI0] (domain 0000 [bus 00-ff])
pci_root PNP0A03:00: host bridge window [io 0x0000-0x0cf7] (ignored)
pci_root PNP0A03:00: host bridge window [io 0x0d00-0xffff] (ignored)
pci_root PNP0A03:00: host bridge window [mem 0x000a0000-0x000bffff] (ignored)
pci_root PNP0A03:00: host bridge window [mem 0xe0000000-0xfebfffff] (ignored)
pci 0000:00:00.0: [8086:1237] type 0 class 0x000600
pci 0000:00:01.0: [8086:7000] type 0 class 0x000601
pci 0000:00:01.1: [8086:7010] type 0 class 0x000101
pci 0000:00:01.1: reg 20: [io 0xc000-0xc00f]
pci 0000:00:01.2: [8086:7020] type 0 class 0x000c03
pci 0000:00:01.2: reg 20: [io 0xc020-0xc03f]
pci 0000:00:01.3: [8086:7113] type 0 class 0x000680
pci 0000:00:01.3: quirk: [io 0xb000-0xb03f] claimed by PIIX4 ACPI
pci 0000:00:01.3: quirk: [io 0xb100-0xb10f] claimed by PIIX4 SMB
pci 0000:00:02.0: [1013:00b8] type 0 class 0x000300
pci 0000:00:02.0: reg 10: [mem 0xf0000000-0xf1ffffff pref]
pci 0000:00:02.0: reg 14: [mem 0xf2000000-0xf2000fff]
pci 0000:00:03.0: [1af4:1002] type 0 class 0x000500
pci 0000:00:03.0: reg 10: [io 0xc040-0xc05f]
pci 0000:00:04.0: [1af4:1001] type 0 class 0x000100
pci 0000:00:04.0: reg 10: [io 0xc080-0xc0bf]
pci 0000:00:05.0: [1af4:1001] type 0 class 0x000100
pci 0000:00:05.0: reg 10: [io 0xc0c0-0xc0ff]
pci 0000:00:06.0: [1af4:1000] type 0 class 0x000200
pci 0000:00:06.0: reg 10: [io 0xc100-0xc11f]
pci 0000:00:06.0: reg 14: [mem 0xf2001000-0xf2001fff]
ACPI: PCI Interrupt Routing Table [\_SB_.PCI0._PRT]
ACPI: PCI Interrupt Link [LNKA] (IRQs 5 *10 11)
ACPI: PCI Interrupt Link [LNKB] (IRQs 5 *10 11)
ACPI: PCI Interrupt Link [LNKC] (IRQs 5 10 *11)
ACPI: PCI Interrupt Link [LNKD] (IRQs 5 10 *11)
vgaarb: device added: PCI:0000:00:02.0,decodes=io+mem,owns=io+mem,locks=none
vgaarb: loaded
SCSI subsystem initialized
libata version 3.00 loaded.
PCI: Using ACPI for IRQ routing
PCI: pci_cache_line_size set to 64 bytes
reserve RAM buffer: 000000000009f400 - 000000000009ffff
reserve RAM buffer: 000000007fffd000 - 000000007fffffff
NetLabel: Initializing
NetLabel: domain hash size = 128
NetLabel: protocols = UNLABELED CIPSOv4
NetLabel: unlabeled traffic allowed by default
HPET: 3 timers in total, 0 timers will be used for per-cpu timer
hpet0: at MMIO 0xfed00000, IRQs 2, 8, 0
hpet0: 3 comparators, 64-bit 100.000000 MHz counter
Switching to clocksource kvm-clock
Switched to NOHz mode on CPU #0
Switched to NOHz mode on CPU #1
pnp: PnP ACPI init
ACPI: bus type pnp registered
pnp 00:00: [bus 00-ff]
pnp 00:00: [io 0x0cf8-0x0cff]
pnp 00:00: [io 0x0000-0x0cf7 window]
pnp 00:00: [io 0x0d00-0xffff window]
pnp 00:00: [mem 0x000a0000-0x000bffff window]
pnp 00:00: [mem 0xe0000000-0xfebfffff window]
pnp 00:00: Plug and Play ACPI device, IDs PNP0a03 (active)
pnp 00:01: [io 0x0070-0x0071]
pnp 00:01: [irq 8]
pnp 00:01: [io 0x0072-0x0077]
pnp 00:01: Plug and Play ACPI device, IDs PNP0b00 (active)
pnp 00:02: [io 0x0060]
pnp 00:02: [io 0x0064]
pnp 00:02: [irq 1]
pnp 00:02: Plug and Play ACPI device, IDs PNP0303 (active)
pnp 00:03: [irq 12]
pnp 00:03: Plug and Play ACPI device, IDs PNP0f13 (active)
pnp 00:04: [io 0x03f2-0x03f5]
pnp 00:04: [io 0x03f7]
pnp 00:04: [irq 6]
pnp 00:04: [dma 2]
pnp 00:04: Plug and Play ACPI device, IDs PNP0700 (active)
pnp 00:05: [mem 0xfed00000-0xfed003ff]
pnp 00:05: Plug and Play ACPI device, IDs PNP0103 (active)
pnp: PnP ACPI: found 6 devices
ACPI: ACPI bus type pnp unregistered
pci_bus 0000:00: resource 0 [io 0x0000-0xffff]
pci_bus 0000:00: resource 1 [mem 0x00000000-0xffffffffff]
NET: Registered protocol family 2
IP route cache hash table entries: 65536 (order: 7, 524288 bytes)
TCP established hash table entries: 262144 (order: 10, 4194304 bytes)
TCP bind hash table entries: 65536 (order: 10, 5242880 bytes)
TCP: Hash tables configured (established 262144 bind 65536)
TCP reno registered
UDP hash table entries: 1024 (order: 5, 196608 bytes)
UDP-Lite hash table entries: 1024 (order: 5, 196608 bytes)
NET: Registered protocol family 1
pci 0000:00:00.0: Limiting direct PCI/PCI transfers
pci 0000:00:01.0: PIIX3: Enabling Passive Release
pci 0000:00:01.0: Activating ISA DMA hang workarounds
pci 0000:00:02.0: Boot video device
PCI: CLS 0 bytes, default 64
Trying to unpack rootfs image as initramfs...
Freeing initrd memory: 4736k freed
DMA-API: preallocated 32768 debug entries
DMA-API: debugging enabled by kernel config
audit: initializing netlink socket (disabled)
type=2000 audit(1299188678.444:1): initialized
HugeTLB registered 2 MB page size, pre-allocated 0 pages
VFS: Disk quotas dquot_6.5.2
Dquot-cache hash table entries: 512 (order 0, 4096 bytes)
msgmni has been set to 3988
SELinux: Registering netfilter hooks
cryptomgr_test used greatest stack depth: 6496 bytes left
Block layer SCSI generic (bsg) driver version 0.4 loaded (major 253)
io scheduler noop registered
io scheduler deadline registered (default)
io scheduler cfq registered
pci_hotplug: PCI Hot Plug PCI Core version: 0.5
pciehp: PCI Express Hot Plug Controller Driver version: 0.4
acpiphp: ACPI Hot Plug PCI Controller Driver version: 0.5
acpiphp: Slot [1] registered
acpiphp: Slot [2] registered
acpiphp: Slot [3] registered
acpiphp: Slot [4] registered
acpiphp: Slot [5] registered
acpiphp: Slot [6] registered
acpiphp: Slot [7] registered
acpiphp: Slot [8] registered
acpiphp: Slot [9] registered
acpiphp: Slot [10] registered
acpiphp: Slot [11] registered
acpiphp: Slot [12] registered
acpiphp: Slot [13] registered
acpiphp: Slot [14] registered
acpiphp: Slot [15] registered
acpiphp: Slot [16] registered
acpiphp: Slot [17] registered
acpiphp: Slot [18] registered
acpiphp: Slot [19] registered
acpiphp: Slot [20] registered
acpiphp: Slot [21] registered
acpiphp: Slot [22] registered
acpiphp: Slot [23] registered
acpiphp: Slot [24] registered
acpiphp: Slot [25] registered
acpiphp: Slot [26] registered
acpiphp: Slot [27] registered
acpiphp: Slot [28] registered
acpiphp: Slot [29] registered
acpiphp: Slot [30] registered
acpiphp: Slot [31] registered
input: Power Button as /devices/LNXSYSTM:00/LNXPWRBN:00/input/input0
ACPI: Power Button [PWRF]
ACPI: acpi_idle registered with cpuidle
Serial: 8250/16550 driver, 4 ports, IRQ sharing enabled
serial8250: ttyS0 at I/O 0x3f8 (irq = 4) is a 16550A
Non-volatile memory driver v1.3
Linux agpgart interface v0.103
brd: module loaded
loop: module loaded
ata_piix 0000:00:01.1: version 2.13
ata_piix 0000:00:01.1: setting latency timer to 64
scsi0 : ata_piix
scsi1 : ata_piix
ata1: PATA max MWDMA2 cmd 0x1f0 ctl 0x3f6 bmdma 0xc000 irq 14
ata2: PATA max MWDMA2 cmd 0x170 ctl 0x376 bmdma 0xc008 irq 15
i8042: PNP: PS/2 Controller [PNP0303:KBD,PNP0f13:MOU] at 0x60,0x64 irq 1,12
serio: i8042 KBD port at 0x60,0x64 irq 1
serio: i8042 AUX port at 0x60,0x64 irq 12
mousedev: PS/2 mouse device common for all mice
input: AT Translated Set 2 keyboard as /devices/platform/i8042/serio0/input/input1
rtc_cmos 00:01: rtc core: registered rtc_cmos as rtc0
rtc0: alarms up to one day, 114 bytes nvram, hpet irqs
cpuidle: using governor ladder
cpuidle: using governor menu
nf_conntrack version 0.5.0 (16384 buckets, 65536 max)
ip_tables: (C) 2000-2006 Netfilter Core Team
TCP cubic registered
NET: Registered protocol family 17
registered taskstats version 1
IMA: No TPM chip found, activating TPM-bypass!
rtc_cmos 00:01: setting system clock to 2011-03-03 21:44:38 UTC (1299188678)
Freeing unused kernel memory: 3504k freed
Write protecting the kernel read-only data: 6144k
Freeing unused kernel memory: 508k freed
Freeing unused kernel memory: 164k freed
mknod used greatest stack depth: 5296 bytes left
modprobe used greatest stack depth: 5080 bytes left
mknod used greatest stack depth: 4792 bytes left
input: ImExPS/2 Generic Explorer Mouse as /devices/platform/i8042/serio1/input/input2
dracut: dracut-004-35.el6
udev: starting version 147
udevd (70): /proc/70/oom_adj is deprecated, please use /proc/70/oom_score_adj instead.
dracut: Starting plymouth daemon
Refined TSC clocksource calibration: 1994.951 MHz.
ACPI: PCI Interrupt Link [LNKC] enabled at IRQ 11
virtio-pci 0000:00:03.0: PCI INT A -> Link[LNKC] -> GSI 11 (level, high) -> IRQ 11
virtio-pci 0000:00:03.0: setting latency timer to 64
ACPI: PCI Interrupt Link [LNKD] enabled at IRQ 10
virtio-pci 0000:00:04.0: PCI INT A -> Link[LNKD] -> GSI 10 (level, high) -> IRQ 10
virtio-pci 0000:00:04.0: setting latency timer to 64
ACPI: PCI Interrupt Link [LNKA] enabled at IRQ 10
virtio-pci 0000:00:05.0: PCI INT A -> Link[LNKA] -> GSI 10 (level, high) -> IRQ 10
virtio-pci 0000:00:05.0: setting latency timer to 64
ACPI: PCI Interrupt Link [LNKB] enabled at IRQ 11
virtio-pci 0000:00:06.0: PCI INT A -> Link[LNKB] -> GSI 11 (level, high) -> IRQ 11
virtio-pci 0000:00:06.0: setting latency timer to 64
modprobe used greatest stack depth: 4768 bytes left
vda: vda1 vda2 vda3
vdb: unknown partition table
modprobe used greatest stack depth: 4672 bytes left
EXT3-fs: barriers not enabled
kjournald starting. Commit interval 5 seconds
EXT3-fs (vda3): mounted filesystem with ordered data mode
dracut: Remounting /dev/disk/by-uuid/e0236db2-5a38-4d48-8bf5-55675671dee6 with -o barrier=1,ro
kjournald starting. Commit interval 5 seconds
EXT3-fs (vda3): mounted filesystem with ordered data mode
dracut: Mounted root filesystem /dev/vda3
dracut: Loading SELinux policy
SELinux: Disabled at runtime.
SELinux: Unregistering netfilter hooks
type=1404 audit(1299188681.051:2): selinux=0 auid=4294967295 ses=4294967295
load_policy used greatest stack depth: 3664 bytes left
dracut: /sbin/load_policy: Can't load policy: No such file or directory
dracut: Switching root
readahead: starting
udev: starting version 147
ip used greatest stack depth: 3592 bytes left
piix4_smbus 0000:00:01.3: SMBus Host Controller at 0xb100, revision 0
virtio-pci 0000:00:06.0: irq 40 for MSI/MSI-X
virtio-pci 0000:00:06.0: irq 41 for MSI/MSI-X
virtio-pci 0000:00:06.0: irq 42 for MSI/MSI-X
device-mapper: uevent: version 1.0.3
device-mapper: ioctl: 4.19.1-ioctl (2011-01-07) initialised: [email protected]
device-mapper: multipath: version 1.2.0 loaded
EXT3-fs (vda3): using internal journal
kjournald starting. Commit interval 5 seconds
EXT3-fs (vda1): using internal journal
EXT3-fs (vda1): mounted filesystem with ordered data mode
Adding 524284k swap on /dev/vda2. Priority:-1 extents:1 across:524284k
Loading iSCSI transport class v2.0-870.
iscsi: registered transport (tcp)
RPC: Registered udp transport module.
RPC: Registered tcp transport module.
RPC: Registered tcp NFSv4.1 backchannel transport module.
scsi2 : iSCSI Initiator over TCP/IP
scsi3 : iSCSI Initiator over TCP/IP
scsi4 : iSCSI Initiator over TCP/IP
scsi5 : iSCSI Initiator over TCP/IP
scsi 2:0:0:0: Direct-Access NETAPP LUN 8010 PQ: 0 ANSI: 5
sd 2:0:0:0: Attached scsi generic sg0 type 0
scsi 4:0:0:0: Direct-Access NETAPP LUN 8010 PQ: 0 ANSI: 5
scsi 3:0:0:0: Direct-Access NETAPP LUN 8010 PQ: 0 ANSI: 5
scsi 5:0:0:0: Direct-Access NETAPP LUN 8010 PQ: 0 ANSI: 5
sd 2:0:0:0: [sda] 20971520 512-byte logical blocks: (10.7 GB/10.0 GiB)
sd 2:0:0:0: [sda] Write Protect is off
sd 2:0:0:0: [sda] Mode Sense: bd 00 00 08
sd 5:0:0:0: Attached scsi generic sg1 type 0
sd 3:0:0:0: Attached scsi generic sg2 type 0
sd 4:0:0:0: Attached scsi generic sg3 type 0
sd 5:0:0:0: [sdb] 20971520 512-byte logical blocks: (10.7 GB/10.0 GiB)
sd 2:0:0:0: [sda] Write cache: disabled, read cache: enabled, doesn't support DPO or FUA
sd 3:0:0:0: [sdc] 20971520 512-byte logical blocks: (10.7 GB/10.0 GiB)
sd 4:0:0:0: [sdd] 20971520 512-byte logical blocks: (10.7 GB/10.0 GiB)
sd 5:0:0:0: [sdb] Write Protect is off
sd 5:0:0:0: [sdb] Mode Sense: bd 00 00 08
sd 5:0:0:0: [sdb] Write cache: disabled, read cache: enabled, doesn't support DPO or FUA
sd 3:0:0:0: [sdc] Write Protect is off
sd 3:0:0:0: [sdc] Mode Sense: bd 00 00 08
sd 4:0:0:0: [sdd] Write Protect is off
sd 4:0:0:0: [sdd] Mode Sense: bd 00 00 08
sd 3:0:0:0: [sdc] Write cache: disabled, read cache: enabled, doesn't support DPO or FUA
sd 4:0:0:0: [sdd] Write cache: disabled, read cache: enabled, doesn't support DPO or FUA
sda: sda1 sda2
sdb: sdb1 sdb2
sd 2:0:0:0: [sda] Attached SCSI disk
sdc: sdc1 sdc2
sdd: sdd1 sdd2
sd 5:0:0:0: [sdb] Attached SCSI disk
sd 3:0:0:0: [sdc] Attached SCSI disk
sd 4:0:0:0: [sdd] Attached SCSI disk
sd 2:0:0:0: alua: supports implicit TPGS
sd 2:0:0:0: alua: port group 1100 rel port 83ea
sd 2:0:0:0: alua: port group 1100 state A supports TolUsNA
sd 5:0:0:0: alua: supports implicit TPGS
sd 5:0:0:0: alua: port group 1100 rel port 83e9
sd 5:0:0:0: alua: port group 1100 state A supports TolUsNA
sd 3:0:0:0: alua: supports implicit TPGS
sd 3:0:0:0: alua: port group 1100 rel port 83e8
sd 3:0:0:0: alua: port group 1100 state A supports TolUsNA
sd 4:0:0:0: alua: supports implicit TPGS
sd 4:0:0:0: alua: port group 1100 rel port 83eb
sd 4:0:0:0: alua: port group 1100 state A supports TolUsNA
alua: device handler registered
device-mapper: multipath round-robin: version 1.0.0 loaded
sd 5:0:0:0: alua: port group 1100 state A supports TolUsNA
sd 5:0:0:0: alua: port group 1100 state A supports TolUsNA
sd 5:0:0:0: alua: port group 1100 state A supports TolUsNA
sd 5:0:0:0: alua: port group 1100 state A supports TolUsNA
sdb:
EXT4-fs (dm-0): mounted filesystem with ordered data mode. Opts: (null)
scp used greatest stack depth: 3360 bytes left
vi used greatest stack depth: 3184 bytes left
=======================================================
[ INFO: possible circular locking dependency detected ]
2.6.38-rc6-snitm+ #2
-------------------------------------------------------
ffsb/3110 is trying to acquire lock:
(&(&q->__queue_lock)->rlock){..-...}, at: [<ffffffff811b4c4d>] flush_plug_list+0xbc/0x135
but task is already holding lock:
(&rq->lock){-.-.-.}, at: [<ffffffff8137132f>] schedule+0x16a/0x725
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #2 (&rq->lock){-.-.-.}:
[<ffffffff810731eb>] lock_acquire+0xe3/0x110
[<ffffffff81373773>] _raw_spin_lock+0x36/0x69
[<ffffffff810348f0>] task_rq_lock+0x51/0x83
[<ffffffff810402f2>] try_to_wake_up+0x34/0x220
[<ffffffff810404f0>] default_wake_function+0x12/0x14
[<ffffffff81030136>] __wake_up_common+0x4e/0x84
[<ffffffff810345a1>] complete+0x3f/0x52
[<ffffffff811b9e58>] blk_end_sync_rq+0x34/0x38
[<ffffffff811b6279>] blk_finish_request+0x1f5/0x224
[<ffffffff811b62e8>] __blk_end_request_all+0x40/0x49
[<ffffffffa00165c3>] blk_done+0x92/0xe7 [virtio_blk]
[<ffffffffa0007382>] vring_interrupt+0x68/0x71 [virtio_ring]
[<ffffffffa000e416>] vp_vring_interrupt+0x5b/0x97 [virtio_pci]
[<ffffffffa000e497>] vp_interrupt+0x45/0x4a [virtio_pci]
[<ffffffff81097a80>] handle_IRQ_event+0x57/0x127
[<ffffffff81099bfe>] handle_fasteoi_irq+0x96/0xd9
[<ffffffff8100511b>] handle_irq+0x88/0x91
[<ffffffff8137ab8d>] do_IRQ+0x4d/0xb4
[<ffffffff81374453>] ret_from_intr+0x0/0x1a
[<ffffffff811d4cfe>] __debug_object_init+0x33a/0x377
[<ffffffff811d4d52>] debug_object_init_on_stack+0x17/0x19
[<ffffffff8105195c>] init_timer_on_stack_key+0x26/0x3e
[<ffffffff81371d33>] schedule_timeout+0xa7/0xfe
[<ffffffff81371b14>] wait_for_common+0xd7/0x135
[<ffffffff81371c0b>] wait_for_completion_timeout+0x13/0x15
[<ffffffff811b9fdc>] blk_execute_rq+0xe9/0x12d
[<ffffffffa001609b>] virtblk_serial_show+0x9b/0xdb [virtio_blk]
[<ffffffff81266104>] dev_attr_show+0x27/0x4e
[<ffffffff81159471>] sysfs_read_file+0xbd/0x16b
[<ffffffff811001ac>] vfs_read+0xae/0x10a
[<ffffffff811002d1>] sys_read+0x4d/0x77
[<ffffffff81002b82>] system_call_fastpath+0x16/0x1b
-> #1 (key#28){-.-...}:
[<ffffffff810731eb>] lock_acquire+0xe3/0x110
[<ffffffff813738f7>] _raw_spin_lock_irqsave+0x4e/0x88
[<ffffffff81034583>] complete+0x21/0x52
[<ffffffff811b9e58>] blk_end_sync_rq+0x34/0x38
[<ffffffff811b6279>] blk_finish_request+0x1f5/0x224
[<ffffffff811b6588>] blk_end_bidi_request+0x42/0x5d
[<ffffffff811b65df>] blk_end_request+0x10/0x12
[<ffffffff8127c17b>] scsi_io_completion+0x1b0/0x424
[<ffffffff81275512>] scsi_finish_command+0xe9/0xf2
[<ffffffff8127c503>] scsi_softirq_done+0xff/0x108
[<ffffffff811bab18>] blk_done_softirq+0x84/0x98
[<ffffffff8104a117>] __do_softirq+0xe2/0x1d3
[<ffffffff81003b1c>] call_softirq+0x1c/0x28
[<ffffffff8100503b>] do_softirq+0x4b/0xa3
[<ffffffff81049e71>] irq_exit+0x4a/0x8c
[<ffffffff8137abdd>] do_IRQ+0x9d/0xb4
[<ffffffff81374453>] ret_from_intr+0x0/0x1a
[<ffffffff8137377b>] _raw_spin_lock+0x3e/0x69
[<ffffffff810e9bdc>] __page_lock_anon_vma+0x65/0x9d
[<ffffffff810e9c35>] try_to_unmap_anon+0x21/0xdb
[<ffffffff810e9d1a>] try_to_munlock+0x2b/0x39
[<ffffffff810e3ca6>] munlock_vma_page+0x45/0x7f
[<ffffffff810e1e63>] do_wp_page+0x536/0x580
[<ffffffff810e28b9>] handle_pte_fault+0x6af/0x6e8
[<ffffffff810e29cc>] handle_mm_fault+0xda/0xed
[<ffffffff81377768>] do_page_fault+0x3b4/0x3d6
[<ffffffff81374725>] page_fault+0x25/0x30
-> #0 (&(&q->__queue_lock)->rlock){..-...}:
[<ffffffff81072e14>] __lock_acquire+0xa32/0xd26
[<ffffffff810731eb>] lock_acquire+0xe3/0x110
[<ffffffff81373773>] _raw_spin_lock+0x36/0x69
[<ffffffff811b4c4d>] flush_plug_list+0xbc/0x135
[<ffffffff811b4ce0>] __blk_flush_plug+0x1a/0x3a
[<ffffffff81371471>] schedule+0x2ac/0x725
[<ffffffffa00fef16>] start_this_handle+0x3be/0x4b1 [jbd2]
[<ffffffffa00ff1fc>] jbd2__journal_start+0xc2/0xf6 [jbd2]
[<ffffffffa00ff243>] jbd2_journal_start+0x13/0x15 [jbd2]
[<ffffffffa013823c>] ext4_journal_start_sb+0xe1/0x116 [ext4]
[<ffffffffa012748d>] ext4_da_writepages+0x27c/0x517 [ext4]
[<ffffffff810cd298>] do_writepages+0x24/0x30
[<ffffffff8111e625>] writeback_single_inode+0xaf/0x1d0
[<ffffffff8111eb88>] writeback_sb_inodes+0xab/0x134
[<ffffffff8111f542>] writeback_inodes_wb+0x12b/0x13d
[<ffffffff810cc920>] balance_dirty_pages_ratelimited_nr+0x2be/0x3d8
[<ffffffff810c456c>] generic_file_buffered_write+0x1ff/0x267
[<ffffffff810c593f>] __generic_file_aio_write+0x245/0x27a
[<ffffffff810c59d9>] generic_file_aio_write+0x65/0xbc
[<ffffffffa011dd57>] ext4_file_write+0x1f5/0x256 [ext4]
[<ffffffff810ff5b1>] do_sync_write+0xcb/0x108
[<ffffffff810fffaf>] vfs_write+0xb1/0x10d
[<ffffffff811000d4>] sys_write+0x4d/0x77
[<ffffffff81002b82>] system_call_fastpath+0x16/0x1b
other info that might help us debug this:
3 locks held by ffsb/3110:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff810c59bd>] generic_file_aio_write+0x49/0xbc
#1: (&type->s_umount_key#36){.+.+..}, at: [<ffffffff8111f4e5>] writeback_inodes_wb+0xce/0x13d
#2: (&rq->lock){-.-.-.}, at: [<ffffffff8137132f>] schedule+0x16a/0x725
stack backtrace:
Pid: 3110, comm: ffsb Not tainted 2.6.38-rc6-snitm+ #2
Call Trace:
[<ffffffff810714fa>] ? print_circular_bug+0xae/0xbc
[<ffffffff81072e14>] ? __lock_acquire+0xa32/0xd26
[<ffffffff810731eb>] ? lock_acquire+0xe3/0x110
[<ffffffff811b4c4d>] ? flush_plug_list+0xbc/0x135
[<ffffffff81373773>] ? _raw_spin_lock+0x36/0x69
[<ffffffff811b4c4d>] ? flush_plug_list+0xbc/0x135
[<ffffffff811b4c4d>] ? flush_plug_list+0xbc/0x135
[<ffffffff811b4ce0>] ? __blk_flush_plug+0x1a/0x3a
[<ffffffff81371471>] ? schedule+0x2ac/0x725
[<ffffffff810700f3>] ? trace_hardirqs_off+0xd/0xf
[<ffffffffa00fef16>] ? start_this_handle+0x3be/0x4b1 [jbd2]
[<ffffffff8106001a>] ? autoremove_wake_function+0x0/0x3d
[<ffffffffa00ff1fc>] ? jbd2__journal_start+0xc2/0xf6 [jbd2]
[<ffffffffa00ff243>] ? jbd2_journal_start+0x13/0x15 [jbd2]
[<ffffffffa013823c>] ? ext4_journal_start_sb+0xe1/0x116 [ext4]
[<ffffffffa0120d2f>] ? ext4_meta_trans_blocks+0x67/0xb8 [ext4]
[<ffffffffa012748d>] ? ext4_da_writepages+0x27c/0x517 [ext4]
[<ffffffff810658fd>] ? sched_clock_local+0x1c/0x82
[<ffffffff810cd298>] ? do_writepages+0x24/0x30
[<ffffffff8111e625>] ? writeback_single_inode+0xaf/0x1d0
[<ffffffff8111eb88>] ? writeback_sb_inodes+0xab/0x134
[<ffffffff8111f542>] ? writeback_inodes_wb+0x12b/0x13d
[<ffffffff810cc920>] ? balance_dirty_pages_ratelimited_nr+0x2be/0x3d8
[<ffffffff810c412d>] ? iov_iter_copy_from_user_atomic+0x81/0xf1
[<ffffffff810c456c>] ? generic_file_buffered_write+0x1ff/0x267
[<ffffffff81048adf>] ? current_fs_time+0x27/0x2e
[<ffffffff810c593f>] ? __generic_file_aio_write+0x245/0x27a
[<ffffffff810658fd>] ? sched_clock_local+0x1c/0x82
[<ffffffff810c59d9>] ? generic_file_aio_write+0x65/0xbc
[<ffffffffa011dd57>] ? ext4_file_write+0x1f5/0x256 [ext4]
[<ffffffff81070983>] ? mark_lock+0x2d/0x22d
[<ffffffff8107279e>] ? __lock_acquire+0x3bc/0xd26
[<ffffffff810ff5b1>] ? do_sync_write+0xcb/0x108
[<ffffffff810700f3>] ? trace_hardirqs_off+0xd/0xf
[<ffffffff81065a72>] ? local_clock+0x41/0x5a
[<ffffffff8118e62f>] ? security_file_permission+0x2e/0x33
[<ffffffff810fffaf>] ? vfs_write+0xb1/0x10d
[<ffffffff81100724>] ? fget_light+0x57/0xf0
[<ffffffff81070e61>] ? trace_hardirqs_on_caller+0x11d/0x141
[<ffffffff811000d4>] ? sys_write+0x4d/0x77
[<ffffffff81002b82>] ? system_call_fastpath+0x16/0x1b
2011/3/4 Mike Snitzer <[email protected]>:
> I'm now hitting a lockdep issue, while running a 'for-2.6.39/stack-plug'
> kernel, when I try an fsync heavy workload to a request-based mpath
> device (the kernel ultimately goes down in flames, I've yet to look at
> the crashdump I took)
>
>
> =======================================================
> [ INFO: possible circular locking dependency detected ]
> 2.6.38-rc6-snitm+ #2
> -------------------------------------------------------
> ffsb/3110 is trying to acquire lock:
>  (&(&q->__queue_lock)->rlock){..-...}, at: [<ffffffff811b4c4d>] flush_plug_list+0xbc/0x135
>
> but task is already holding lock:
>  (&rq->lock){-.-.-.}, at: [<ffffffff8137132f>] schedule+0x16a/0x725
>
> which lock already depends on the new lock.
I hit this too. Can you check if attached debug patch fixes it?
Thanks,
Shaohua
On 2011-03-04 14:02, Shaohua Li wrote:
> 2011/3/4 Mike Snitzer <[email protected]>:
>> I'm now hitting a lockdep issue, while running a 'for-2.6.39/stack-plug'
>> kernel, when I try an fsync heavy workload to a request-based mpath
>> device (the kernel ultimately goes down in flames, I've yet to look at
>> the crashdump I took)
>>
>>
>> =======================================================
>> [ INFO: possible circular locking dependency detected ]
>> 2.6.38-rc6-snitm+ #2
>> -------------------------------------------------------
>> ffsb/3110 is trying to acquire lock:
>> (&(&q->__queue_lock)->rlock){..-...}, at: [<ffffffff811b4c4d>] flush_plug_list+0xbc/0x135
>>
>> but task is already holding lock:
>> (&rq->lock){-.-.-.}, at: [<ffffffff8137132f>] schedule+0x16a/0x725
>>
>> which lock already depends on the new lock.
> I hit this too. Can you check if attached debug patch fixes it?
I'll take a look at this. It would be really nice if we could move the
plug flush outside of the runqueue lock.
--
Jens Axboe
On Fri, Mar 04 2011 at 8:02am -0500,
Shaohua Li <[email protected]> wrote:
> 2011/3/4 Mike Snitzer <[email protected]>:
> > I'm now hitting a lockdep issue, while running a 'for-2.6.39/stack-plug'
> > kernel, when I try an fsync heavy workload to a request-based mpath
> > device (the kernel ultimately goes down in flames, I've yet to look at
> > the crashdump I took)
> >
> >
> > =======================================================
> > [ INFO: possible circular locking dependency detected ]
> > 2.6.38-rc6-snitm+ #2
> > -------------------------------------------------------
> > ffsb/3110 is trying to acquire lock:
> >  (&(&q->__queue_lock)->rlock){..-...}, at: [<ffffffff811b4c4d>] flush_plug_list+0xbc/0x135
> >
> > but task is already holding lock:
> >  (&rq->lock){-.-.-.}, at: [<ffffffff8137132f>] schedule+0x16a/0x725
> >
> > which lock already depends on the new lock.
> I hit this too. Can you check if attached debug patch fixes it?
Fixes it for me.
Thanks,
Mike
On 2011-03-04 22:43, Mike Snitzer wrote:
> On Fri, Mar 04 2011 at 8:02am -0500,
> Shaohua Li <[email protected]> wrote:
>
>> 2011/3/4 Mike Snitzer <[email protected]>:
>>> I'm now hitting a lockdep issue, while running a 'for-2.6.39/stack-plug'
>>> kernel, when I try an fsync heavy workload to a request-based mpath
>>> device (the kernel ultimately goes down in flames, I've yet to look at
>>> the crashdump I took)
>>>
>>>
>>> =======================================================
>>> [ INFO: possible circular locking dependency detected ]
>>> 2.6.38-rc6-snitm+ #2
>>> -------------------------------------------------------
>>> ffsb/3110 is trying to acquire lock:
>>> (&(&q->__queue_lock)->rlock){..-...}, at: [<ffffffff811b4c4d>] flush_plug_list+0xbc/0x135
>>>
>>> but task is already holding lock:
>>> (&rq->lock){-.-.-.}, at: [<ffffffff8137132f>] schedule+0x16a/0x725
>>>
>>> which lock already depends on the new lock.
>> I hit this too. Can you check if attached debug patch fixes it?
>
> Fixes it for me.
The preempt bit in block/ should not be needed. Can you check whether
it's the moving of the flush in sched.c that does the trick?
The problem with the current spot is that it's under the runqueue lock.
The problem with the modified variant is that we flush even if the task
is not going to sleep. We really just want to flush when it is going to
move out of the runqueue, but we want to do that outside of the runqueue
lock as well.
--
Jens Axboe
On Fri, Mar 04 2011 at 4:50pm -0500,
Jens Axboe <[email protected]> wrote:
> On 2011-03-04 22:43, Mike Snitzer wrote:
> > On Fri, Mar 04 2011 at 8:02am -0500,
> > Shaohua Li <[email protected]> wrote:
> >
> >> 2011/3/4 Mike Snitzer <[email protected]>:
> >>> I'm now hitting a lockdep issue, while running a 'for-2.6.39/stack-plug'
> >>> kernel, when I try an fsync heavy workload to a request-based mpath
> >>> device (the kernel ultimately goes down in flames, I've yet to look at
> >>> the crashdump I took)
> >>>
> >>>
> >>> =======================================================
> >>> [ INFO: possible circular locking dependency detected ]
> >>> 2.6.38-rc6-snitm+ #2
> >>> -------------------------------------------------------
> >>> ffsb/3110 is trying to acquire lock:
> >>> (&(&q->__queue_lock)->rlock){..-...}, at: [<ffffffff811b4c4d>] flush_plug_list+0xbc/0x135
> >>>
> >>> but task is already holding lock:
> >>> (&rq->lock){-.-.-.}, at: [<ffffffff8137132f>] schedule+0x16a/0x725
> >>>
> >>> which lock already depends on the new lock.
> >> I hit this too. Can you check if attached debug patch fixes it?
> >
> > Fixes it for me.
>
> The preempt bit in block/ should not be needed. Can you check whether
> it's the moving of the flush in sched.c that does the trick?
It works if I leave out the blk-core.c preempt change too.
> The problem with the current spot is that it's under the runqueue lock.
> The problem with the modified variant is that we flush even if the task
> is not going to sleep. We really just want to flush when it is going to
> move out of the runqueue, but we want to do that outside of the runqueue
> lock as well.
OK. So we still need a proper fix for this issue.
On 2011-03-04 23:27, Mike Snitzer wrote:
> On Fri, Mar 04 2011 at 4:50pm -0500,
> Jens Axboe <[email protected]> wrote:
>
>> On 2011-03-04 22:43, Mike Snitzer wrote:
>>> On Fri, Mar 04 2011 at 8:02am -0500,
>>> Shaohua Li <[email protected]> wrote:
>>>
>>>> 2011/3/4 Mike Snitzer <[email protected]>:
>>>>> I'm now hitting a lockdep issue, while running a 'for-2.6.39/stack-plug'
>>>>> kernel, when I try an fsync heavy workload to a request-based mpath
>>>>> device (the kernel ultimately goes down in flames, I've yet to look at
>>>>> the crashdump I took)
>>>>>
>>>>>
>>>>> =======================================================
>>>>> [ INFO: possible circular locking dependency detected ]
>>>>> 2.6.38-rc6-snitm+ #2
>>>>> -------------------------------------------------------
>>>>> ffsb/3110 is trying to acquire lock:
>>>>> (&(&q->__queue_lock)->rlock){..-...}, at: [<ffffffff811b4c4d>] flush_plug_list+0xbc/0x135
>>>>>
>>>>> but task is already holding lock:
>>>>> (&rq->lock){-.-.-.}, at: [<ffffffff8137132f>] schedule+0x16a/0x725
>>>>>
>>>>> which lock already depends on the new lock.
>>>> I hit this too. Can you check if attached debug patch fixes it?
>>>
>>> Fixes it for me.
>>
>> The preempt bit in block/ should not be needed. Can you check whether
>> it's the moving of the flush in sched.c that does the trick?
>
> It works if I leave out the blk-core.c preempt change too.
>
>> The problem with the current spot is that it's under the runqueue lock.
>> The problem with the modified variant is that we flush even if the task
>> is not going to sleep. We really just want to flush when it is going to
>> move out of the runqueue, but we want to do that outside of the runqueue
>> lock as well.
>
> OK. So we still need a proper fix for this issue.
Apparently so. Peter/Ingo, please shoot this one down in flames.
Summary:
- Need a way to trigger this flushing when a task is going to sleep
- It's currently done right before calling deactivate_task(). We know
the task is going to sleep here, but it's also under the runqueue
lock. Not good.
- In the new location, it's not completely clear to me whether we can
safely deref 'prev' or not. The usage of prev_state would seem to
indicate that we cannot, and as far as I can tell, prev could at this
point already potentially be running on another CPU.
Help? Peter, we talked about this in Tokyo in September. Initial
suggestion was to use preempt notifiers, which we can't because:
- runqueue lock is also held
- It's not unconditionally available, depends on config.
diff --git a/kernel/sched.c b/kernel/sched.c
index e806446..8581ad3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2826,6 +2826,14 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
        finish_lock_switch(rq, prev);

+       /*
+        * If this task has IO plugged, make sure it
+        * gets flushed out to the devices before we go
+        * to sleep
+        */
+       if (prev_state != TASK_RUNNING)
+               blk_flush_plug(prev);
+
        fire_sched_in_preempt_notifiers(current);
        if (mm)
                mmdrop(mm);
@@ -3973,14 +3981,6 @@ need_resched_nonpreemptible:
                                if (to_wakeup)
                                        try_to_wake_up_local(to_wakeup);
                        }
-                       /*
-                        * If this task has IO plugged, make sure it
-                        * gets flushed out to the devices before we go
-                        * to sleep
-                        */
-                       blk_flush_plug(prev);
-                       BUG_ON(prev->plug && !list_empty(&prev->plug->list));
-
                        deactivate_task(rq, prev, DEQUEUE_SLEEP);
                }
                switch_count = &prev->nvcsw;
--
Jens Axboe
2011/3/5 Jens Axboe <[email protected]>:
> On 2011-03-04 22:43, Mike Snitzer wrote:
On Fri, Mar 04 2011 at 8:02am -0500,
>> Shaohua Li <[email protected]> wrote:
>>
>>> 2011/3/4 Mike Snitzer <[email protected]>:
>>>> I'm now hitting a lockdep issue, while running a 'for-2.6.39/stack-plug'
>>>> kernel, when I try an fsync heavy workload to a request-based mpath
>>>> device (the kernel ultimately goes down in flames, I've yet to look at
>>>> the crashdump I took)
>>>>
>>>>
>>>> =======================================================
>>>> [ INFO: possible circular locking dependency detected ]
>>>> 2.6.38-rc6-snitm+ #2
>>>> -------------------------------------------------------
>>>> ffsb/3110 is trying to acquire lock:
>>>>  (&(&q->__queue_lock)->rlock){..-...}, at: [<ffffffff811b4c4d>] flush_plug_list+0xbc/0x135
>>>>
>>>> but task is already holding lock:
>>>>  (&rq->lock){-.-.-.}, at: [<ffffffff8137132f>] schedule+0x16a/0x725
>>>>
>>>> which lock already depends on the new lock.
>>> I hit this too. Can you check if attached debug patch fixes it?
>>
>> Fixes it for me.
>
> The preempt bit in block/ should not be needed. Can you check whether
> it's the moving of the flush in sched.c that does the trick?
Yes, it's not related to the lockdep issue, but I think we still need
it. If we get preempted in the middle of attempt_plug_merge() and the
preemption triggers a queue flush, we might hit a request whose bio
chain (request->biotail) is only partially updated. Am I missing
anything?
Thanks,
Shaohua
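
[Note: the debug patch referenced above was an attachment and is not reproduced
in this thread. The fragment below is only a hypothetical illustration of the
hazard Shaohua describes, with field names assumed from the 2.6.38-era block
layer: if the plug list could be flushed from a preemption point, the step that
links a new bio into an already-plugged request would need preemption disabled
around it, or the flush could dispatch a request whose bio chain is only half
linked.]

        /*
         * Hypothetical fragment (not the attached patch): the back-merge
         * step of plug merging, guarded so that a flush-on-preempt cannot
         * observe a half-updated request.  'req' is the plugged request,
         * 'bio' the bio being merged onto its tail.
         */
        preempt_disable();
        req->biotail->bi_next = bio;            /* chain bio after the current tail */
        req->biotail = bio;                     /* advance the tail pointer */
        req->__data_len += bio->bi_size;        /* account for the added payload */
        preempt_enable();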
On 2011-03-07 01:54, Shaohua Li wrote:
> 2011/3/5 Jens Axboe <[email protected]>:
>> On 2011-03-04 22:43, Mike Snitzer wrote:
>>> On Fri, Mar 04 2011 at 8:02am -0500,
>>> Shaohua Li <[email protected]> wrote:
>>>
>>>> 2011/3/4 Mike Snitzer <[email protected]>:
>>>>> I'm now hitting a lockdep issue, while running a 'for-2.6.39/stack-plug'
>>>>> kernel, when I try an fsync heavy workload to a request-based mpath
>>>>> device (the kernel ultimately goes down in flames, I've yet to look at
>>>>> the crashdump I took)
>>>>>
>>>>>
>>>>> =======================================================
>>>>> [ INFO: possible circular locking dependency detected ]
>>>>> 2.6.38-rc6-snitm+ #2
>>>>> -------------------------------------------------------
>>>>> ffsb/3110 is trying to acquire lock:
>>>>> (&(&q->__queue_lock)->rlock){..-...}, at: [<ffffffff811b4c4d>] flush_plug_list+0xbc/0x135
>>>>>
>>>>> but task is already holding lock:
>>>>> (&rq->lock){-.-.-.}, at: [<ffffffff8137132f>] schedule+0x16a/0x725
>>>>>
>>>>> which lock already depends on the new lock.
>>>> I hit this too. Can you check if attached debug patch fixes it?
>>>
>>> Fixes it for me.
>>
>> The preempt bit in block/ should not be needed. Can you check whether
>> it's the moving of the flush in sched.c that does the trick?
> Yes, it's not related to the lockdep issue, but I think we still need
> it. If we get preempted in the middle of attempt_plug_merge() and the
> preemption triggers a queue flush, we might hit a request whose bio
> chain (request->biotail) is only partially updated. Am I missing
> anything?
Ah, so it is needed with the other fix you proposed, since we do flush
on preempt then. If we only do the flush on going to sleep, then we
don't need that preemption disable in that section.
--
Jens Axboe
On Sat, 2011-03-05 at 21:54 +0100, Jens Axboe wrote:
>
> Apparently so. Peter/Ingo, please shoot this one down in flames.
> Summary:
>
> - Need a way to trigger this flushing when a task is going to sleep
> - It's currently done right before calling deactivate_task(). We know
> the task is going to sleep here, but it's also under the runqueue
> lock. Not good.
> - In the new location, it's not completely clear to me whether we can
> safely deref 'prev' or not. The usage of prev_state would seem to
> indicate that we cannot, and as far as I can tell, prev could at this
> point already potentially be running on another CPU.
>
> Help? Peter, we talked about this in Tokyo in September. Initial
> suggestion was to use preempt notifiers, which we can't because:
>
> - runqueue lock is also held
> - It's not unconditionally available, depends on config.
>
> diff --git a/kernel/sched.c b/kernel/sched.c
> index e806446..8581ad3 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -2826,6 +2826,14 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
> #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
> finish_lock_switch(rq, prev);
>
> + /*
> + * If this task has IO plugged, make sure it
> + * gets flushed out to the devices before we go
> + * to sleep
> + */
> + if (prev_state != TASK_RUNNING)
> + blk_flush_plug(prev);
> +
> fire_sched_in_preempt_notifiers(current);
> if (mm)
> mmdrop(mm);
> @@ -3973,14 +3981,6 @@ need_resched_nonpreemptible:
> if (to_wakeup)
> try_to_wake_up_local(to_wakeup);
> }
> - /*
> - * If this task has IO plugged, make sure it
> - * gets flushed out to the devices before we go
> - * to sleep
> - */
> - blk_flush_plug(prev);
> - BUG_ON(prev->plug && !list_empty(&prev->plug->list));
> -
> deactivate_task(rq, prev, DEQUEUE_SLEEP);
> }
> switch_count = &prev->nvcsw;
>
Right, so your new location is still under rq->lock for a number of
architectures (including x86). finish_lock_switch() doesn't actually
release the lock unless __ARCH_WANT_INTERRUPTS_ON_CTXSW ||
__ARCH_WANT_UNLOCKED_CTXSW (the former implies the latter since rq->lock
is IRQ-safe).
If you want a safe place to drop rq->lock (but keep in mind to keep IRQs
disabled there) and use prev, do something like the below. Both
pre_schedule() and idle_balance() can already drop the rq->lock so doing
it once more is quite all-right ;-)
Note that once you drop rq->lock prev->state can change to TASK_RUNNING
again so don't re-check that.
---
kernel/sched.c | 6 +++++-
1 files changed, 5 insertions(+), 1 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 655164e..99c5637 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4120,8 +4120,12 @@ need_resched_nonpreemptible:
                switch_count = &prev->nvcsw;
        }

+       if (prev->state != TASK_RUNNING) {
+               raw_spin_unlock(&rq->lock);
+               blk_flush_plug(prev);
+               raw_spin_lock(&rq->lock);
+       }
        pre_schedule(rq, prev);
-
        if (unlikely(!rq->nr_running))
                idle_balance(cpu, rq);
On 2011-03-07 11:23, Peter Zijlstra wrote:
> On Sat, 2011-03-05 at 21:54 +0100, Jens Axboe wrote:
>>
>> Apparently so. Peter/Ingo, please shoot this one down in flames.
>> Summary:
>>
>> - Need a way to trigger this flushing when a task is going to sleep
>> - It's currently done right before calling deactivate_task(). We know
>> the task is going to sleep here, but it's also under the runqueue
>> lock. Not good.
>> - In the new location, it's not completely clear to me whether we can
>> safely deref 'prev' or not. The usage of prev_state would seem to
>> indicate that we cannot, and as far as I can tell, prev could at this
>> point already potentially be running on another CPU.
>>
>> Help? Peter, we talked about this in Tokyo in September. Initial
>> suggestion was to use preempt notifiers, which we can't because:
>>
>> - runqueue lock is also held
>> - It's not unconditionally available, depends on config.
>>
>> diff --git a/kernel/sched.c b/kernel/sched.c
>> index e806446..8581ad3 100644
>> --- a/kernel/sched.c
>> +++ b/kernel/sched.c
>> @@ -2826,6 +2826,14 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
>> #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
>> finish_lock_switch(rq, prev);
>>
>> + /*
>> + * If this task has IO plugged, make sure it
>> + * gets flushed out to the devices before we go
>> + * to sleep
>> + */
>> + if (prev_state != TASK_RUNNING)
>> + blk_flush_plug(prev);
>> +
>> fire_sched_in_preempt_notifiers(current);
>> if (mm)
>> mmdrop(mm);
>> @@ -3973,14 +3981,6 @@ need_resched_nonpreemptible:
>> if (to_wakeup)
>> try_to_wake_up_local(to_wakeup);
>> }
>> - /*
>> - * If this task has IO plugged, make sure it
>> - * gets flushed out to the devices before we go
>> - * to sleep
>> - */
>> - blk_flush_plug(prev);
>> - BUG_ON(prev->plug && !list_empty(&prev->plug->list));
>> -
>> deactivate_task(rq, prev, DEQUEUE_SLEEP);
>> }
>> switch_count = &prev->nvcsw;
>>
>
> Right, so your new location is still under rq->lock for a number of
> architectures (including x86). finish_lock_switch() doesn't actually
> release the lock unless __ARCH_WANT_INTERRUPTS_ON_CTXSW ||
> __ARCH_WANT_UNLOCKED_CTXSW (the former implies the latter since rq->lock
> is IRQ-safe).
Ah, thanks for that.
> If you want a safe place to drop rq->lock (but keep in mind to keep IRQs
> disabled there) and use prev, do something like the below. Both
> pre_schedule() and idle_balance() can already drop the rq->lock so doing
> it once more is quite all-right ;-)
>
> Note that once you drop rq->lock prev->state can change to TASK_RUNNING
> again so don't re-check that.
So that's a problem. If I end up flushing this structure that sits on
the stack of the process, I cannot have it running on another CPU at
that time.
I need the process to be in such a state that it will not get scheduled
on another CPU before this has completed.
Is that even possible? If not, then I think the best solution is to
flush on preempt as well and hence move it up a bit like Shaohua posted
as well. This is also how it was originally done, but I wanted to avoid
that if at all possible.
--
Jens Axboe
On Mon, 2011-03-07 at 20:43 +0100, Jens Axboe wrote:
> On 2011-03-07 11:23, Peter Zijlstra wrote:
> > On Sat, 2011-03-05 at 21:54 +0100, Jens Axboe wrote:
> >>
> >> Apparently so. Peter/Ingo, please shoot this one down in flames.
> >> Summary:
> >>
> >> - Need a way to trigger this flushing when a task is going to sleep
> >> - It's currently done right before calling deactivate_task(). We know
> >> the task is going to sleep here, but it's also under the runqueue
> >> lock. Not good.
> >> - In the new location, it's not completely clear to me whether we can
> >> safely deref 'prev' or not. The usage of prev_state would seem to
> >> indicate that we cannot, and as far as I can tell, prev could at this
> >> point already potentially be running on another CPU.
> >>
> >> Help? Peter, we talked about this in Tokyo in September. Initial
> >> suggestion was to use preempt notifiers, which we can't because:
> >>
> >> - runqueue lock is also held
> >> - It's not unconditionally available, depends on config.
> >>
> >> diff --git a/kernel/sched.c b/kernel/sched.c
> >> index e806446..8581ad3 100644
> >> --- a/kernel/sched.c
> >> +++ b/kernel/sched.c
> >> @@ -2826,6 +2826,14 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
> >> #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
> >> finish_lock_switch(rq, prev);
> >>
> >> + /*
> >> + * If this task has IO plugged, make sure it
> >> + * gets flushed out to the devices before we go
> >> + * to sleep
> >> + */
> >> + if (prev_state != TASK_RUNNING)
> >> + blk_flush_plug(prev);
> >> +
> >> fire_sched_in_preempt_notifiers(current);
> >> if (mm)
> >> mmdrop(mm);
> >> @@ -3973,14 +3981,6 @@ need_resched_nonpreemptible:
> >> if (to_wakeup)
> >> try_to_wake_up_local(to_wakeup);
> >> }
> >> - /*
> >> - * If this task has IO plugged, make sure it
> >> - * gets flushed out to the devices before we go
> >> - * to sleep
> >> - */
> >> - blk_flush_plug(prev);
> >> - BUG_ON(prev->plug && !list_empty(&prev->plug->list));
> >> -
> >> deactivate_task(rq, prev, DEQUEUE_SLEEP);
> >> }
> >> switch_count = &prev->nvcsw;
> >>
> >
> > Right, so your new location is still under rq->lock for a number of
> > architectures (including x86). finish_lock_switch() doesn't actually
> > release the lock unless __ARCH_WANT_INTERRUPTS_ON_CTXSW ||
> > __ARCH_WANT_UNLOCKED_CTXSW (the former implies the latter since rq->lock
> > is IRQ-safe).
>
> Ah, thanks for that.
>
> > If you want a safe place to drop rq->lock (but keep in mind to keep IRQs
> > disabled there) and use prev, do something like the below. Both
> > pre_schedule() and idle_balance() can already drop the rq->lock so doing
> > it once more is quite all-right ;-)
> >
> > Note that once you drop rq->lock prev->state can change to TASK_RUNNING
> > again so don't re-check that.
>
> So that's a problem. If I end up flushing this structure that sits on
> the stack of the process, I cannot have it running on another CPU at
> that time.
>
> I need the process to be in such a state that it will not get scheduled
> on another CPU before this has completed.
>
> Is that even possible?
Yes, if prev will be flipped back to TASK_RUNNING it will still stay on
that cpu, it will not migrate until the cpu that schedules it away (the
cpu you're on) will have flipped rq->curr, and that happens way after
this point. So you're good to go, just don't rely on ->state once you
release rq->lock.
On 2011-03-07 21:41, Peter Zijlstra wrote:
> On Mon, 2011-03-07 at 20:43 +0100, Jens Axboe wrote:
>> On 2011-03-07 11:23, Peter Zijlstra wrote:
>>> On Sat, 2011-03-05 at 21:54 +0100, Jens Axboe wrote:
>>>>
>>>> Apparently so. Peter/Ingo, please shoot this one down in flames.
>>>> Summary:
>>>>
>>>> - Need a way to trigger this flushing when a task is going to sleep
>>>> - It's currently done right before calling deactivate_task(). We know
>>>> the task is going to sleep here, but it's also under the runqueue
>>>> lock. Not good.
>>>> - In the new location, it's not completely clear to me whether we can
>>>> safely deref 'prev' or not. The usage of prev_state would seem to
>>>> indicate that we cannot, and as far as I can tell, prev could at this
>>>> point already potentially be running on another CPU.
>>>>
>>>> Help? Peter, we talked about this in Tokyo in September. Initial
>>>> suggestion was to use preempt notifiers, which we can't because:
>>>>
>>>> - runqueue lock is also held
>>>> - It's not unconditionally available, depends on config.
>>>>
>>>> diff --git a/kernel/sched.c b/kernel/sched.c
>>>> index e806446..8581ad3 100644
>>>> --- a/kernel/sched.c
>>>> +++ b/kernel/sched.c
>>>> @@ -2826,6 +2826,14 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
>>>> #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
>>>> finish_lock_switch(rq, prev);
>>>>
>>>> + /*
>>>> + * If this task has IO plugged, make sure it
>>>> + * gets flushed out to the devices before we go
>>>> + * to sleep
>>>> + */
>>>> + if (prev_state != TASK_RUNNING)
>>>> + blk_flush_plug(prev);
>>>> +
>>>> fire_sched_in_preempt_notifiers(current);
>>>> if (mm)
>>>> mmdrop(mm);
>>>> @@ -3973,14 +3981,6 @@ need_resched_nonpreemptible:
>>>> if (to_wakeup)
>>>> try_to_wake_up_local(to_wakeup);
>>>> }
>>>> - /*
>>>> - * If this task has IO plugged, make sure it
>>>> - * gets flushed out to the devices before we go
>>>> - * to sleep
>>>> - */
>>>> - blk_flush_plug(prev);
>>>> - BUG_ON(prev->plug && !list_empty(&prev->plug->list));
>>>> -
>>>> deactivate_task(rq, prev, DEQUEUE_SLEEP);
>>>> }
>>>> switch_count = &prev->nvcsw;
>>>>
>>>
>>> Right, so your new location is still under rq->lock for a number of
>>> architectures (including x86). finish_lock_switch() doesn't actually
>>> release the lock unless __ARCH_WANT_INTERRUPTS_ON_CTXSW ||
>>> __ARCH_WANT_UNLOCKED_CTXSW (the former implies the latter since rq->lock
>>> is IRQ-safe).
>>
>> Ah, thanks for that.
>>
>>> If you want a safe place to drop rq->lock (but keep in mind to keep IRQs
>>> disabled there) and use prev, do something like the below. Both
>>> pre_schedule() and idle_balance() can already drop the rq->lock so doing
>>> it once more is quite all-right ;-)
>>>
>>> Note that once you drop rq->lock prev->state can change to TASK_RUNNING
>>> again so don't re-check that.
>>
>> So that's a problem. If I end up flushing this structure that sits on
>> the stack of the process, I cannot have it running on another CPU at
>> that time.
>>
>> I need the process to be in such a state that it will not get scheduled
>> on another CPU before this has completed.
>>
>> Is that even possible?
>
> Yes, if prev will be flipped back to TASK_RUNNING it will still stay on
> that cpu, it will not migrate until the cpu that schedules it away (the
> cpu you're on) will have flipped rq->curr, and that happens way after
> this point. So you're good to go, just don't rely on ->state once you
> release rq->lock.
Great, that'll work for me! Your patch should work as-is, then. Thanks
Peter.
--
Jens Axboe
On Mon, 2011-03-07 at 21:46 +0100, Jens Axboe wrote:
>
> Great, that'll work for me! Your patch should work as-is, then. Thanks
> Peter.
Well I think it would be good to write it like:
if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) {
	raw_spin_unlock(&rq->lock);
	blk_flush_plug(prev);
	raw_spin_lock(&rq->lock);
}
To avoid flipping that lock when we don't have to.
On 2011-03-08 10:38, Peter Zijlstra wrote:
> On Mon, 2011-03-07 at 21:46 +0100, Jens Axboe wrote:
>>
>> Great, that'll work for me! Your patch should work as-is, then. Thanks
>> Peter.
>
> Well I think it would be good to write it like:
>
> if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) {
> 	raw_spin_unlock(&rq->lock);
> 	blk_flush_plug(prev);
> 	raw_spin_lock(&rq->lock);
> }
>
> To avoid flipping that lock when we don't have to.
Yes, good point. In any case the need to flush will be an unlikely event,
so saving the lock/unlock dance for when we really need it is a good
optimization.
--
Jens Axboe
On 2011-03-03 22:23, Mike Snitzer wrote:
>> diff --git a/block/blk-flush.c b/block/blk-flush.c
>> index 54b123d..c0a07aa 100644
>> --- a/block/blk-flush.c
>> +++ b/block/blk-flush.c
>> @@ -59,7 +59,6 @@ static struct request *blk_flush_complete_seq(struct request_queue *q,
>> static void blk_flush_complete_seq_end_io(struct request_queue *q,
>> unsigned seq, int error)
>> {
>> - bool was_empty = elv_queue_empty(q);
>> struct request *next_rq;
>>
>> next_rq = blk_flush_complete_seq(q, seq, error);
>> @@ -68,7 +67,7 @@ static void blk_flush_complete_seq_end_io(struct request_queue *q,
>> * Moving a request silently to empty queue_head may stall the
>> * queue. Kick the queue in those cases.
>> */
>> - if (was_empty && next_rq)
>> + if (next_rq)
>> __blk_run_queue(q);
>> }
>>
> ...
>> diff --git a/block/elevator.c b/block/elevator.c
>> index a9fe237..d5d17a4 100644
>> --- a/block/elevator.c
>> +++ b/block/elevator.c
>> @@ -619,8 +619,6 @@ void elv_quiesce_end(struct request_queue *q)
> ...
>> -int elv_queue_empty(struct request_queue *q)
>> -{
>> - struct elevator_queue *e = q->elevator;
>> -
>> - if (!list_empty(&q->queue_head))
>> - return 0;
>> -
>> - if (e->ops->elevator_queue_empty_fn)
>> - return e->ops->elevator_queue_empty_fn(q);
>> -
>> - return 1;
>> -}
>> -EXPORT_SYMBOL(elv_queue_empty);
>> -
>
> Your latest 'for-2.6.39/stack-unplug' rebase (commit 7703acb01e)
> misses removing a call to elv_queue_empty() in
> block/blk-flush.c:flush_data_end_io()
>
> CC block/blk-flush.o
> block/blk-flush.c: In function ‘flush_data_end_io’:
> block/blk-flush.c:266: error: implicit declaration of function ‘elv_queue_empty’
Thanks, also fixed now.
--
Jens Axboe
On 2011-03-03 23:13, Mike Snitzer wrote:
> I'm now hitting a lockdep issue, while running a 'for-2.6.39/stack-plug'
> kernel, when I try an fsync heavy workload to a request-based mpath
> device (the kernel ultimately goes down in flames, I've yet to look at
> the crashdump I took)
Mike, can you re-run with the current stack-plug branch? I've fixed the
!CONFIG_BLOCK and rebase issues, and also added a change for this flush
on schedule event. It's run outside of the runqueue lock now, so
hopefully that should solve this one.
--
Jens Axboe
On Tue, Mar 08 2011 at 7:16am -0500,
Jens Axboe <[email protected]> wrote:
> On 2011-03-03 23:13, Mike Snitzer wrote:
> > I'm now hitting a lockdep issue, while running a 'for-2.6.39/stack-plug'
> > kernel, when I try an fsync heavy workload to a request-based mpath
> > device (the kernel ultimately goes down in flames, I've yet to look at
> > the crashdump I took)
>
> Mike, can you re-run with the current stack-plug branch? I've fixed the
> !CONFIG_BLOCK and rebase issues, and also added a change for this flush
> on schedule event. It's run outside of the runqueue lock now, so
> hopefully that should solve this one.
Works for me, thanks.
Mike
On 2011-03-08 21:21, Mike Snitzer wrote:
> On Tue, Mar 08 2011 at 7:16am -0500,
> Jens Axboe <[email protected]> wrote:
>
>> On 2011-03-03 23:13, Mike Snitzer wrote:
>>> I'm now hitting a lockdep issue, while running a 'for-2.6.39/stack-plug'
>>> kernel, when I try an fsync heavy workload to a request-based mpath
>>> device (the kernel ultimately goes down in flames, I've yet to look at
>>> the crashdump I took)
>>
>> Mike, can you re-run with the current stack-plug branch? I've fixed the
>> !CONFIG_BLOCK and rebase issues, and also added a change for this flush
>> on schedule event. It's run outside of the runqueue lock now, so
>> hopefully that should solve this one.
>
> Works for me, thanks.
Super, thanks! Out of curiosity, did you use dm/md?
--
Jens Axboe
Jens Axboe <[email protected]> writes:
> On 2011-03-08 21:21, Mike Snitzer wrote:
>> On Tue, Mar 08 2011 at 7:16am -0500,
>> Jens Axboe <[email protected]> wrote:
>>
>>> On 2011-03-03 23:13, Mike Snitzer wrote:
>>>> I'm now hitting a lockdep issue, while running a 'for-2.6.39/stack-plug'
>>>> kernel, when I try an fsync heavy workload to a request-based mpath
>>>> device (the kernel ultimately goes down in flames, I've yet to look at
>>>> the crashdump I took)
>>>
>>> Mike, can you re-run with the current stack-plug branch? I've fixed the
>>> !CONFIG_BLOCK and rebase issues, and also added a change for this flush
>>> on schedule event. It's run outside of the runqueue lock now, so
>>> hopefully that should solve this one.
>>
>> Works for me, thanks.
>
> Super, thanks! Out of curiosity, did you use dm/md?
mm/memory-failure.c: In function 'hwpoison_user_mappings':
mm/memory-failure.c:948: error: implicit declaration of function 'lock_page_nosync'
You missed a conversion of lock_page_nosync -> lock_page.
Cheers,
Jeff
On Tue, Mar 08 2011 at 3:27pm -0500,
Jens Axboe <[email protected]> wrote:
> On 2011-03-08 21:21, Mike Snitzer wrote:
> > On Tue, Mar 08 2011 at 7:16am -0500,
> > Jens Axboe <[email protected]> wrote:
> >
> >> On 2011-03-03 23:13, Mike Snitzer wrote:
> >>> I'm now hitting a lockdep issue, while running a 'for-2.6.39/stack-plug'
> >>> kernel, when I try an fsync heavy workload to a request-based mpath
> >>> device (the kernel ultimately goes down in flames, I've yet to look at
> >>> the crashdump I took)
> >>
> >> Mike, can you re-run with the current stack-plug branch? I've fixed the
> >> !CONFIG_BLOCK and rebase issues, and also added a change for this flush
> >> on schedule event. It's run outside of the runqueue lock now, so
> >> hopefully that should solve this one.
> >
> > Works for me, thanks.
>
> Super, thanks! Out of curiosity, did you use dm/md?
Yes, I've been using a request-based DM multipath device.
On 2011-03-08 22:36, Jeff Moyer wrote:
> Jens Axboe <[email protected]> writes:
>
>> On 2011-03-08 21:21, Mike Snitzer wrote:
>>> On Tue, Mar 08 2011 at 7:16am -0500,
>>> Jens Axboe <[email protected]> wrote:
>>>
>>>> On 2011-03-03 23:13, Mike Snitzer wrote:
>>>>> I'm now hitting a lockdep issue, while running a 'for-2.6.39/stack-plug'
>>>>> kernel, when I try an fsync heavy workload to a request-based mpath
>>>>> device (the kernel ultimately goes down in flames, I've yet to look at
>>>>> the crashdump I took)
>>>>
>>>> Mike, can you re-run with the current stack-plug branch? I've fixed the
>>>> !CONFIG_BLOCK and rebase issues, and also added a change for this flush
>>>> on schedule event. It's run outside of the runqueue lock now, so
>>>> hopefully that should solve this one.
>>>
>>> Works for me, thanks.
>>
>> Super, thanks! Out of curiosity, did you use dm/md?
>
> mm/memory-failure.c: In function 'hwpoison_user_mappings':
> mm/memory-failure.c:948: error: implicit declaration of function 'lock_page_nosync'
>
> You missed a conversion of lock_page_nosync -> lock_page.
Thanks Jeff, I guess I should run a full allmodconfig/allyesconfig build again
just to check that everything is still up to date.
--
Jens Axboe
On Tue, Mar 08 2011 at 5:05pm -0500,
Mike Snitzer <[email protected]> wrote:
> On Tue, Mar 08 2011 at 3:27pm -0500,
> Jens Axboe <[email protected]> wrote:
>
> > On 2011-03-08 21:21, Mike Snitzer wrote:
> > > On Tue, Mar 08 2011 at 7:16am -0500,
> > > Jens Axboe <[email protected]> wrote:
> > >
> > >> On 2011-03-03 23:13, Mike Snitzer wrote:
> > >>> I'm now hitting a lockdep issue, while running a 'for-2.6.39/stack-plug'
> > >>> kernel, when I try an fsync heavy workload to a request-based mpath
> > >>> device (the kernel ultimately goes down in flames, I've yet to look at
> > >>> the crashdump I took)
> > >>
> > >> Mike, can you re-run with the current stack-plug branch? I've fixed the
> > >> !CONFIG_BLOCK and rebase issues, and also added a change for this flush
> > >> on schedule event. It's run outside of the runqueue lock now, so
> > >> hopefully that should solve this one.
> > >
> > > Works for me, thanks.
> >
> > Super, thanks! Out of curiosity, did you use dm/md?
>
> Yes, I've been using a request-based DM multipath device.
Hi Jens,
I just got to reviewing your onstack plugging DM changes (I looked at
the core block layer changes for additional context and also had a brief
look at MD).
I need to put more time into the review of all this code, but one thing
that is immediately apparent is that after these changes DM only has one
onstack plug/unplug -- in drivers/md/dm-kcopyd.c:do_work().
You've removed a considerable amount of implicit plug/explicit unplug
code from DM (and obviously elsewhere but I have my DM hat on ;).
First question: is relying on higher-level (aio, fs, read-ahead)
explicit plugging/unplugging sufficient? Seems odd to not have the
control/need to unplug the DM device upon resume (after a suspend).
(this naive question/concern stems from me needing to understand the
core block layer's onstack plugging changes better)
(but if those higher-level explicit onstack plug changes make all this
code removal possible, shouldn't those commits come before changing
underlying block drivers like DM, MD, etc?)
I noticed that drivers/md/dm-raid1.c:do_mirror() seems to follow the same
pattern as drivers/md/dm-kcopyd.c:do_work(), so rather than removing
dm_table_unplug_all() shouldn't it be replaced with a
blk_start_plug()/blk_finish_plug() pair, as in the sketch below?
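A minimal sketch of that pattern (the do_mirror() body shown is illustrative
only, not the actual dm-raid1 code):

	static void do_mirror(struct work_struct *work)
	{
		struct blk_plug plug;

		blk_start_plug(&plug);
		/*
		 * The existing mirror processing (reads, writes, failures)
		 * goes here; any I/O submitted while the plug is active is
		 * held back on the on-stack plug ...
		 */
		blk_finish_plug(&plug);	/* ... and flushed out here */
	}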
Also, in your MD changes, you removed all calls to md_unplug() but
didn't remove md_unplug(). Seems it should be removed along with the
'plug' member of 'struct mddev_t'? Neil?
Thanks,
Mike
On Tue, Mar 08 2011 at 5:05pm -0500,
Mike Snitzer <[email protected]> wrote:
> On Tue, Mar 08 2011 at 3:27pm -0500,
> Jens Axboe <[email protected]> wrote:
>
> > On 2011-03-08 21:21, Mike Snitzer wrote:
> > > On Tue, Mar 08 2011 at 7:16am -0500,
> > > Jens Axboe <[email protected]> wrote:
> > >
> > >> On 2011-03-03 23:13, Mike Snitzer wrote:
> > >>> I'm now hitting a lockdep issue, while running a 'for-2.6.39/stack-plug'
> > >>> kernel, when I try an fsync heavy workload to a request-based mpath
> > >>> device (the kernel ultimately goes down in flames, I've yet to look at
> > >>> the crashdump I took)
> > >>
> > >> Mike, can you re-run with the current stack-plug branch? I've fixed the
> > >> !CONFIG_BLOCK and rebase issues, and also added a change for this flush
> > >> on schedule event. It's run outside of the runqueue lock now, so
> > >> hopefully that should solve this one.
> > >
> > > Works for me, thanks.
> >
> > Super, thanks! Out of curiosity, did you use dm/md?
>
> Yes, I've been using a request-based DM multipath device.
Against latest 'for-2.6.39/core', I just ran that same fsync heavy
workload against XFS (ontop of a DM multipath volume). ffsb induced the
following hangs (ripple effect causing NetworkManager to get hung up on
this data-only XFS volume, etc):
XFS mounting filesystem dm-0
Ending clean XFS mount for filesystem: dm-0
mount used greatest stack depth: 3296 bytes left
ffsb used greatest stack depth: 2592 bytes left
INFO: task kswapd0:23 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
kswapd0 D ffff880037b8f6e0 3656 23 2 0x00000000
ffff880037b8f6d0 0000000000000046 ffff880037b8f630 ffffffff8107012f
ffff880037b8e010 ffff880037b8ffd8 00000000001d21c0 ffff880037b90600
ffff880037b90998 ffff880037b90990 00000000001d21c0 00000000001d21c0
Call Trace:
[<ffffffff8107012f>] ? trace_hardirqs_off+0xd/0xf
[<ffffffffa013e958>] xlog_wait+0x60/0x78 [xfs]
[<ffffffff810404de>] ? default_wake_function+0x0/0x14
[<ffffffff81373c5f>] ? _raw_spin_lock+0x62/0x69
[<ffffffffa013f874>] xlog_state_get_iclog_space+0x9e/0x22c [xfs]
[<ffffffffa013fb73>] xlog_write+0x171/0x4ae [xfs]
[<ffffffffa0150df7>] ? kmem_alloc+0x69/0xb1 [xfs]
[<ffffffff810fad18>] ? __kmalloc+0x14e/0x160
[<ffffffffa013ff04>] xfs_log_write+0x54/0x7e [xfs]
[<ffffffffa014b5b0>] xfs_trans_commit_iclog+0x195/0x2d8 [xfs]
[<ffffffff81070ece>] ? trace_hardirqs_on+0xd/0xf
[<ffffffffa014b7bc>] _xfs_trans_commit+0xc9/0x206 [xfs]
[<ffffffffa0138a18>] xfs_itruncate_finish+0x1fd/0x2bd [xfs]
[<ffffffffa014f202>] xfs_free_eofblocks+0x1ac/0x1f1 [xfs]
[<ffffffffa014f707>] xfs_inactive+0x108/0x3a6 [xfs]
[<ffffffff8106ff27>] ? lockdep_init_map+0xa6/0x11b
[<ffffffffa015a87f>] xfs_fs_evict_inode+0xf6/0xfe [xfs]
[<ffffffff81114766>] evict+0x24/0x8c
[<ffffffff811147ff>] dispose_list+0x31/0xaf
[<ffffffff81114e92>] shrink_icache_memory+0x1e5/0x215
[<ffffffff810d1e14>] shrink_slab+0xe0/0x164
[<ffffffff810d3e5b>] kswapd+0x5e7/0x9dc
[<ffffffff810d3874>] ? kswapd+0x0/0x9dc
[<ffffffff8105fb7c>] kthread+0xa0/0xa8
[<ffffffff81070e9d>] ? trace_hardirqs_on_caller+0x11d/0x141
[<ffffffff81003a24>] kernel_thread_helper+0x4/0x10
[<ffffffff813749d4>] ? restore_args+0x0/0x30
[<ffffffff8105fadc>] ? kthread+0x0/0xa8
[<ffffffff81003a20>] ? kernel_thread_helper+0x0/0x10
4 locks held by kswapd0/23:
#0: (shrinker_rwsem){++++..}, at: [<ffffffff810d1d71>] shrink_slab+0x3d/0x164
#1: (iprune_sem){++++.-}, at: [<ffffffff81114cf7>] shrink_icache_memory+0x4a/0x215
#2: (xfs_iolock_reclaimable){+.+.-.}, at: [<ffffffffa013615d>] xfs_ilock+0x30/0xb9 [xfs]
#3: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
INFO: task NetworkManager:958 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
NetworkManager D ffff88007a481288 3312 958 1 0x00000000
ffff88007a481278 0000000000000046 ffff88007a4811d8 ffffffff8107012f
ffff88007a480010 ffff88007a481fd8 00000000001d21c0 ffff88007b4f0f80
ffff88007b4f1318 ffff88007b4f1310 00000000001d21c0 00000000001d21c0
Call Trace:
[<ffffffff8107012f>] ? trace_hardirqs_off+0xd/0xf
[<ffffffffa013e958>] xlog_wait+0x60/0x78 [xfs]
[<ffffffff810404de>] ? default_wake_function+0x0/0x14
[<ffffffff81373c5f>] ? _raw_spin_lock+0x62/0x69
[<ffffffffa013f874>] xlog_state_get_iclog_space+0x9e/0x22c [xfs]
[<ffffffffa013fb73>] xlog_write+0x171/0x4ae [xfs]
[<ffffffffa0150df7>] ? kmem_alloc+0x69/0xb1 [xfs]
[<ffffffff810fad18>] ? __kmalloc+0x14e/0x160
[<ffffffffa013ff04>] xfs_log_write+0x54/0x7e [xfs]
[<ffffffffa014b5b0>] xfs_trans_commit_iclog+0x195/0x2d8 [xfs]
[<ffffffff81070c11>] ? mark_held_locks+0x52/0x70
[<ffffffff810fa9f7>] ? kmem_cache_alloc+0xd1/0x145
[<ffffffff81070e9d>] ? trace_hardirqs_on_caller+0x11d/0x141
[<ffffffff81070ece>] ? trace_hardirqs_on+0xd/0xf
[<ffffffffa014b7bc>] _xfs_trans_commit+0xc9/0x206 [xfs]
[<ffffffffa011c5fa>] xfs_bmap_finish+0x87/0x16a [xfs]
[<ffffffffa01389b9>] xfs_itruncate_finish+0x19e/0x2bd [xfs]
[<ffffffffa014f202>] xfs_free_eofblocks+0x1ac/0x1f1 [xfs]
[<ffffffffa014f707>] xfs_inactive+0x108/0x3a6 [xfs]
[<ffffffff8106ff27>] ? lockdep_init_map+0xa6/0x11b
[<ffffffffa015a87f>] xfs_fs_evict_inode+0xf6/0xfe [xfs]
[<ffffffff81114766>] evict+0x24/0x8c
[<ffffffff811147ff>] dispose_list+0x31/0xaf
[<ffffffff81114e92>] shrink_icache_memory+0x1e5/0x215
[<ffffffff810d1e14>] shrink_slab+0xe0/0x164
[<ffffffff810d3282>] try_to_free_pages+0x27f/0x495
[<ffffffff810cb3fc>] __alloc_pages_nodemask+0x4e3/0x767
[<ffffffff810700a3>] ? trace_hardirqs_off_caller+0x1f/0x9e
[<ffffffff810f5b14>] alloc_pages_current+0xa7/0xca
[<ffffffff810c515c>] __page_cache_alloc+0x85/0x8c
[<ffffffff810cd420>] __do_page_cache_readahead+0xdb/0x1df
[<ffffffff810cd545>] ra_submit+0x21/0x25
[<ffffffff810c66e3>] filemap_fault+0x176/0x396
[<ffffffff81070ece>] ? trace_hardirqs_on+0xd/0xf
[<ffffffff810e15e7>] __do_fault+0x54/0x354
[<ffffffff810709bf>] ? mark_lock+0x2d/0x22d
[<ffffffff810e2493>] handle_pte_fault+0x2cf/0x6e8
[<ffffffff810e004e>] ? __pte_alloc+0xc3/0xd0
[<ffffffff810e2986>] handle_mm_fault+0xda/0xed
[<ffffffff81377c28>] do_page_fault+0x3b4/0x3d6
[<ffffffff8118e6de>] ? fsnotify_perm+0x69/0x75
[<ffffffff8118e74b>] ? security_file_permission+0x2e/0x33
[<ffffffff813739e6>] ? trace_hardirqs_off_thunk+0x3a/0x3c
[<ffffffff81374be5>] page_fault+0x25/0x30
5 locks held by NetworkManager/958:
#0: (&mm->mmap_sem){++++++}, at: [<ffffffff81377a36>] do_page_fault+0x1c2/0x3d6
#1: (shrinker_rwsem){++++..}, at: [<ffffffff810d1d71>] shrink_slab+0x3d/0x164
#2: (iprune_sem){++++.-}, at: [<ffffffff81114cf7>] shrink_icache_memory+0x4a/0x215
#3: (xfs_iolock_reclaimable){+.+.-.}, at: [<ffffffffa013615d>] xfs_ilock+0x30/0xb9 [xfs]
#4: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
INFO: task xfssyncd/dm-0:1346 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
xfssyncd/dm-0 D ffff880072cb1a20 4824 1346 2 0x00000000
ffff880072cb1a10 0000000000000046 ffff880072cb1970 ffffffff8107012f
ffff880072cb0010 ffff880072cb1fd8 00000000001d21c0 ffff88007b22ca00
ffff88007b22cd98 ffff88007b22cd90 00000000001d21c0 00000000001d21c0
Call Trace:
[<ffffffff8107012f>] ? trace_hardirqs_off+0xd/0xf
[<ffffffffa013e958>] xlog_wait+0x60/0x78 [xfs]
[<ffffffff810404de>] ? default_wake_function+0x0/0x14
[<ffffffff81373c5f>] ? _raw_spin_lock+0x62/0x69
[<ffffffffa013f874>] xlog_state_get_iclog_space+0x9e/0x22c [xfs]
[<ffffffffa013fb73>] xlog_write+0x171/0x4ae [xfs]
[<ffffffff81065905>] ? sched_clock_local+0x1c/0x82
[<ffffffff810700a3>] ? trace_hardirqs_off_caller+0x1f/0x9e
[<ffffffff810709bf>] ? mark_lock+0x2d/0x22d
[<ffffffffa013ff04>] xfs_log_write+0x54/0x7e [xfs]
[<ffffffffa014b5b0>] xfs_trans_commit_iclog+0x195/0x2d8 [xfs]
[<ffffffff81070ece>] ? trace_hardirqs_on+0xd/0xf
[<ffffffffa0150cdd>] ? kmem_zone_alloc+0x69/0xb1 [xfs]
[<ffffffffa014af66>] ? xfs_trans_add_item+0x50/0x5c [xfs]
[<ffffffffa014b7bc>] _xfs_trans_commit+0xc9/0x206 [xfs]
[<ffffffffa0133289>] xfs_fs_log_dummy+0x76/0x7d [xfs]
[<ffffffffa015cd3d>] xfs_sync_worker+0x37/0x6f [xfs]
[<ffffffffa015ccb0>] xfssyncd+0x15b/0x1b1 [xfs]
[<ffffffffa015cb55>] ? xfssyncd+0x0/0x1b1 [xfs]
[<ffffffff8105fb7c>] kthread+0xa0/0xa8
[<ffffffff81070e9d>] ? trace_hardirqs_on_caller+0x11d/0x141
[<ffffffff81003a24>] kernel_thread_helper+0x4/0x10
[<ffffffff813749d4>] ? restore_args+0x0/0x30
[<ffffffff8105fadc>] ? kthread+0x0/0xa8
[<ffffffff81003a20>] ? kernel_thread_helper+0x0/0x10
no locks held by xfssyncd/dm-0/1346.
INFO: task ffsb:1355 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
ffsb D 000000010002503a 3648 1355 1322 0x00000000
ffff88007baffae8 0000000000000046 ffff88007baffa48 ffffffff00000000
ffff88007bafe010 ffff88007bafffd8 00000000001d21c0 ffff880071df4680
ffff880071df4a18 ffff880071df4a10 00000000001d21c0 00000000001d21c0
Call Trace:
[<ffffffffa013e958>] xlog_wait+0x60/0x78 [xfs]
[<ffffffff810404de>] ? default_wake_function+0x0/0x14
[<ffffffff81373c5f>] ? _raw_spin_lock+0x62/0x69
[<ffffffffa013f874>] xlog_state_get_iclog_space+0x9e/0x22c [xfs]
[<ffffffffa013fb73>] xlog_write+0x171/0x4ae [xfs]
[<ffffffff810727da>] ? __lock_acquire+0x3bc/0xd26
[<ffffffff81065a7a>] ? local_clock+0x41/0x5a
[<ffffffff81024167>] ? pvclock_clocksource_read+0x4b/0xb4
[<ffffffffa013ff04>] xfs_log_write+0x54/0x7e [xfs]
[<ffffffff81065905>] ? sched_clock_local+0x1c/0x82
[<ffffffffa014b5b0>] xfs_trans_commit_iclog+0x195/0x2d8 [xfs]
[<ffffffff81070c11>] ? mark_held_locks+0x52/0x70
[<ffffffff810fa9f7>] ? kmem_cache_alloc+0xd1/0x145
[<ffffffff81070e9d>] ? trace_hardirqs_on_caller+0x11d/0x141
[<ffffffff81070ece>] ? trace_hardirqs_on+0xd/0xf
[<ffffffffa0150cdd>] ? kmem_zone_alloc+0x69/0xb1 [xfs]
[<ffffffffa014b7bc>] _xfs_trans_commit+0xc9/0x206 [xfs]
[<ffffffffa01566f8>] xfs_file_fsync+0x166/0x1e6 [xfs]
[<ffffffff81122a8b>] vfs_fsync_range+0x54/0x7c
[<ffffffff81122b15>] vfs_fsync+0x1c/0x1e
[<ffffffff81122b45>] do_fsync+0x2e/0x43
[<ffffffff81122b81>] sys_fsync+0x10/0x14
[<ffffffff81002b82>] system_call_fastpath+0x16/0x1b
2 locks held by ffsb/1355:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
INFO: task ffsb:1364 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
ffsb D 0000000100024ff6 3776 1364 1322 0x00000000
ffff880027d25ae8 0000000000000046 ffff880027d25a48 ffffffff00000000
ffff880027d24010 ffff880027d25fd8 00000000001d21c0 ffff88002839c8c0
ffff88002839cc58 ffff88002839cc50 00000000001d21c0 00000000001d21c0
Call Trace:
[<ffffffffa013e958>] xlog_wait+0x60/0x78 [xfs]
[<ffffffff810404de>] ? default_wake_function+0x0/0x14
[<ffffffff81373c5f>] ? _raw_spin_lock+0x62/0x69
[<ffffffffa013f874>] xlog_state_get_iclog_space+0x9e/0x22c [xfs]
[<ffffffffa013fb73>] xlog_write+0x171/0x4ae [xfs]
[<ffffffffa0150df7>] ? kmem_alloc+0x69/0xb1 [xfs]
[<ffffffff810fad18>] ? __kmalloc+0x14e/0x160
[<ffffffffa013ff04>] xfs_log_write+0x54/0x7e [xfs]
[<ffffffffa014b5b0>] xfs_trans_commit_iclog+0x195/0x2d8 [xfs]
[<ffffffff81070c11>] ? mark_held_locks+0x52/0x70
[<ffffffff810fa9f7>] ? kmem_cache_alloc+0xd1/0x145
[<ffffffff81070e9d>] ? trace_hardirqs_on_caller+0x11d/0x141
[<ffffffff81070ece>] ? trace_hardirqs_on+0xd/0xf
[<ffffffffa0150cdd>] ? kmem_zone_alloc+0x69/0xb1 [xfs]
[<ffffffffa014b7bc>] _xfs_trans_commit+0xc9/0x206 [xfs]
[<ffffffffa01566f8>] xfs_file_fsync+0x166/0x1e6 [xfs]
[<ffffffff81122a8b>] vfs_fsync_range+0x54/0x7c
[<ffffffff81122b15>] vfs_fsync+0x1c/0x1e
[<ffffffff81122b45>] do_fsync+0x2e/0x43
[<ffffffff81122b81>] sys_fsync+0x10/0x14
[<ffffffff81002b82>] system_call_fastpath+0x16/0x1b
2 locks held by ffsb/1364:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
(and many more ffsb processes hung similar to the 2 above)
I just attempted a git command against the root volume; it hung:
git D 0000000100252022 3440 1471 1461 0x00000004
ffff88003ad611d8 0000000000000046 ffff88003ad61138 ffffffff00000000
ffff88003ad60010 ffff88003ad61fd8 00000000001d21c0 ffff88003b498d40
ffff88003b4990d8 ffff88003b4990d0 00000000001d21c0 00000000001d21c0
Call Trace:
[<ffffffffa013e958>] xlog_wait+0x60/0x78 [xfs]
[<ffffffff810404de>] ? default_wake_function+0x0/0x14
[<ffffffff81373c5f>] ? _raw_spin_lock+0x62/0x69
[<ffffffffa013f874>] xlog_state_get_iclog_space+0x9e/0x22c [xfs]
[<ffffffffa013fb73>] xlog_write+0x171/0x4ae [xfs]
[<ffffffffa0150df7>] ? kmem_alloc+0x69/0xb1 [xfs]
[<ffffffff810fad18>] ? __kmalloc+0x14e/0x160
[<ffffffffa013ff04>] xfs_log_write+0x54/0x7e [xfs]
[<ffffffffa014b5b0>] xfs_trans_commit_iclog+0x195/0x2d8 [xfs]
[<ffffffff81070c11>] ? mark_held_locks+0x52/0x70
[<ffffffff810fa9f7>] ? kmem_cache_alloc+0xd1/0x145
[<ffffffff81070e9d>] ? trace_hardirqs_on_caller+0x11d/0x141
[<ffffffff81070ece>] ? trace_hardirqs_on+0xd/0xf
[<ffffffffa014b7bc>] _xfs_trans_commit+0xc9/0x206 [xfs]
[<ffffffffa011c5fa>] xfs_bmap_finish+0x87/0x16a [xfs]
[<ffffffffa01389b9>] xfs_itruncate_finish+0x19e/0x2bd [xfs]
[<ffffffffa014f202>] xfs_free_eofblocks+0x1ac/0x1f1 [xfs]
[<ffffffffa014f707>] xfs_inactive+0x108/0x3a6 [xfs]
[<ffffffff8106ff27>] ? lockdep_init_map+0xa6/0x11b
[<ffffffffa015a87f>] xfs_fs_evict_inode+0xf6/0xfe [xfs]
[<ffffffff81114766>] evict+0x24/0x8c
[<ffffffff811147ff>] dispose_list+0x31/0xaf
[<ffffffff81114e92>] shrink_icache_memory+0x1e5/0x215
[<ffffffff810d1e14>] shrink_slab+0xe0/0x164
[<ffffffff810d3282>] try_to_free_pages+0x27f/0x495
[<ffffffff810cb3fc>] __alloc_pages_nodemask+0x4e3/0x767
[<ffffffff810700a3>] ? trace_hardirqs_off_caller+0x1f/0x9e
[<ffffffff810f5b14>] alloc_pages_current+0xa7/0xca
[<ffffffff810c515c>] __page_cache_alloc+0x85/0x8c
[<ffffffff810cd420>] __do_page_cache_readahead+0xdb/0x1df
[<ffffffff8106fa35>] ? lock_release_holdtime+0x2c/0xd7
[<ffffffff810cd545>] ra_submit+0x21/0x25
[<ffffffff810cd92c>] ondemand_readahead+0x1e3/0x1f6
[<ffffffff810cd9b8>] page_cache_async_readahead+0x79/0x82
[<ffffffff810c6633>] filemap_fault+0xc6/0x396
[<ffffffff81070ece>] ? trace_hardirqs_on+0xd/0xf
[<ffffffff810e15e7>] __do_fault+0x54/0x354
[<ffffffff810709bf>] ? mark_lock+0x2d/0x22d
[<ffffffff810e2493>] handle_pte_fault+0x2cf/0x6e8
[<ffffffff810e004e>] ? __pte_alloc+0xc3/0xd0
[<ffffffff810e2986>] handle_mm_fault+0xda/0xed
[<ffffffff81377c28>] do_page_fault+0x3b4/0x3d6
[<ffffffff810700a3>] ? trace_hardirqs_off_caller+0x1f/0x9e
[<ffffffff8107012f>] ? trace_hardirqs_off+0xd/0xf
[<ffffffff81065a7a>] ? local_clock+0x41/0x5a
[<ffffffff813739e6>] ? trace_hardirqs_off_thunk+0x3a/0x3c
[<ffffffff81374be5>] page_fault+0x25/0x30
And here is the summary of all the locks (via sysrq-t):
Showing all locks held in the system:
2 locks held by kworker/0:1/10:
#0: (xfsdatad){++++..}, at: [<ffffffff81059fba>] process_one_work+0x18a/0x37f
#1: ((&ioend->io_work)){+.+...}, at: [<ffffffff81059fba>] process_one_work+0x18a/0x37f
4 locks held by kswapd0/23:
#0: (shrinker_rwsem){++++..}, at: [<ffffffff810d1d71>] shrink_slab+0x3d/0x164
#1: (iprune_sem){++++.-}, at: [<ffffffff81114cf7>] shrink_icache_memory+0x4a/0x215
#2: (xfs_iolock_reclaimable){+.+.-.}, at: [<ffffffffa013615d>] xfs_ilock+0x30/0xb9 [xfs]
#3: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
1 lock held by multipathd/659:
#0: (&u->readlock){+.+.+.}, at: [<ffffffff813533dd>] unix_dgram_recvmsg+0x5a/0x27f
5 locks held by NetworkManager/958:
#0: (&mm->mmap_sem){++++++}, at: [<ffffffff81377a36>] do_page_fault+0x1c2/0x3d6
#1: (shrinker_rwsem){++++..}, at: [<ffffffff810d1d71>] shrink_slab+0x3d/0x164
#2: (iprune_sem){++++.-}, at: [<ffffffff81114cf7>] shrink_icache_memory+0x4a/0x215
#3: (xfs_iolock_reclaimable){+.+.-.}, at: [<ffffffffa013615d>] xfs_ilock+0x30/0xb9 [xfs]
#4: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
1 lock held by agetty/1099:
#0: (&tty->atomic_read_lock){+.+.+.}, at: [<ffffffff8123c132>] n_tty_read+0x284/0x7ba
1 lock held by mingetty/1101:
#0: (&tty->atomic_read_lock){+.+.+.}, at: [<ffffffff8123c132>] n_tty_read+0x284/0x7ba
1 lock held by mingetty/1103:
#0: (&tty->atomic_read_lock){+.+.+.}, at: [<ffffffff8123c132>] n_tty_read+0x284/0x7ba
1 lock held by mingetty/1105:
#0: (&tty->atomic_read_lock){+.+.+.}, at: [<ffffffff8123c132>] n_tty_read+0x284/0x7ba
1 lock held by mingetty/1107:
#0: (&tty->atomic_read_lock){+.+.+.}, at: [<ffffffff8123c132>] n_tty_read+0x284/0x7ba
1 lock held by mingetty/1109:
#0: (&tty->atomic_read_lock){+.+.+.}, at: [<ffffffff8123c132>] n_tty_read+0x284/0x7ba
1 lock held by mingetty/1111:
#0: (&tty->atomic_read_lock){+.+.+.}, at: [<ffffffff8123c132>] n_tty_read+0x284/0x7ba
1 lock held by bash/1313:
#0: (&tty->atomic_read_lock){+.+.+.}, at: [<ffffffff8123c132>] n_tty_read+0x284/0x7ba
2 locks held by ffsb/1355:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1358:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
1 lock held by ffsb/1359:
#0: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1362:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1364:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1365:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1366:
#0: (xfs_iolock_active){++++.+}, at: [<ffffffffa0136079>] xfs_ilock_nowait+0x2b/0xdf [xfs]
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1367:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
1 lock held by ffsb/1368:
#0: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1371:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1372:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1373:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1374:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1375:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1376:
#0: (xfs_iolock_active){++++.+}, at: [<ffffffffa0136079>] xfs_ilock_nowait+0x2b/0xdf [xfs]
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1377:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1378:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1380:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1381:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1383:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
4 locks held by ffsb/1384:
#0: (&sb->s_type->i_mutex_key#13/1){+.+.+.}, at: [<ffffffff8110b9ba>] do_unlinkat+0x67/0x165
#1: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81109eaf>] vfs_unlink+0x4f/0xcc
#2: (&(&ip->i_lock)->mr_lock/2){+.+...}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
#3: (&(&ip->i_lock)->mr_lock/3){+.+...}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1385:
#0: (xfs_iolock_active){++++.+}, at: [<ffffffffa0136079>] xfs_ilock_nowait+0x2b/0xdf [xfs]
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1386:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1387:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1388:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at:
2 locks held by ffsb/1389:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
1 lock held by ffsb/1390:
#0: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1391:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1392:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1393:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
1 lock held by ffsb/1394:
#0: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1395:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1396:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1397:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1398:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1399:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1400:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1402:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1403:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
1 lock held by ffsb/1404:
#0: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1405:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1406:
#0: (xfs_iolock_active){++++.+}, at: [<ffffffffa0136079>] xfs_ilock_nowait+0x2b/0xdf [xfs]
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1407:
#0: (xfs_iolock_active){++++.+}, at: [<ffffffffa0136079>] xfs_ilock_nowait+0x2b/0xdf [xfs]
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1409:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1410:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1411:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1412:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1413:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1414:
#0: (xfs_iolock_active){++++.+}, at: [<ffffffffa0136079>] xfs_ilock_nowait+0x2b/0xdf [xfs]
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1416:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by ffsb/1417:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff81122a7e>] vfs_fsync_range+0x47/0x7c
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
3 locks held by ffsb/1418:
#0: (&sb->s_type->i_mutex_key#13){+.+.+.}, at: [<ffffffff8110a688>] do_last+0xb8/0x2f9
#1: (&(&ip->i_lock)->mr_lock/1){+.+.+.}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
#2: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa01360a7>] xfs_ilock_nowait+0x59/0xdf [xfs]
2 locks held by flush-253:0/1350:
#0: (&type->s_umount_key#24){.+.+.+}, at: [<ffffffff8111f509>] writeback_inodes_wb+0xce/0x13d
#1: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
5 locks held by git/1471:
#0: (&mm->mmap_sem){++++++}, at: [<ffffffff81377a36>] do_page_fault+0x1c2/0x3d6
#1: (shrinker_rwsem){++++..}, at: [<ffffffff810d1d71>] shrink_slab+0x3d/0x164
#2: (iprune_sem){++++.-}, at: [<ffffffff81114cf7>] shrink_icache_memory+0x4a/0x215
#3: (xfs_iolock_reclaimable){+.+.-.}, at: [<ffffffffa013615d>] xfs_ilock+0x30/0xb9 [xfs]
#4: (&(&ip->i_lock)->mr_lock){++++--}, at: [<ffffffffa0136190>] xfs_ilock+0x63/0xb9 [xfs]
2 locks held by bash/1472:
#0: (sysrq_key_table_lock){......}, at: [<ffffffff81242275>] __handle_sysrq+0x28/0x15c
#1: (tasklist_lock){.+.+..}, at: [<ffffffff8107062c>] debug_show_all_locks+0x52/0x19b
On 2011-03-17 16:51, Mike Snitzer wrote:
> On Tue, Mar 08 2011 at 5:05pm -0500,
> Mike Snitzer <[email protected]> wrote:
>
>> On Tue, Mar 08 2011 at 3:27pm -0500,
>> Jens Axboe <[email protected]> wrote:
>>
>>> On 2011-03-08 21:21, Mike Snitzer wrote:
>>>> On Tue, Mar 08 2011 at 7:16am -0500,
>>>> Jens Axboe <[email protected]> wrote:
>>>>
>>>>> On 2011-03-03 23:13, Mike Snitzer wrote:
>>>>>> I'm now hitting a lockdep issue, while running a 'for-2.6.39/stack-plug'
>>>>>> kernel, when I try an fsync heavy workload to a request-based mpath
>>>>>> device (the kernel ultimately goes down in flames, I've yet to look at
>>>>>> the crashdump I took)
>>>>>
>>>>> Mike, can you re-run with the current stack-plug branch? I've fixed the
>>>>> !CONFIG_BLOCK and rebase issues, and also added a change for this flush
>>>>> on schedule event. It's run outside of the runqueue lock now, so
>>>>> hopefully that should solve this one.
>>>>
>>>> Works for me, thanks.
>>>
>>> Super, thanks! Out of curiosity, did you use dm/md?
>>
>> Yes, I've been using a request-based DM multipath device.
>
>
> Against latest 'for-2.6.39/core', I just ran that same fsync heavy
> workload against XFS (ontop of a DM multipath volume). ffsb induced the
> following hangs (ripple effect causing NetworkManager to get hung up on
> this data-only XFS volume, etc):
Ugh. Care to send the recipe for how to reproduce this? Essentially
just looks like IO got stuck.
--
Jens Axboe
On Thu, Mar 17 2011 at 2:31pm -0400,
Jens Axboe <[email protected]> wrote:
> On 2011-03-17 16:51, Mike Snitzer wrote:
> > On Tue, Mar 08 2011 at 5:05pm -0500,
> > Mike Snitzer <[email protected]> wrote:
> >
> >> On Tue, Mar 08 2011 at 3:27pm -0500,
> >> Jens Axboe <[email protected]> wrote:
> >>
> >>> On 2011-03-08 21:21, Mike Snitzer wrote:
> >>>> On Tue, Mar 08 2011 at 7:16am -0500,
> >>>> Jens Axboe <[email protected]> wrote:
> >>>>
> >>>>> On 2011-03-03 23:13, Mike Snitzer wrote:
> >>>>>> I'm now hitting a lockdep issue, while running a 'for-2.6.39/stack-plug'
> >>>>>> kernel, when I try an fsync heavy workload to a request-based mpath
> >>>>>> device (the kernel ultimately goes down in flames, I've yet to look at
> >>>>>> the crashdump I took)
> >>>>>
> >>>>> Mike, can you re-run with the current stack-plug branch? I've fixed the
> >>>>> !CONFIG_BLOCK and rebase issues, and also added a change for this flush
> >>>>> on schedule event. It's run outside of the runqueue lock now, so
> >>>>> hopefully that should solve this one.
> >>>>
> >>>> Works for me, thanks.
> >>>
> >>> Super, thanks! Out of curiosity, did you use dm/md?
> >>
> >> Yes, I've been using a request-based DM multipath device.
> >
> >
> > Against latest 'for-2.6.39/core', I just ran that same fsync heavy
> > workload against XFS (ontop of a DM multipath volume). ffsb induced the
> > following hangs (ripple effect causing NetworkManager to get hung up on
> > this data-only XFS volume, etc):
>
> Ugh. Care to send the recipe for how to reproduce this? Essentially
> just looks like IO got stuck.
Here is the sequence to reproduce with the attached fsync-happy.ffsb
(I've been running the following in a KVM guest):
<create multipath device>
mkfs.xfs /dev/mapper/mpathb
mount /dev/mapper/mpathb /mnt/test
./ffsb fsync-happy.ffsb
And I just verified that the deadlock does _not_ seem to occur without
DM multipath -- by directly using an underlying SCSI device instead.
So multipath is exposing this somehow (could just be changing timing?).
Mike
p.s. though I did get this lockdep warning when unmounting the xfs
filesystem:
=================================
[ INFO: inconsistent lock state ]
2.6.38-rc6-snitm+ #8
---------------------------------
inconsistent {IN-RECLAIM_FS-R} -> {RECLAIM_FS-ON-W} usage.
umount/1524 [HC0[0]:SC0[0]:HE1:SE1] takes:
(iprune_sem){+++++-}, at: [<ffffffff81114a22>] evict_inodes+0x2f/0x107
{IN-RECLAIM_FS-R} state was registered at:
[<ffffffff810727c2>] __lock_acquire+0x3a4/0xd26
[<ffffffff81073227>] lock_acquire+0xe3/0x110
[<ffffffff81372fa2>] down_read+0x51/0x96
[<ffffffff81114d57>] shrink_icache_memory+0x4a/0x215
[<ffffffff810d1e48>] shrink_slab+0xe0/0x164
[<ffffffff810d3e8f>] kswapd+0x5e7/0x9dc
[<ffffffff8105fb7c>] kthread+0xa0/0xa8
[<ffffffff81003a24>] kernel_thread_helper+0x4/0x10
irq event stamp: 73433
hardirqs last enabled at (73433): [<ffffffff81070ffe>] debug_check_no_locks_freed+0x12e/0x145
hardirqs last disabled at (73432): [<ffffffff81070f13>] debug_check_no_locks_freed+0x43/0x145
softirqs last enabled at (72996): [<ffffffff8104a1f1>] __do_softirq+0x1b4/0x1d3
softirqs last disabled at (72991): [<ffffffff81003b1c>] call_softirq+0x1c/0x28
other info that might help us debug this:
2 locks held by umount/1524:
#0: (&type->s_umount_key#24){++++++}, at: [<ffffffff81102a27>] deactivate_super+0x3d/0x4a
#1: (iprune_sem){+++++-}, at: [<ffffffff81114a22>] evict_inodes+0x2f/0x107
stack backtrace:
Pid: 1524, comm: umount Not tainted 2.6.38-rc6-snitm+ #8
Call Trace:
[<ffffffff8107097f>] ? valid_state+0x17e/0x191
[<ffffffff810712e8>] ? check_usage_backwards+0x0/0x81
[<ffffffff81070ae4>] ? mark_lock+0x152/0x22d
[<ffffffff81070c11>] ? mark_held_locks+0x52/0x70
[<ffffffff81070cc8>] ? lockdep_trace_alloc+0x99/0xbb
[<ffffffff810fa98a>] ? kmem_cache_alloc+0x30/0x145
[<ffffffffa014dcdd>] ? kmem_zone_alloc+0x69/0xb1 [xfs]
[<ffffffffa014dd39>] ? kmem_zone_zalloc+0x14/0x35 [xfs]
[<ffffffffa0147ed9>] ? _xfs_trans_alloc+0x27/0x64 [xfs]
[<ffffffffa0148c97>] ? xfs_trans_alloc+0x9f/0xac [xfs]
[<ffffffff810643b7>] ? up_read+0x23/0x3c
[<ffffffffa0133000>] ? xfs_iunlock+0x7e/0xbc [xfs]
[<ffffffffa014c140>] ? xfs_free_eofblocks+0xea/0x1f1 [xfs]
[<ffffffffa014c707>] ? xfs_inactive+0x108/0x3a6 [xfs]
[<ffffffff8106ff27>] ? lockdep_init_map+0xa6/0x11b
[<ffffffffa015787f>] ? xfs_fs_evict_inode+0xf6/0xfe [xfs]
[<ffffffff811147c6>] ? evict+0x24/0x8c
[<ffffffff8111485f>] ? dispose_list+0x31/0xaf
[<ffffffff81114ae3>] ? evict_inodes+0xf0/0x107
[<ffffffff81101660>] ? generic_shutdown_super+0x5c/0xdf
[<ffffffff8110170a>] ? kill_block_super+0x27/0x69
[<ffffffff81101d89>] ? deactivate_locked_super+0x26/0x4b
[<ffffffff81102a2f>] ? deactivate_super+0x45/0x4a
[<ffffffff81118b87>] ? mntput_no_expire+0x105/0x10e
[<ffffffff81119db6>] ? sys_umount+0x2d9/0x304
[<ffffffff81070e9d>] ? trace_hardirqs_on_caller+0x11d/0x141
[<ffffffff81002b82>] ? system_call_fastpath+0x16/0x1b
> p.s. though I did get this lockdep warning when unmounting the xfs
> filesystem:
This is fixed by commit bab1d9444d9a147f1dc3478dd06c16f490227f3e
"prune back iprune_sem"
which hit mainline this week.
On Wed, 9 Mar 2011 19:58:10 -0500 Mike Snitzer <[email protected]> wrote:
> Also, in your MD changes, you removed all calls to md_unplug() but
> didn't remove md_unplug(). Seems it should be removed along with the
> 'plug' member of 'struct mddev_t'? Neil?
I've been distracted by other things and only just managed to have a look at
this.
The new plugging code seems to completely ignore the needs of stacked devices
- or at least my needs in md.
For RAID1 with a write-intent-bitmap, I queue all write requests and then on
an unplug I update the write-intent-bitmap to mark all the relevant blocks
and then release the writes.
With the new code there is no way for an unplug event to wake up the raid1d
thread to start the writeout - I haven't tested it but I suspect it will just
hang.
Similarly for RAID5 I gather write bios (long before they become 'struct
request' which is what the plugging code understands) and on an unplug event
I release the writes - hopefully with enough bios per stripe so that we don't
need to pre-read.
Possibly the simplest fix would be to have a second list_head in 'struct
blk_plug' which contained callbacks (a function pointer and a list_head in a
struct which is passed as an arg to the function!).
blk_finish_plug could then walk the list and call the call-backs.
It would be quite easy to hook into that.
I suspect I also need to add blk_start_plug/blk_finish_plug around the loop
in raid1d/raid5d/raid10d, but that is pretty straightforward.
Am I missing something important?
Is there a better way to get an unplug event to md?
Thanks,
NeilBrown
On Tue, 5 Apr 2011 13:05:41 +1000 NeilBrown <[email protected]> wrote:
> On Wed, 9 Mar 2011 19:58:10 -0500 Mike Snitzer <[email protected]> wrote:
>
> > Also, in your MD changes, you removed all calls to md_unplug() but
> > didn't remove md_unplug(). Seems it should be removed along with the
> > 'plug' member of 'struct mddev_t'? Neil?
>
> I've been distracted by other things and only just managed to have a look at
> this.
>
> The new plugging code seems to completely ignore the needs of stacked devices
> - or at least my needs in md.
>
> For RAID1 with a write-intent-bitmap, I queue all write requests and then on
> an unplug I update the write-intent-bitmap to mark all the relevant blocks
> and then release the writes.
>
> With the new code there is no way for an unplug event to wake up the raid1d
> thread to start the writeout - I haven't tested it but I suspect it will just
> hang.
>
> Similarly for RAID5 I gather write bios (long before they become 'struct
> request' which is what the plugging code understands) and on an unplug event
> I release the writes - hopefully with enough bios per stripe so that we don't
> need to pre-read.
>
> Possibly the simplest fix would be to have a second list_head in 'struct
> blk_plug' which contained callbacks (a function pointer and a list_head in a
> struct which is passed as an arg to the function!).
> blk_finish_plug could then walk the list and call the call-backs.
> It would be quite easy to hook into that.
I've implemented this and it seems to work.
Jens: could you please review and hopefully ack the patch below, and let
me know if you will submit it or should I?
My testing of this, combined with some other patches which make various md
personalities use it, turns up a bug somewhere.
The symptoms are crashes in various places in blk-core and sometimes
elevator.c; list_sort appears in the stack fairly often, but not always.
This patch
diff --git a/block/blk-core.c b/block/blk-core.c
index 273d60b..903ce8d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2674,19 +2674,23 @@ static void flush_plug_list(struct blk_plug *plug)
struct request_queue *q;
unsigned long flags;
struct request *rq;
+ struct list_head head;
BUG_ON(plug->magic != PLUG_MAGIC);
if (list_empty(&plug->list))
return;
+ list_add(&head, &plug->list);
+ list_del_init(&plug->list);
if (plug->should_sort)
- list_sort(NULL, &plug->list, plug_rq_cmp);
+ list_sort(NULL, &head, plug_rq_cmp);
+ plug->should_sort = 0;
q = NULL;
local_irq_save(flags);
- while (!list_empty(&plug->list)) {
- rq = list_entry_rq(plug->list.next);
+ while (!list_empty(&head)) {
+ rq = list_entry_rq(head.next);
list_del_init(&rq->queuelist);
BUG_ON(!(rq->cmd_flags & REQ_ON_PLUG));
BUG_ON(!rq->q);
makes the symptom go away. It simply moves the plug list onto a separate
list head before sorting and processing it.
My test was simply writing to a RAID1 with dd:
while true; do dd if=/dev/zero of=/dev/md0 bs=4k; done
Obviously all writes go to two devices so the plug list will always need
sorting.
The only explanation I can come up with is that very occasionally schedule on
2 separate cpus calls blk_flush_plug for the same task. I don't understand
the scheduler nearly well enough to know if or how that can happen.
However with this patch in place I can write to a RAID1 constantly for half
an hour, and without it, the write rarely lasts for 3 minutes.
If you want to reproduce my experiment, you can pull from
git://neil.brown.name/md plug-test
to get my patches for plugging in md (which are not quite ready for submission
but seem to work), create a RAID1 using e.g.
mdadm -C /dev/md0 --level=1 --raid-disks=2 /dev/device1 /dev/device2
while true; do dd if=/dev/zero of=/dev/md0 bs=4K ; done
Thanks,
NeilBrown
From 687b189c02276887dd7d5b87a817da9f67ed3c2c Mon Sep 17 00:00:00 2001
From: NeilBrown <[email protected]>
Date: Thu, 7 Apr 2011 13:16:59 +1000
Subject: [PATCH] Enhance new plugging support to support general callbacks.
md/raid requires an unplug callback, but as it does not use
requests, the current code cannot provide one.
So allow arbitrary callbacks to be attached to the blk_plug.
Cc: Jens Axboe <[email protected]>
Signed-off-by: NeilBrown <[email protected]>
---
block/blk-core.c | 13 +++++++++++++
include/linux/blkdev.h | 7 ++++++-
2 files changed, 19 insertions(+), 1 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 725091d..273d60b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2644,6 +2644,7 @@ void blk_start_plug(struct blk_plug *plug)
plug->magic = PLUG_MAGIC;
INIT_LIST_HEAD(&plug->list);
+ INIT_LIST_HEAD(&plug->cb_list);
plug->should_sort = 0;
/*
@@ -2717,9 +2718,21 @@ static void flush_plug_list(struct blk_plug *plug)
local_irq_restore(flags);
}
+static void flush_plug_callbacks(struct blk_plug *plug)
+{
+ while (!list_empty(&plug->cb_list)) {
+ struct blk_plug_cb *cb = list_first_entry(&plug->cb_list,
+ struct blk_plug_cb,
+ list);
+ list_del(&cb->list);
+ cb->callback(cb);
+ }
+}
+
static void __blk_finish_plug(struct task_struct *tsk, struct blk_plug *plug)
{
flush_plug_list(plug);
+ flush_plug_callbacks(plug);
if (plug == tsk->plug)
tsk->plug = NULL;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 32176cc..3e5e604 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -857,8 +857,13 @@ extern void blk_put_queue(struct request_queue *);
struct blk_plug {
unsigned long magic;
struct list_head list;
+ struct list_head cb_list;
unsigned int should_sort;
};
+struct blk_plug_cb {
+ struct list_head list;
+ void (*callback)(struct blk_plug_cb *);
+};
extern void blk_start_plug(struct blk_plug *);
extern void blk_finish_plug(struct blk_plug *);
@@ -876,7 +881,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
{
struct blk_plug *plug = tsk->plug;
- return plug && !list_empty(&plug->list);
+ return plug && (!list_empty(&plug->list) || !list_empty(&plug->cb_list));
}
/*
--
1.7.3.4
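To illustrate the consumer side of the above, a stacked driver such as md
could attach a callback to the current plug roughly as follows (a rough
sketch only; the helper names and the allocate-per-plug approach are
assumptions, not part of the patch):

	struct md_plug_cb {
		struct blk_plug_cb cb;		/* embedded generic callback */
		mddev_t *mddev;
	};

	static void md_unplug_cb(struct blk_plug_cb *cb)
	{
		struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb);

		/* wake the raid thread so it releases the writes it queued up */
		md_wakeup_thread(mdcb->mddev->thread);
		kfree(mdcb);
	}

	/* called from the make_request path, where current->plug may be set */
	static void md_defer_to_unplug(mddev_t *mddev)
	{
		struct blk_plug *plug = current->plug;
		struct md_plug_cb *mdcb;

		if (!plug)
			return;		/* no plug active, nothing to defer */
		/* a real implementation would avoid adding duplicates per plug */
		mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC);
		if (!mdcb)
			return;
		mdcb->cb.callback = md_unplug_cb;
		mdcb->mddev = mddev;
		list_add(&mdcb->cb.list, &plug->cb_list);
	}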
On 2011-04-11 06:50, NeilBrown wrote:
> On Tue, 5 Apr 2011 13:05:41 +1000 NeilBrown <[email protected]> wrote:
>
>> On Wed, 9 Mar 2011 19:58:10 -0500 Mike Snitzer <[email protected]> wrote:
>>
>>> Also, in your MD changes, you removed all calls to md_unplug() but
>>> didn't remove md_unplug(). Seems it should be removed along with the
>>> 'plug' member of 'struct mddev_t'? Neil?
>>
>> I've been distracted by other things and only just managed to have a look at
>> this.
>>
>> The new plugging code seems to completely ignore the needs of stacked devices
>> - or at least my needs in md.
>>
>> For RAID1 with a write-intent-bitmap, I queue all write requests and then on
>> an unplug I update the write-intent-bitmap to mark all the relevant blocks
>> and then release the writes.
>>
>> With the new code there is no way for an unplug event to wake up the raid1d
>> thread to start the writeout - I haven't tested it but I suspect it will just
>> hang.
>>
>> Similarly for RAID5 I gather write bios (long before they become 'struct
>> request' which is what the plugging code understands) and on an unplug event
>> I release the writes - hopefully with enough bios per stripe so that we don't
>> need to pre-read.
>>
>> Possibly the simplest fix would be to have a second list_head in 'struct
>> blk_plug' which contained callbacks (a function pointer and a list_head in a
>> struct which is passed as an arg to the function!).
>> blk_finish_plug could then walk the list and call the call-backs.
>> It would be quite easy to hook into that.
>
> I've implemented this and it seems to work.
> Jens: could you please review and hopefully ack the patch below, and let
> me know if you will submit it or should I?
>
> My testing of this, combined with some other patches which make various md
> personalities use it, turns up a bug somewhere.
>
> The symptoms are crashes in various places in blk-core and sometimes
> elevator.c; list_sort appears in the stack fairly often, but not always.
>
> This patch
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 273d60b..903ce8d 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -2674,19 +2674,23 @@ static void flush_plug_list(struct blk_plug *plug)
> struct request_queue *q;
> unsigned long flags;
> struct request *rq;
> + struct list_head head;
>
> BUG_ON(plug->magic != PLUG_MAGIC);
>
> if (list_empty(&plug->list))
> return;
> + list_add(&head, &plug->list);
> + list_del_init(&plug->list);
>
> if (plug->should_sort)
> - list_sort(NULL, &plug->list, plug_rq_cmp);
> + list_sort(NULL, &head, plug_rq_cmp);
> + plug->should_sort = 0;
>
> q = NULL;
> local_irq_save(flags);
> - while (!list_empty(&plug->list)) {
> - rq = list_entry_rq(plug->list.next);
> + while (!list_empty(&head)) {
> + rq = list_entry_rq(head.next);
> list_del_init(&rq->queuelist);
> BUG_ON(!(rq->cmd_flags & REQ_ON_PLUG));
> BUG_ON(!rq->q);
>
>
> makes the symptom go away. It simply moves the plug list onto a separate
> list head before sorting and processing it.
> My test was simply writing to a RAID1 with dd:
> while true; do dd if=/dev/zero of=/dev/md0 bs=4k; done
>
> Obviously all writes go to two devices so the plug list will always need
> sorting.
>
> The only explanation I can come up with is that very occasionally schedule on
> 2 separate cpus calls blk_flush_plug for the same task. I don't understand
> the scheduler nearly well enough to know if or how that can happen.
> However with this patch in place I can write to a RAID1 constantly for half
> an hour, and without it, the write rarely lasts for 3 minutes.
Or perhaps if the request_fn blocks, that would be problematic. So the
patch is likely a good idea even for that case.
I'll merge it, changing it to list_splice_init() as I think that would
be more clear.
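For reference, a minimal sketch of the list_splice_init() variant, using the same
names as the patch quoted below; this is illustrative only, not the committed code,
and the dispatch loop itself is unchanged and omitted:

static void flush_plug_list(struct blk_plug *plug)
{
	LIST_HEAD(head);

	BUG_ON(plug->magic != PLUG_MAGIC);

	if (list_empty(&plug->list))
		return;

	/*
	 * Move everything onto the on-stack list first, so that a nested
	 * call into flush_plug_list() sees an empty plug->list and returns
	 * immediately.
	 */
	list_splice_init(&plug->list, &head);

	if (plug->should_sort)
		list_sort(NULL, &head, plug_rq_cmp);
	plug->should_sort = 0;

	/* ... dispatch the requests from 'head' as before ... */
}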
> From 687b189c02276887dd7d5b87a817da9f67ed3c2c Mon Sep 17 00:00:00 2001
> From: NeilBrown <[email protected]>
> Date: Thu, 7 Apr 2011 13:16:59 +1000
> Subject: [PATCH] Enhance new plugging support to support general callbacks.
>
> md/raid requires an unplug callback, but as it does not uses
> requests the current code cannot provide one.
>
> So allow arbitrary callbacks to be attached to the blk_plug.
>
> Cc: Jens Axboe <[email protected]>
> Signed-off-by: NeilBrown <[email protected]>
> ---
> block/blk-core.c | 13 +++++++++++++
> include/linux/blkdev.h | 7 ++++++-
> 2 files changed, 19 insertions(+), 1 deletions(-)
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 725091d..273d60b 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -2644,6 +2644,7 @@ void blk_start_plug(struct blk_plug *plug)
>
> plug->magic = PLUG_MAGIC;
> INIT_LIST_HEAD(&plug->list);
> + INIT_LIST_HEAD(&plug->cb_list);
> plug->should_sort = 0;
>
> /*
> @@ -2717,9 +2718,21 @@ static void flush_plug_list(struct blk_plug *plug)
> local_irq_restore(flags);
> }
>
> +static void flush_plug_callbacks(struct blk_plug *plug)
> +{
> + while (!list_empty(&plug->cb_list)) {
> + struct blk_plug_cb *cb = list_first_entry(&plug->cb_list,
> + struct blk_plug_cb,
> + list);
> + list_del(&cb->list);
> + cb->callback(cb);
> + }
> +}
> +
> static void __blk_finish_plug(struct task_struct *tsk, struct blk_plug *plug)
> {
> flush_plug_list(plug);
> + flush_plug_callbacks(plug);
>
> if (plug == tsk->plug)
> tsk->plug = NULL;
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 32176cc..3e5e604 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -857,8 +857,13 @@ extern void blk_put_queue(struct request_queue *);
> struct blk_plug {
> unsigned long magic;
> struct list_head list;
> + struct list_head cb_list;
> unsigned int should_sort;
> };
> +struct blk_plug_cb {
> + struct list_head list;
> + void (*callback)(struct blk_plug_cb *);
> +};
>
> extern void blk_start_plug(struct blk_plug *);
> extern void blk_finish_plug(struct blk_plug *);
> @@ -876,7 +881,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
> {
> struct blk_plug *plug = tsk->plug;
>
> - return plug && !list_empty(&plug->list);
> + return plug && (!list_empty(&plug->list) || !list_empty(&plug->cb_list));
> }
>
> /*
Maybe I'm missing something, but why do you need those callbacks? If
it's to use plugging yourself, perhaps we can just ensure that those
don't get assigned in the task - so it would have to be used with care.
It's not that I disagree with these callbacks, I just want to ensure I
understand why you need them.
--
Jens Axboe
On Mon, 11 Apr 2011 11:19:58 +0200 Jens Axboe <[email protected]> wrote:
> On 2011-04-11 06:50, NeilBrown wrote:
> > The only explanation I can come up with is that very occasionally schedule on
> > 2 separate cpus calls blk_flush_plug for the same task. I don't understand
> > the scheduler nearly well enough to know if or how that can happen.
> > However with this patch in place I can write to a RAID1 constantly for half
> > an hour, and without it, the write rarely lasts for 3 minutes.
>
> Or perhaps if the request_fn blocks, that would be problematic. So the
> patch is likely a good idea even for that case.
>
> I'll merge it, changing it to list_splice_init() as I think that would
> be more clear.
OK - though I'm not 100% sure the patch fixes the problem - just that it hides the
symptom for me.
I might try instrumenting the code a bit more and see if I can find exactly
where it is re-entering flush_plug_list - as that seems to be what is
happening.
And yeah - list_split_init is probably better. I just never remember exactly
what list_split means and have to look it up every time, whereas
list_add/list_del are very clear to me.
>
> > From 687b189c02276887dd7d5b87a817da9f67ed3c2c Mon Sep 17 00:00:00 2001
> > From: NeilBrown <[email protected]>
> > Date: Thu, 7 Apr 2011 13:16:59 +1000
> > Subject: [PATCH] Enhance new plugging support to support general callbacks.
> >
> > md/raid requires an unplug callback, but as it does not uses
> > requests the current code cannot provide one.
> >
> > So allow arbitrary callbacks to be attached to the blk_plug.
> >
> > Cc: Jens Axboe <[email protected]>
> > Signed-off-by: NeilBrown <[email protected]>
> > ---
> > block/blk-core.c | 13 +++++++++++++
> > include/linux/blkdev.h | 7 ++++++-
> > 2 files changed, 19 insertions(+), 1 deletions(-)
> >
> > diff --git a/block/blk-core.c b/block/blk-core.c
> > index 725091d..273d60b 100644
> > --- a/block/blk-core.c
> > +++ b/block/blk-core.c
> > @@ -2644,6 +2644,7 @@ void blk_start_plug(struct blk_plug *plug)
> >
> > plug->magic = PLUG_MAGIC;
> > INIT_LIST_HEAD(&plug->list);
> > + INIT_LIST_HEAD(&plug->cb_list);
> > plug->should_sort = 0;
> >
> > /*
> > @@ -2717,9 +2718,21 @@ static void flush_plug_list(struct blk_plug *plug)
> > local_irq_restore(flags);
> > }
> >
> > +static void flush_plug_callbacks(struct blk_plug *plug)
> > +{
> > + while (!list_empty(&plug->cb_list)) {
> > + struct blk_plug_cb *cb = list_first_entry(&plug->cb_list,
> > + struct blk_plug_cb,
> > + list);
> > + list_del(&cb->list);
> > + cb->callback(cb);
> > + }
> > +}
> > +
> > static void __blk_finish_plug(struct task_struct *tsk, struct blk_plug *plug)
> > {
> > flush_plug_list(plug);
> > + flush_plug_callbacks(plug);
> >
> > if (plug == tsk->plug)
> > tsk->plug = NULL;
> > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> > index 32176cc..3e5e604 100644
> > --- a/include/linux/blkdev.h
> > +++ b/include/linux/blkdev.h
> > @@ -857,8 +857,13 @@ extern void blk_put_queue(struct request_queue *);
> > struct blk_plug {
> > unsigned long magic;
> > struct list_head list;
> > + struct list_head cb_list;
> > unsigned int should_sort;
> > };
> > +struct blk_plug_cb {
> > + struct list_head list;
> > + void (*callback)(struct blk_plug_cb *);
> > +};
> >
> > extern void blk_start_plug(struct blk_plug *);
> > extern void blk_finish_plug(struct blk_plug *);
> > @@ -876,7 +881,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
> > {
> > struct blk_plug *plug = tsk->plug;
> >
> > - return plug && !list_empty(&plug->list);
> > + return plug && (!list_empty(&plug->list) || !list_empty(&plug->cb_list));
> > }
> >
> > /*
>
> Maybe I'm missing something, but why do you need those callbacks? If
> it's to use plugging yourself, perhaps we can just ensure that those
> don't get assigned in the task - so it would be have to used with care.
>
> It's not that I disagree to these callbacks, I just want to ensure I
> understand why you need them.
>
I'm sure one of us is missing something (probably both) but I'm not sure what.
The callback is central.
It is simply to use plugging in md.
Just like blk-core, md will notice that a blk_plug is active and will put
requests aside. I then need something to call in to md when blk_finish_plug
is called so that put-aside requests can be released.
As md can be built as a module, that call must be a call-back of some sort.
blk-core doesn't need to register blk_plug_flush because that is never in a
module, so it can be called directly. But the md equivalent could be in a
module, so I need to be able to register a call back.
Does that help?
Thanks,
NeilBrown
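As a rough sketch of the kind of registration md would do with the callback
interface from the patch above: the blk_plug_cb fields are the ones added by the
patch, while raid1_plug_cb, struct r1conf, raid1_plug_write and
flush_pending_writes are hypothetical names used only for this illustration:

struct raid1_plug_cb {
	struct blk_plug_cb cb;		/* embeds the generic callback */
	struct r1conf *conf;		/* hypothetical per-array state */
};

static void raid1_unplug(struct blk_plug_cb *cb)
{
	struct raid1_plug_cb *rcb = container_of(cb, struct raid1_plug_cb, cb);

	flush_pending_writes(rcb->conf);	/* release the put-aside bios */
	kfree(rcb);
}

static void raid1_plug_write(struct r1conf *conf)
{
	struct blk_plug *plug = current->plug;
	struct raid1_plug_cb *rcb;

	if (!plug)
		return;			/* no plug active: nothing to batch */

	/* a real implementation would avoid registering twice per plug */
	rcb = kmalloc(sizeof(*rcb), GFP_ATOMIC);
	if (!rcb)
		return;			/* fall back to immediate submission */
	rcb->cb.callback = raid1_unplug;
	rcb->conf = conf;
	list_add_tail(&rcb->cb.list, &plug->cb_list);
}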
On 2011-04-11 12:59, NeilBrown wrote:
> On Mon, 11 Apr 2011 11:19:58 +0200 Jens Axboe <[email protected]> wrote:
>
>> On 2011-04-11 06:50, NeilBrown wrote:
>
>>> The only explanation I can come up with is that very occasionally schedule on
>>> 2 separate cpus calls blk_flush_plug for the same task. I don't understand
>>> the scheduler nearly well enough to know if or how that can happen.
>>> However with this patch in place I can write to a RAID1 constantly for half
>>> an hour, and without it, the write rarely lasts for 3 minutes.
>>
>> Or perhaps if the request_fn blocks, that would be problematic. So the
>> patch is likely a good idea even for that case.
>>
>> I'll merge it, changing it to list_splice_init() as I think that would
>> be more clear.
>
> OK - though I'm not 100% the patch fixes the problem - just that it hides the
> symptom for me.
> I might try instrumenting the code a bit more and see if I can find exactly
> where it is re-entering flush_plug_list - as that seems to be what is
> happening.
It's definitely a good thing to add, to avoid the list fudging on
schedule. Whether it's your exact problem, I can't tell.
> And yeah - list_split_init is probably better. I just never remember exactly
> what list_split means and have to look it up every time, where as
> list_add/list_del are very clear to me.
splice, no split :-)
>>> From 687b189c02276887dd7d5b87a817da9f67ed3c2c Mon Sep 17 00:00:00 2001
>>> From: NeilBrown <[email protected]>
>>> Date: Thu, 7 Apr 2011 13:16:59 +1000
>>> Subject: [PATCH] Enhance new plugging support to support general callbacks.
>>>
>>> md/raid requires an unplug callback, but as it does not uses
>>> requests the current code cannot provide one.
>>>
>>> So allow arbitrary callbacks to be attached to the blk_plug.
>>>
>>> Cc: Jens Axboe <[email protected]>
>>> Signed-off-by: NeilBrown <[email protected]>
>>> ---
>>> block/blk-core.c | 13 +++++++++++++
>>> include/linux/blkdev.h | 7 ++++++-
>>> 2 files changed, 19 insertions(+), 1 deletions(-)
>>>
>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>> index 725091d..273d60b 100644
>>> --- a/block/blk-core.c
>>> +++ b/block/blk-core.c
>>> @@ -2644,6 +2644,7 @@ void blk_start_plug(struct blk_plug *plug)
>>>
>>> plug->magic = PLUG_MAGIC;
>>> INIT_LIST_HEAD(&plug->list);
>>> + INIT_LIST_HEAD(&plug->cb_list);
>>> plug->should_sort = 0;
>>>
>>> /*
>>> @@ -2717,9 +2718,21 @@ static void flush_plug_list(struct blk_plug *plug)
>>> local_irq_restore(flags);
>>> }
>>>
>>> +static void flush_plug_callbacks(struct blk_plug *plug)
>>> +{
>>> + while (!list_empty(&plug->cb_list)) {
>>> + struct blk_plug_cb *cb = list_first_entry(&plug->cb_list,
>>> + struct blk_plug_cb,
>>> + list);
>>> + list_del(&cb->list);
>>> + cb->callback(cb);
>>> + }
>>> +}
>>> +
>>> static void __blk_finish_plug(struct task_struct *tsk, struct blk_plug *plug)
>>> {
>>> flush_plug_list(plug);
>>> + flush_plug_callbacks(plug);
>>>
>>> if (plug == tsk->plug)
>>> tsk->plug = NULL;
>>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>>> index 32176cc..3e5e604 100644
>>> --- a/include/linux/blkdev.h
>>> +++ b/include/linux/blkdev.h
>>> @@ -857,8 +857,13 @@ extern void blk_put_queue(struct request_queue *);
>>> struct blk_plug {
>>> unsigned long magic;
>>> struct list_head list;
>>> + struct list_head cb_list;
>>> unsigned int should_sort;
>>> };
>>> +struct blk_plug_cb {
>>> + struct list_head list;
>>> + void (*callback)(struct blk_plug_cb *);
>>> +};
>>>
>>> extern void blk_start_plug(struct blk_plug *);
>>> extern void blk_finish_plug(struct blk_plug *);
>>> @@ -876,7 +881,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
>>> {
>>> struct blk_plug *plug = tsk->plug;
>>>
>>> - return plug && !list_empty(&plug->list);
>>> + return plug && (!list_empty(&plug->list) || !list_empty(&plug->cb_list));
>>> }
>>>
>>> /*
>>
>> Maybe I'm missing something, but why do you need those callbacks? If
>> it's to use plugging yourself, perhaps we can just ensure that those
>> don't get assigned in the task - so it would be have to used with care.
>>
>> It's not that I disagree to these callbacks, I just want to ensure I
>> understand why you need them.
>>
>
> I'm sure one of us is missing something (probably both) but I'm not
> sure what.
>
> The callback is central.
>
> It is simply to use plugging in md.
> Just like blk-core, md will notice that a blk_plug is active and will put
> requests aside. I then need something to call in to md when blk_finish_plug
But this is done in __make_request(), so md devices should not be
affected at all. This is the part of your explanation that I do not
connect with the code.
If md itself is putting things on the plug list, why is it doing that?
> is called so that put-aside requests can be released.
> As md can be built as a module, that call must be a call-back of some sort.
> blk-core doesn't need to register blk_plug_flush because that is never in a
> module, so it can be called directly. But the md equivalent could be in a
> module, so I need to be able to register a call back.
>
> Does that help?
Not really. Is the problem that _you_ would like to stash things aside,
not the fact that __make_request() puts things on a task plug list?
--
Jens Axboe
On Mon, 11 Apr 2011 13:04:26 +0200 Jens Axboe <[email protected]> wrote:
> >
> > I'm sure one of us is missing something (probably both) but I'm not
> > sure what.
> >
> > The callback is central.
> >
> > It is simply to use plugging in md.
> > Just like blk-core, md will notice that a blk_plug is active and will put
> > requests aside. I then need something to call in to md when blk_finish_plug
>
> But this is done in __make_request(), so md devices should not be
> affected at all. This is the part of your explanation that I do not
> connect with the code.
>
> If md itself is putting things on the plug list, why is it doing that?
Yes. Exactly. md itself wants to put things aside on some list.
e.g. in RAID1 when using a write-intent bitmap I want to gather as many write
requests as possible so I can update the bits for all of them at once.
So when a plug is in effect I just queue the bios somewhere and record the
bits that need to be set.
Then when the unplug happens I write out the bitmap updates in a single write
and when that completes, I write out the data (to all devices).
Also in RAID5 it is good if I can wait for lots of write requests to arrive
before committing any of them to increase the possibility of getting a
full-stripe write.
Previously I used ->unplug_fn to release the queued requests. Now that has
gone I need a different way to register a callback when an unplug happens.
>
> > is called so that put-aside requests can be released.
> > As md can be built as a module, that call must be a call-back of some sort.
> > blk-core doesn't need to register blk_plug_flush because that is never in a
> > module, so it can be called directly. But the md equivalent could be in a
> > module, so I need to be able to register a call back.
> >
> > Does that help?
>
> Not really. Is the problem that _you_ would like to stash things aside,
> not the fact that __make_request() puts things on a task plug list?
>
Yes, exactly. I (in md) want to stash things aside.
(I don't actually put the stashed things on the blk_plug, though it might
make sense to do that later in some cases - I'm not sure. Currently I stash
things in my own internal lists and just need a call back to say "ok, flush
those lists now").
Thanks,
NeilBrown
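A rough sketch of the stash-and-release pattern described above, for the RAID1
write-intent-bitmap case. None of these names are real md code:
bitmap_record_bits, bitmap_flush_bits, struct r1conf and its pending_writes
bio_list are made up for illustration:

/* While a plug is active: remember the bio and the bitmap bits it needs. */
static void raid1_queue_write(struct r1conf *conf, struct bio *bio)
{
	bitmap_record_bits(conf, bio->bi_sector, bio_sectors(bio));
	bio_list_add(&conf->pending_writes, bio);
}

/* On unplug: one write covering all the batched bitmap updates, then the data.
 * (A real implementation waits for the bitmap write to complete before the
 * data goes out.) */
static void raid1_release_writes(struct r1conf *conf)
{
	struct bio *bio;

	bitmap_flush_bits(conf);		/* single write for every queued bio */

	while ((bio = bio_list_pop(&conf->pending_writes)) != NULL)
		generic_make_request(bio);	/* now send the data to all devices */
}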
On 2011-04-11 13:26, NeilBrown wrote:
> On Mon, 11 Apr 2011 13:04:26 +0200 Jens Axboe <[email protected]> wrote:
>
>>>
>>> I'm sure one of us is missing something (probably both) but I'm not
>>> sure what.
>>>
>>> The callback is central.
>>>
>>> It is simply to use plugging in md.
>>> Just like blk-core, md will notice that a blk_plug is active and will put
>>> requests aside. I then need something to call in to md when blk_finish_plug
>>
>> But this is done in __make_request(), so md devices should not be
>> affected at all. This is the part of your explanation that I do not
>> connect with the code.
>>
>> If md itself is putting things on the plug list, why is it doing that?
>
> Yes. Exactly. md itself want to put things aside on some list.
> e.g. in RAID1 when using a write-intent bitmap I want to gather as many write
> requests as possible so I can update the bits for all of them at once.
> So when a plug is in effect I just queue the bios somewhere and record the
> bits that need to be set.
> Then when the unplug happens I write out the bitmap updates in a single write
> and when that completes, I write out the data (to all devices).
>
> Also in RAID5 it is good if I can wait for lots of write request to arrive
> before committing any of them to increase the possibility of getting a
> full-stripe write.
>
> Previously I used ->unplug_fn to release the queued requests. Now that has
> gone I need a different way to register a callback when an unplug happens.
Ah, so this is what I was hinting at. But why use the task->plug for
that? Seems a bit counter intuitive. Why can't you just store these
internally?
>
>>
>>> is called so that put-aside requests can be released.
>>> As md can be built as a module, that call must be a call-back of some sort.
>>> blk-core doesn't need to register blk_plug_flush because that is never in a
>>> module, so it can be called directly. But the md equivalent could be in a
>>> module, so I need to be able to register a call back.
>>>
>>> Does that help?
>>
>> Not really. Is the problem that _you_ would like to stash things aside,
>> not the fact that __make_request() puts things on a task plug list?
>>
>
> Yes, exactly. I (in md) want to stash things aside.
>
> (I don't actually put the stashed things on the blk_plug, though it might
> make sense to do that later in some cases - I'm not sure. Currently I stash
> things in my own internal lists and just need a call back to say "ok, flush
> those lists now").
So we are making some progress... The thing I then don't understand is
why you want to make it associated with the plug. Seems you don't have
any scheduling restrictions, in which case just storing them in md
seems like a much better option.
--
Jens Axboe
On Mon, 11 Apr 2011 20:59:28 +1000 NeilBrown <[email protected]> wrote:
> On Mon, 11 Apr 2011 11:19:58 +0200 Jens Axboe <[email protected]> wrote:
>
> > On 2011-04-11 06:50, NeilBrown wrote:
>
> > > The only explanation I can come up with is that very occasionally schedule on
> > > 2 separate cpus calls blk_flush_plug for the same task. I don't understand
> > > the scheduler nearly well enough to know if or how that can happen.
> > > However with this patch in place I can write to a RAID1 constantly for half
> > > an hour, and without it, the write rarely lasts for 3 minutes.
> >
> > Or perhaps if the request_fn blocks, that would be problematic. So the
> > patch is likely a good idea even for that case.
> >
> > I'll merge it, changing it to list_splice_init() as I think that would
> > be more clear.
>
> OK - though I'm not 100% the patch fixes the problem - just that it hides the
> symptom for me.
> I might try instrumenting the code a bit more and see if I can find exactly
> where it is re-entering flush_plug_list - as that seems to be what is
> happening.
OK, I found how it re-enters.
The request_fn doesn't exactly block, but when scsi_request_fn calls
spin_unlock_irq, this calls preempt_enable which can call schedule, which is
a recursive call.
The patch I provided will stop that from recursing again as the blk_plug.list
will be empty.
So it is almost what you suggested; however, the request_fn doesn't block, it
just enables preempt.
So the comment I would put at the top of that patch would be something like:
From: NeilBrown <[email protected]>
As the request_fn called by __blk_run_queue is allowed to 'schedule()' (after
dropping the queue lock of course), it is possible to get a recursive call:
schedule -> blk_flush_plug -> __blk_finish_plug -> flush_plug_list
-> __blk_run_queue -> request_fn -> schedule
We must make sure that the second schedule does not call into blk_flush_plug
again. So instead of leaving the list of requests on blk_plug->list, move
them to a separate list, leaving blk_plug->list empty.
Signed-off-by: NeilBrown <[email protected]>
Thanks,
NeilBrown
On Mon, 11 Apr 2011 13:37:20 +0200 Jens Axboe <[email protected]> wrote:
> On 2011-04-11 13:26, NeilBrown wrote:
> > On Mon, 11 Apr 2011 13:04:26 +0200 Jens Axboe <[email protected]> wrote:
> >
> >>>
> >>> I'm sure one of us is missing something (probably both) but I'm not
> >>> sure what.
> >>>
> >>> The callback is central.
> >>>
> >>> It is simply to use plugging in md.
> >>> Just like blk-core, md will notice that a blk_plug is active and will put
> >>> requests aside. I then need something to call in to md when blk_finish_plug
> >>
> >> But this is done in __make_request(), so md devices should not be
> >> affected at all. This is the part of your explanation that I do not
> >> connect with the code.
> >>
> >> If md itself is putting things on the plug list, why is it doing that?
> >
> > Yes. Exactly. md itself want to put things aside on some list.
> > e.g. in RAID1 when using a write-intent bitmap I want to gather as many write
> > requests as possible so I can update the bits for all of them at once.
> > So when a plug is in effect I just queue the bios somewhere and record the
> > bits that need to be set.
> > Then when the unplug happens I write out the bitmap updates in a single write
> > and when that completes, I write out the data (to all devices).
> >
> > Also in RAID5 it is good if I can wait for lots of write request to arrive
> > before committing any of them to increase the possibility of getting a
> > full-stripe write.
> >
> > Previously I used ->unplug_fn to release the queued requests. Now that has
> > gone I need a different way to register a callback when an unplug happens.
>
> Ah, so this is what I was hinting at. But why use the task->plug for
> that? Seems a bit counter intuitive. Why can't you just store these
> internally?
>
> >
> >>
> >>> is called so that put-aside requests can be released.
> >>> As md can be built as a module, that call must be a call-back of some sort.
> >>> blk-core doesn't need to register blk_plug_flush because that is never in a
> >>> module, so it can be called directly. But the md equivalent could be in a
> >>> module, so I need to be able to register a call back.
> >>>
> >>> Does that help?
> >>
> >> Not really. Is the problem that _you_ would like to stash things aside,
> >> not the fact that __make_request() puts things on a task plug list?
> >>
> >
> > Yes, exactly. I (in md) want to stash things aside.
> >
> > (I don't actually put the stashed things on the blk_plug, though it might
> > make sense to do that later in some cases - I'm not sure. Currently I stash
> > things in my own internal lists and just need a call back to say "ok, flush
> > those lists now").
>
> So we are making some progress... The thing I then don't understand is
> why you want to make it associated with the plug? Seems you don't have
> any scheduling restrictions, and in which case just storing them in md
> seems like a much better option.
>
Yes. But I need to know when to release the requests that I have stored.
I need to know when ->write_pages or ->read_pages or whatever has finished
submitting a pile of pages so that I can start processing the requests that I
have put aside. So I need a callback from blk_finish_plug.
(and I also need to know if a thread that was plugging schedules for the same
reason that you do).
NeilBrown
On 2011-04-11 14:05, NeilBrown wrote:
> On Mon, 11 Apr 2011 13:37:20 +0200 Jens Axboe <[email protected]> wrote:
>
>> On 2011-04-11 13:26, NeilBrown wrote:
>>> On Mon, 11 Apr 2011 13:04:26 +0200 Jens Axboe <[email protected]> wrote:
>>>
>>>>>
>>>>> I'm sure one of us is missing something (probably both) but I'm not
>>>>> sure what.
>>>>>
>>>>> The callback is central.
>>>>>
>>>>> It is simply to use plugging in md.
>>>>> Just like blk-core, md will notice that a blk_plug is active and will put
>>>>> requests aside. I then need something to call in to md when blk_finish_plug
>>>>
>>>> But this is done in __make_request(), so md devices should not be
>>>> affected at all. This is the part of your explanation that I do not
>>>> connect with the code.
>>>>
>>>> If md itself is putting things on the plug list, why is it doing that?
>>>
>>> Yes. Exactly. md itself want to put things aside on some list.
>>> e.g. in RAID1 when using a write-intent bitmap I want to gather as many write
>>> requests as possible so I can update the bits for all of them at once.
>>> So when a plug is in effect I just queue the bios somewhere and record the
>>> bits that need to be set.
>>> Then when the unplug happens I write out the bitmap updates in a single write
>>> and when that completes, I write out the data (to all devices).
>>>
>>> Also in RAID5 it is good if I can wait for lots of write request to arrive
>>> before committing any of them to increase the possibility of getting a
>>> full-stripe write.
>>>
>>> Previously I used ->unplug_fn to release the queued requests. Now that has
>>> gone I need a different way to register a callback when an unplug happens.
>>
>> Ah, so this is what I was hinting at. But why use the task->plug for
>> that? Seems a bit counter intuitive. Why can't you just store these
>> internally?
>>
>>>
>>>>
>>>>> is called so that put-aside requests can be released.
>>>>> As md can be built as a module, that call must be a call-back of some sort.
>>>>> blk-core doesn't need to register blk_plug_flush because that is never in a
>>>>> module, so it can be called directly. But the md equivalent could be in a
>>>>> module, so I need to be able to register a call back.
>>>>>
>>>>> Does that help?
>>>>
>>>> Not really. Is the problem that _you_ would like to stash things aside,
>>>> not the fact that __make_request() puts things on a task plug list?
>>>>
>>>
>>> Yes, exactly. I (in md) want to stash things aside.
>>>
>>> (I don't actually put the stashed things on the blk_plug, though it might
>>> make sense to do that later in some cases - I'm not sure. Currently I stash
>>> things in my own internal lists and just need a call back to say "ok, flush
>>> those lists now").
>>
>> So we are making some progress... The thing I then don't understand is
>> why you want to make it associated with the plug? Seems you don't have
>> any scheduling restrictions, and in which case just storing them in md
>> seems like a much better option.
>>
>
> Yes. But I need to know when to release the requests that I have stored.
> I need to know when ->write_pages or ->read_pages or whatever has finished
> submitting a pile of pages so that I can start processing the request that I
> have put aside. So I need a callback from blk_finish_plug.
OK fair enough, I'll add your callback patch.
--
Jens Axboe
On 2011-04-11 13:55, NeilBrown wrote:
> On Mon, 11 Apr 2011 20:59:28 +1000 NeilBrown <[email protected]> wrote:
>
>> On Mon, 11 Apr 2011 11:19:58 +0200 Jens Axboe <[email protected]> wrote:
>>
>>> On 2011-04-11 06:50, NeilBrown wrote:
>>
>>>> The only explanation I can come up with is that very occasionally schedule on
>>>> 2 separate cpus calls blk_flush_plug for the same task. I don't understand
>>>> the scheduler nearly well enough to know if or how that can happen.
>>>> However with this patch in place I can write to a RAID1 constantly for half
>>>> an hour, and without it, the write rarely lasts for 3 minutes.
>>>
>>> Or perhaps if the request_fn blocks, that would be problematic. So the
>>> patch is likely a good idea even for that case.
>>>
>>> I'll merge it, changing it to list_splice_init() as I think that would
>>> be more clear.
>>
>> OK - though I'm not 100% the patch fixes the problem - just that it hides the
>> symptom for me.
>> I might try instrumenting the code a bit more and see if I can find exactly
>> where it is re-entering flush_plug_list - as that seems to be what is
>> happening.
>
> OK, I found how it re-enters.
>
> The request_fn doesn't exactly block, but when scsi_request_fn calls
> spin_unlock_irq, this calls preempt_enable which can call schedule, which is
> a recursive call.
>
> The patch I provided will stop that from recursing again as the blk_plug.list
> will be empty.
>
> So it is almost what you suggested, however the request_fn doesn't block, it
> just enabled preempt.
>
>
> So the comment I would put at the top of that patch would be something like:
Ah, so it was pretty close. That does explain it. I've already queued up
the patch, I'll amend the commit message.
--
Jens Axboe
On Mon, 11 Apr 2011 14:11:58 +0200 Jens Axboe <[email protected]> wrote:
> > Yes. But I need to know when to release the requests that I have stored.
> > I need to know when ->write_pages or ->read_pages or whatever has finished
> > submitting a pile of pages so that I can start processing the request that I
> > have put aside. So I need a callback from blk_finish_plug.
>
> OK fair enough, I'll add your callback patch.
>
Thanks. I'll queue up my md fixes to follow it once it gets to -linus.
NeilBrown
On 2011-04-11 14:36, NeilBrown wrote:
> On Mon, 11 Apr 2011 14:11:58 +0200 Jens Axboe <[email protected]> wrote:
>
>>> Yes. But I need to know when to release the requests that I have stored.
>>> I need to know when ->write_pages or ->read_pages or whatever has finished
>>> submitting a pile of pages so that I can start processing the request that I
>>> have put aside. So I need a callback from blk_finish_plug.
>>
>> OK fair enough, I'll add your callback patch.
>>
>
> Thanks. I'll queue up my md fixes to follow it once it gets to -linus.
Great, once you do that and XFS kills the blk_flush_plug() calls too,
then we can remove that export and make it internal only.
--
Jens Axboe
On Mon, Apr 11, 2011 at 02:50:22PM +1000, NeilBrown wrote:
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 273d60b..903ce8d 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -2674,19 +2674,23 @@ static void flush_plug_list(struct blk_plug *plug)
> struct request_queue *q;
> unsigned long flags;
> struct request *rq;
> + struct list_head head;
>
> BUG_ON(plug->magic != PLUG_MAGIC);
>
> if (list_empty(&plug->list))
> return;
> + list_add(&head, &plug->list);
> + list_del_init(&plug->list);
>
> if (plug->should_sort)
> - list_sort(NULL, &plug->list, plug_rq_cmp);
> + list_sort(NULL, &head, plug_rq_cmp);
> + plug->should_sort = 0;
As Jens mentioned this should be list_splice_init. But looking over
flush_plug_list the code there seems strange to me.
What does the local_irq_save in flush_plug_list protect? Why don't
we need it over the list_sort? And do we still need it when first
splicing the list to a local one?
It's one of these cases where I'd really like to see more comments
explaining why the code is doing what it's doing.
On Mon, 11 Apr 2011 12:59:23 -0400 "[email protected]" <[email protected]>
wrote:
> On Mon, Apr 11, 2011 at 02:50:22PM +1000, NeilBrown wrote:
> > diff --git a/block/blk-core.c b/block/blk-core.c
> > index 273d60b..903ce8d 100644
> > --- a/block/blk-core.c
> > +++ b/block/blk-core.c
> > @@ -2674,19 +2674,23 @@ static void flush_plug_list(struct blk_plug *plug)
> > struct request_queue *q;
> > unsigned long flags;
> > struct request *rq;
> > + struct list_head head;
> >
> > BUG_ON(plug->magic != PLUG_MAGIC);
> >
> > if (list_empty(&plug->list))
> > return;
> > + list_add(&head, &plug->list);
> > + list_del_init(&plug->list);
> >
> > if (plug->should_sort)
> > - list_sort(NULL, &plug->list, plug_rq_cmp);
> > + list_sort(NULL, &head, plug_rq_cmp);
> > + plug->should_sort = 0;
>
> As Jens mentioned this should be list_splice_init. But looking over
> flush_plug_list the code there seems strange to me.
>
> What does the local_irq_save in flush_plug_list protect? Why don't
> we need it over the list_sort? And do we still need it when first
> splicing the list to a local one?
>
> It's one of these cases where I'd really like to see more comments
> explaining why the code is doing what it's doing.
My understanding of that was that the calling requirement of
__elv_add_request is that the queue spinlock is held and that interrupts are
disabled.
So rather than possibly enabling and disabling interrupts several times as
different queues are handled, the code just disables interrupts once, and
then just takes the spinlock once for each different queue.
The whole point of the change to plugging was to take locks less often.
Disabling interrupts less often is presumably an analogous goal.
Though I agree that a comment would help.
q = NULL;
+ /* Disable interrupts just once rather than using spin_lock_irq/spin_unlock_irq
* variants
*/
local_irq_save(flags);
assuming my analysis is correct.
NeilBrown
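Spelled out, the dispatch loop has roughly this shape (a paraphrase of
flush_plug_list for illustration only; the __elv_add_request() arguments and the
queue run that happens when switching queues are abridged):

	q = NULL;
	local_irq_save(flags);		/* interrupts off once for the whole pass */
	while (!list_empty(&head)) {
		rq = list_entry_rq(head.next);
		list_del_init(&rq->queuelist);
		if (rq->q != q) {
			if (q)
				spin_unlock(q->queue_lock);	/* irqs stay disabled */
			q = rq->q;
			spin_lock(q->queue_lock);	/* plain spin_lock is enough */
		}
		/* queue_lock held and interrupts disabled, as __elv_add_request() requires */
		__elv_add_request(q, rq, ELEVATOR_INSERT_SORT);
	}
	if (q)
		spin_unlock(q->queue_lock);
	local_irq_restore(flags);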
Looking at the patch
(http://git.kernel.dk/?p=linux-2.6-block.git;a=commitdiff;h=761e433f3de6fb8e369af9e5c08beb86286d023f)
I'm not sure it's an optimal design. The flush callback really
is a per-queue thing. Why isn't it a function pointer in the request
queue, called when doing the blk_run_queue call once we're done with a given
queue before moving on to the next one?
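Concretely, the per-queue variant suggested here is roughly the shape of the
queue_unplugged() helper that shows up in the diff further down this thread;
a sketch with explanatory comments, not the final code:

static void queue_unplugged(struct request_queue *q, unsigned int depth)
{
	trace_block_unplug_io(q, depth);
	__blk_run_queue(q, false);	/* dispatch what this plug just added */

	if (q->unplugged_fn)		/* per-queue hook, e.g. for md/stacked drivers */
		q->unplugged_fn(q);	/* release any requests the driver put aside */
}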
On Tue, Apr 12, 2011 at 07:14:28AM +1000, NeilBrown wrote:
>
> My understanding of that was that the calling requirement of
> __elv_add_request is that the queue spinlock is held and that interrupts are
> disabled.
> So rather than possible enabling and disabling interrupts several times as
> different queue are handled, the code just disabled interrupts once, and
> then just take the spinlock once for each different queue.
>
> The whole point of the change to plugging was to take locks less often.
> Disabling interrupts less often is presumably an analogous goal.
>
> Though I agree that a comment would help.
>
> q = NULL;
> + /* Disable interrupts just once rather than using spin_lock_irq/sin_unlock_irq
> * variants
> */
> local_irq_save(flags);
>
>
> assuming my analysis is correct.
Your explanation does make sense to me now that you explain it. I
didn't even think of that variant before.
On Mon, Apr 11, 2011 at 02:48:45PM +0200, Jens Axboe wrote:
> Great, once you do that and XFS kills the blk_flush_plug() calls too,
> then we can remove that export and make it internal only.
Linus pulled the tree, so they are gone now. Btw, there are still some
bits in the area that confuse me:
- what's the point of queue_sync_plugs? It has a lot of comments
that seem to pre-date the onstack plugging, but except for that
it's a trivial wrapper around blk_flush_plug, with an argument
that is not used.
- is there a good reason for the existence of __blk_flush_plug? You'd
get one additional instruction in the inlined version of
blk_flush_plug when opencoding, but avoid the need for chained
function calls.
- Why is having a plug in blk_flush_plug marked unlikely? Note that
unlikely is the static branch prediction hint to mark the case
extremely unlikely and is even used for hot/cold partitioning. But
when we call it we usually check beforehand if we actually have
plugs, so it's actually likely to happen.
- what is the point of blk_finish_plug? All callers have
the plug on stack, and there's no good reason for adding the NULL
check. Note that blk_start_plug doesn't have the NULL check either.
- Why does __blk_flush_plug call __blk_finish_plug which might clear
tsk->plug, just to set it back after the call? When manually inlining
__blk_finish_plug into __blk_flush_plug it looks like:
void __blk_flush_plug(struct task_struct *tsk, struct blk_plug *plug)
{
flush_plug_list(plug);
if (plug == tsk->plug)
tsk->plug = NULL;
tsk->plug = plug;
}
it would seem much smarter to just call flush_plug_list directly.
In fact it seems like the tsk->plug is not necessary at all and
all remaining __blk_flush_plug callers could be replaced with
flush_plug_list.
- and of course the remaining issue of why io_schedule needs an
explicit blk_flush_plug when schedule() already does one in
case it actually needs to schedule.
On 2011-04-11 23:14, NeilBrown wrote:
> On Mon, 11 Apr 2011 12:59:23 -0400 "[email protected]" <[email protected]>
> wrote:
>
>> On Mon, Apr 11, 2011 at 02:50:22PM +1000, NeilBrown wrote:
>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>> index 273d60b..903ce8d 100644
>>> --- a/block/blk-core.c
>>> +++ b/block/blk-core.c
>>> @@ -2674,19 +2674,23 @@ static void flush_plug_list(struct blk_plug *plug)
>>> struct request_queue *q;
>>> unsigned long flags;
>>> struct request *rq;
>>> + struct list_head head;
>>>
>>> BUG_ON(plug->magic != PLUG_MAGIC);
>>>
>>> if (list_empty(&plug->list))
>>> return;
>>> + list_add(&head, &plug->list);
>>> + list_del_init(&plug->list);
>>>
>>> if (plug->should_sort)
>>> - list_sort(NULL, &plug->list, plug_rq_cmp);
>>> + list_sort(NULL, &head, plug_rq_cmp);
>>> + plug->should_sort = 0;
>>
>> As Jens mentioned this should be list_splice_init. But looking over
>> flush_plug_list the code there seems strange to me.
>>
>> What does the local_irq_save in flush_plug_list protect? Why don't
>> we need it over the list_sort? And do we still need it when first
>> splicing the list to a local one?
>>
>> It's one of these cases where I'd really like to see more comments
>> explaining why the code is doing what it's doing.
>
> My understanding of that was that the calling requirement of
> __elv_add_request is that the queue spinlock is held and that interrupts are
> disabled.
> So rather than possible enabling and disabling interrupts several times as
> different queue are handled, the code just disabled interrupts once, and
> then just take the spinlock once for each different queue.
>
> The whole point of the change to plugging was to take locks less often.
> Disabling interrupts less often is presumably an analogous goal.
>
> Though I agree that a comment would help.
>
> q = NULL;
> + /* Disable interrupts just once rather than using spin_lock_irq/sin_unlock_irq
> * variants
> */
> local_irq_save(flags);
>
>
> assuming my analysis is correct.
Yep that is correct, it's to avoid juggling irq on and off for multiple
queues. I will put a comment there.
--
Jens Axboe
On 2011-04-12 00:58, [email protected] wrote:
> Looking at the patch
> (http://git.kernel.dk/?p=linux-2.6-block.git;a=commitdiff;h=761e433f3de6fb8e369af9e5c08beb86286d023f)
>
> I'm not sure it's an optimal design. The flush callback really
> is a per-queue thing. Why isn't it a function pointer in the request
> queue when doing the blk_run_queue call once we're done with a given
> queue before moving on to the next one?
I was thinking about this yesterday as well; the design didn't quite
feel right. Additionally, the user now must track this state too,
and whether he's plugged on that task or not.
I'll rewrite this.
--
Jens Axboe
On 2011-04-12 03:12, [email protected] wrote:
> On Mon, Apr 11, 2011 at 02:48:45PM +0200, Jens Axboe wrote:
>> Great, once you do that and XFS kills the blk_flush_plug() calls too,
>> then we can remove that export and make it internal only.
>
> Linus pulled the tree, so they are gone now. Btw, there's still some
> bits in the area that confuse me:
Great!
> - what's the point of the queue_sync_plugs? It has a lot of comment
> that seem to pre-data the onstack plugging, but except for that
> it's trivial wrapper around blk_flush_plug, with an argument
> that is not used.
There's really no point to it anymore. Its existence was due to the
older revision that had to track write requests for serializing around
a barrier. I'll kill it, since we don't do that anymore.
> - is there a good reason for the existance of __blk_flush_plug? You'd
> get one additional instruction in the inlined version of
> blk_flush_plug when opencoding, but avoid the need for chained
> function calls.
> - Why is having a plug in blk_flush_plug marked unlikely? Note that
> unlikely is the static branch prediction hint to mark the case
> extremly unlikely and is even used for hot/cold partitioning. But
> when we call it we usually check beforehand if we actually have
> plugs, so it's actually likely to happen.
The existence and the out-of-line call are for the schedule() hook. It should be
an unlikely event to schedule with a plug held; normally the plug should
have been explicitly unplugged before that happens.
> - what is the point of blk_finish_plug? All callers have
> the plug on stack, and there's no good reason for adding the NULL
> check. Note that blk_start_plug doesn't have the NULL check either.
That one can probably go, I need to double check that part since some
things changed.
> - Why does __blk_flush_plug call __blk_finish_plug which might clear
> tsk->plug, just to set it back after the call? When manually inlining
> __blk_finish_plug ino __blk_flush_plug it looks like:
>
> void __blk_flush_plug(struct task_struct *tsk, struct blk_plug *plug)
> {
> flush_plug_list(plug);
> if (plug == tsk->plug)
> tsk->plug = NULL;
> tsk->plug = plug;
> }
>
> it would seem much smarted to just call flush_plug_list directly.
> In fact it seems like the tsk->plug is not nessecary at all and
> all remaining __blk_flush_plug callers could be replaced with
> flush_plug_list.
It depends on whether this was an explicit unplug (eg
blk_finish_plug()), or whether it was an implicit event (eg on
schedule()). If we do it on schedule(), then we retain the plug after
the flush. Otherwise we clear it.
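Paraphrasing the code quoted above, the two paths differ only in what happens
to tsk->plug afterwards (blk_finish_plug() shown here in simplified form):

/* Explicit unplug: the plugged section is over, so tsk->plug gets cleared. */
void blk_finish_plug(struct blk_plug *plug)
{
	if (plug)
		__blk_finish_plug(current, plug);
}

/* Implicit unplug from schedule(): flush the pending requests, but keep
 * tsk->plug set, because the task is still inside its plugged section and
 * may queue more IO after it is scheduled back in. */
void __blk_flush_plug(struct task_struct *tsk, struct blk_plug *plug)
{
	__blk_finish_plug(tsk, plug);	/* flushes and clears tsk->plug */
	tsk->plug = plug;		/* restore it: this was only an implicit flush */
}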
> - and of course the remaining issue of why io_schedule needs an
> expliciy blk_flush_plug when schedule() already does one in
> case it actually needs to schedule.
Already answered in other email.
--
Jens Axboe
On Tue, Apr 12, 2011 at 10:36:30AM +0200, Jens Axboe wrote:
> On 2011-04-12 03:12, [email protected] wrote:
> > On Mon, Apr 11, 2011 at 02:48:45PM +0200, Jens Axboe wrote:
> >> Great, once you do that and XFS kills the blk_flush_plug() calls too,
> >> then we can remove that export and make it internal only.
> >
> > Linus pulled the tree, so they are gone now. Btw, there's still some
> > bits in the area that confuse me:
>
> Great!
>
> > - what's the point of the queue_sync_plugs? It has a lot of comment
> > that seem to pre-data the onstack plugging, but except for that
> > it's trivial wrapper around blk_flush_plug, with an argument
> > that is not used.
>
> There's really no point to it anymore. It's existance was due to the
> older revision that had to track write requests for serializaing around
> a barrier. I'll kill it, since we don't do that anymore.
>
> > - is there a good reason for the existance of __blk_flush_plug? You'd
> > get one additional instruction in the inlined version of
> > blk_flush_plug when opencoding, but avoid the need for chained
> > function calls.
> > - Why is having a plug in blk_flush_plug marked unlikely? Note that
> > unlikely is the static branch prediction hint to mark the case
> > extremly unlikely and is even used for hot/cold partitioning. But
> > when we call it we usually check beforehand if we actually have
> > plugs, so it's actually likely to happen.
>
> The existance and out-of-line is for the scheduler() hook. It should be
> an unlikely event to schedule with a plug held, normally the plug should
> have been explicitly unplugged before that happens.
Though if it does, haven't you just added a significant amount of
depth to the worst case stack usage? I'm seeing this sort of thing
from io_schedule():
Depth Size Location (40 entries)
----- ---- --------
0) 4256 16 mempool_alloc_slab+0x15/0x20
1) 4240 144 mempool_alloc+0x63/0x160
2) 4096 16 scsi_sg_alloc+0x4c/0x60
3) 4080 112 __sg_alloc_table+0x66/0x140
4) 3968 32 scsi_init_sgtable+0x33/0x90
5) 3936 48 scsi_init_io+0x31/0xc0
6) 3888 32 scsi_setup_fs_cmnd+0x79/0xe0
7) 3856 112 sd_prep_fn+0x150/0xa90
8) 3744 48 blk_peek_request+0x6a/0x1f0
9) 3696 96 scsi_request_fn+0x60/0x510
10) 3600 32 __blk_run_queue+0x57/0x100
11) 3568 80 flush_plug_list+0x133/0x1d0
12) 3488 32 __blk_flush_plug+0x24/0x50
13) 3456 32 io_schedule+0x79/0x80
(This is from a page fault on ext3 that is doing page cache
readahead and blocking on a locked buffer.)
I've seen traces where mempool_alloc_slab enters direct reclaim
which adds another 1.5k of stack usage to this path. So I'm
extremely concerned that you've just reduced the stack available to
every thread by at least 2.5k of space...
Cheers,
Dave.
--
Dave Chinner
[email protected]
On 2011-04-12 14:22, Dave Chinner wrote:
> On Tue, Apr 12, 2011 at 10:36:30AM +0200, Jens Axboe wrote:
>> On 2011-04-12 03:12, [email protected] wrote:
>>> On Mon, Apr 11, 2011 at 02:48:45PM +0200, Jens Axboe wrote:
>>>> Great, once you do that and XFS kills the blk_flush_plug() calls too,
>>>> then we can remove that export and make it internal only.
>>>
>>> Linus pulled the tree, so they are gone now. Btw, there's still some
>>> bits in the area that confuse me:
>>
>> Great!
>>
>>> - what's the point of the queue_sync_plugs? It has a lot of comment
>>> that seem to pre-data the onstack plugging, but except for that
>>> it's trivial wrapper around blk_flush_plug, with an argument
>>> that is not used.
>>
>> There's really no point to it anymore. It's existance was due to the
>> older revision that had to track write requests for serializaing around
>> a barrier. I'll kill it, since we don't do that anymore.
>>
>>> - is there a good reason for the existance of __blk_flush_plug? You'd
>>> get one additional instruction in the inlined version of
>>> blk_flush_plug when opencoding, but avoid the need for chained
>>> function calls.
>>> - Why is having a plug in blk_flush_plug marked unlikely? Note that
>>> unlikely is the static branch prediction hint to mark the case
>>> extremly unlikely and is even used for hot/cold partitioning. But
>>> when we call it we usually check beforehand if we actually have
>>> plugs, so it's actually likely to happen.
>>
>> The existance and out-of-line is for the scheduler() hook. It should be
>> an unlikely event to schedule with a plug held, normally the plug should
>> have been explicitly unplugged before that happens.
>
> Though if it does, haven't you just added a significant amount of
> depth to the worst case stack usage? I'm seeing this sort of thing
> from io_schedule():
>
> Depth Size Location (40 entries)
> ----- ---- --------
> 0) 4256 16 mempool_alloc_slab+0x15/0x20
> 1) 4240 144 mempool_alloc+0x63/0x160
> 2) 4096 16 scsi_sg_alloc+0x4c/0x60
> 3) 4080 112 __sg_alloc_table+0x66/0x140
> 4) 3968 32 scsi_init_sgtable+0x33/0x90
> 5) 3936 48 scsi_init_io+0x31/0xc0
> 6) 3888 32 scsi_setup_fs_cmnd+0x79/0xe0
> 7) 3856 112 sd_prep_fn+0x150/0xa90
> 8) 3744 48 blk_peek_request+0x6a/0x1f0
> 9) 3696 96 scsi_request_fn+0x60/0x510
> 10) 3600 32 __blk_run_queue+0x57/0x100
> 11) 3568 80 flush_plug_list+0x133/0x1d0
> 12) 3488 32 __blk_flush_plug+0x24/0x50
> 13) 3456 32 io_schedule+0x79/0x80
>
> (This is from a page fault on ext3 that is doing page cache
> readahead and blocking on a locked buffer.)
>
> I've seen traces where mempool_alloc_slab enters direct reclaim
> which adds another 1.5k of stack usage to this path. So I'm
> extremely concerned that you've just reduced the stack available to
> every thread by at least 2.5k of space...
Yeah, that does not look great. If this turns out to be problematic, we
can turn the queue runs from the unlikely case into out-of-line runs from
kblockd.
But this really isn't that new, you could enter the IO dispatch path
when doing IO already (when submitting it). So we better be able to
handle that.
If it's a problem from the schedule()/io_schedule() path, then let's
ensure that those are truly unlikely events so we can punt them to
kblockd.
--
Jens Axboe
On Tue, Apr 12, 2011 at 02:28:31PM +0200, Jens Axboe wrote:
> On 2011-04-12 14:22, Dave Chinner wrote:
> > On Tue, Apr 12, 2011 at 10:36:30AM +0200, Jens Axboe wrote:
> >> On 2011-04-12 03:12, [email protected] wrote:
> >>> On Mon, Apr 11, 2011 at 02:48:45PM +0200, Jens Axboe wrote:
> >>>> Great, once you do that and XFS kills the blk_flush_plug() calls too,
> >>>> then we can remove that export and make it internal only.
> >>>
> >>> Linus pulled the tree, so they are gone now. Btw, there's still some
> >>> bits in the area that confuse me:
> >>
> >> Great!
> >>
> >>> - what's the point of the queue_sync_plugs? It has a lot of comment
> >>> that seem to pre-data the onstack plugging, but except for that
> >>> it's trivial wrapper around blk_flush_plug, with an argument
> >>> that is not used.
> >>
> >> There's really no point to it anymore. It's existance was due to the
> >> older revision that had to track write requests for serializaing around
> >> a barrier. I'll kill it, since we don't do that anymore.
> >>
> >>> - is there a good reason for the existance of __blk_flush_plug? You'd
> >>> get one additional instruction in the inlined version of
> >>> blk_flush_plug when opencoding, but avoid the need for chained
> >>> function calls.
> >>> - Why is having a plug in blk_flush_plug marked unlikely? Note that
> >>> unlikely is the static branch prediction hint to mark the case
> >>> extremly unlikely and is even used for hot/cold partitioning. But
> >>> when we call it we usually check beforehand if we actually have
> >>> plugs, so it's actually likely to happen.
> >>
> >> The existance and out-of-line is for the scheduler() hook. It should be
> >> an unlikely event to schedule with a plug held, normally the plug should
> >> have been explicitly unplugged before that happens.
> >
> > Though if it does, haven't you just added a significant amount of
> > depth to the worst case stack usage? I'm seeing this sort of thing
> > from io_schedule():
> >
> > Depth Size Location (40 entries)
> > ----- ---- --------
> > 0) 4256 16 mempool_alloc_slab+0x15/0x20
> > 1) 4240 144 mempool_alloc+0x63/0x160
> > 2) 4096 16 scsi_sg_alloc+0x4c/0x60
> > 3) 4080 112 __sg_alloc_table+0x66/0x140
> > 4) 3968 32 scsi_init_sgtable+0x33/0x90
> > 5) 3936 48 scsi_init_io+0x31/0xc0
> > 6) 3888 32 scsi_setup_fs_cmnd+0x79/0xe0
> > 7) 3856 112 sd_prep_fn+0x150/0xa90
> > 8) 3744 48 blk_peek_request+0x6a/0x1f0
> > 9) 3696 96 scsi_request_fn+0x60/0x510
> > 10) 3600 32 __blk_run_queue+0x57/0x100
> > 11) 3568 80 flush_plug_list+0x133/0x1d0
> > 12) 3488 32 __blk_flush_plug+0x24/0x50
> > 13) 3456 32 io_schedule+0x79/0x80
> >
> > (This is from a page fault on ext3 that is doing page cache
> > readahead and blocking on a locked buffer.)
> >
> > I've seen traces where mempool_alloc_slab enters direct reclaim
> > which adds another 1.5k of stack usage to this path. So I'm
> > extremely concerned that you've just reduced the stack available to
> > every thread by at least 2.5k of space...
>
> Yeah, that does not look great. If this turns out to be problematic, we
> can turn the queue runs from the unlikely case into out-of-line from
> kblockd.
>
> But this really isn't that new, you could enter the IO dispatch path
> when doing IO already (when submitting it). So we better be able to
> handle that.
The problem I see is that IO is submitted when there's plenty of
stack available and would previously have been fine. However, now it
hits the plug, and then later on, after the thread consumes a lot
more stack, it, say, waits for a completion. We then schedule, it
unplugs the queue, and the IO stack usage is added at a place where there
isn't much space available.
So effectively we are moving the places where stack is consumed
about, and it's completely unpredictable where that stack is going to
land now.
> If it's a problem from the schedule()/io_schedule() path, then
> lets ensure that those are truly unlikely events so we can punt
> them to kblockd.
Rather than wait for an explosion to be reported before doing this,
why not just punt unplugs to kblockd unconditionally?
Cheers,
Dave.
--
Dave Chinner
[email protected]
On 2011-04-12 14:41, Dave Chinner wrote:
> On Tue, Apr 12, 2011 at 02:28:31PM +0200, Jens Axboe wrote:
>> On 2011-04-12 14:22, Dave Chinner wrote:
>>> On Tue, Apr 12, 2011 at 10:36:30AM +0200, Jens Axboe wrote:
>>>> On 2011-04-12 03:12, [email protected] wrote:
>>>>> On Mon, Apr 11, 2011 at 02:48:45PM +0200, Jens Axboe wrote:
>>>>>> Great, once you do that and XFS kills the blk_flush_plug() calls too,
>>>>>> then we can remove that export and make it internal only.
>>>>>
>>>>> Linus pulled the tree, so they are gone now. Btw, there's still some
>>>>> bits in the area that confuse me:
>>>>
>>>> Great!
>>>>
>>>>> - what's the point of the queue_sync_plugs? It has a lot of comment
>>>>> that seem to pre-data the onstack plugging, but except for that
>>>>> it's trivial wrapper around blk_flush_plug, with an argument
>>>>> that is not used.
>>>>
>>>> There's really no point to it anymore. It's existance was due to the
>>>> older revision that had to track write requests for serializaing around
>>>> a barrier. I'll kill it, since we don't do that anymore.
>>>>
>>>>> - is there a good reason for the existance of __blk_flush_plug? You'd
>>>>> get one additional instruction in the inlined version of
>>>>> blk_flush_plug when opencoding, but avoid the need for chained
>>>>> function calls.
>>>>> - Why is having a plug in blk_flush_plug marked unlikely? Note that
>>>>> unlikely is the static branch prediction hint to mark the case
>>>>> extremly unlikely and is even used for hot/cold partitioning. But
>>>>> when we call it we usually check beforehand if we actually have
>>>>> plugs, so it's actually likely to happen.
>>>>
>>>> The existance and out-of-line is for the scheduler() hook. It should be
>>>> an unlikely event to schedule with a plug held, normally the plug should
>>>> have been explicitly unplugged before that happens.
>>>
>>> Though if it does, haven't you just added a significant amount of
>>> depth to the worst case stack usage? I'm seeing this sort of thing
>>> from io_schedule():
>>>
>>> Depth Size Location (40 entries)
>>> ----- ---- --------
>>> 0) 4256 16 mempool_alloc_slab+0x15/0x20
>>> 1) 4240 144 mempool_alloc+0x63/0x160
>>> 2) 4096 16 scsi_sg_alloc+0x4c/0x60
>>> 3) 4080 112 __sg_alloc_table+0x66/0x140
>>> 4) 3968 32 scsi_init_sgtable+0x33/0x90
>>> 5) 3936 48 scsi_init_io+0x31/0xc0
>>> 6) 3888 32 scsi_setup_fs_cmnd+0x79/0xe0
>>> 7) 3856 112 sd_prep_fn+0x150/0xa90
>>> 8) 3744 48 blk_peek_request+0x6a/0x1f0
>>> 9) 3696 96 scsi_request_fn+0x60/0x510
>>> 10) 3600 32 __blk_run_queue+0x57/0x100
>>> 11) 3568 80 flush_plug_list+0x133/0x1d0
>>> 12) 3488 32 __blk_flush_plug+0x24/0x50
>>> 13) 3456 32 io_schedule+0x79/0x80
>>>
>>> (This is from a page fault on ext3 that is doing page cache
>>> readahead and blocking on a locked buffer.)
>>>
>>> I've seen traces where mempool_alloc_slab enters direct reclaim
>>> which adds another 1.5k of stack usage to this path. So I'm
>>> extremely concerned that you've just reduced the stack available to
>>> every thread by at least 2.5k of space...
>>
>> Yeah, that does not look great. If this turns out to be problematic, we
>> can turn the queue runs from the unlikely case into out-of-line from
>> kblockd.
>>
>> But this really isn't that new, you could enter the IO dispatch path
>> when doing IO already (when submitting it). So we better be able to
>> handle that.
>
> The problem I see is that IO is submitted when there's plenty of
> stack available whould have previously been fine. However now it
> hits the plug, and then later on after the thread consumes a lot
> more stack it, say, waits for a completion. We then schedule, it
> unplugs the queue and we add the IO stack to a place where there
> isn't much space available.
>
> So effectively we are moving the places where stack is consumed
> about, and it's complete unpredictable where that stack is going to
> land now.
Isn't that example fairly contrived? If we ended up doing the IO
dispatch before, then the only difference now is the stack usage of
schedule() itself. Apart from that, as far as I can tell, there should
not be much difference.
>> If it's a problem from the schedule()/io_schedule() path, then
>> lets ensure that those are truly unlikely events so we can punt
>> them to kblockd.
>
> Rather than wait for an explosion to be reported before doing this,
> why not just punt unplugs to kblockd unconditionally?
Supposedly it's faster to do it inline rather than punt the dispatch.
But that may actually not be true, if you have multiple plugs going (and
thus multiple contenders for the queue lock on dispatch). So let's play
it safe and punt to kblockd; we can always revisit this later.
diff --git a/block/blk-core.c b/block/blk-core.c
index c6eaa1f..36b1a75 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2665,7 +2665,7 @@ static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
static void queue_unplugged(struct request_queue *q, unsigned int depth)
{
trace_block_unplug_io(q, depth);
- __blk_run_queue(q, false);
+ __blk_run_queue(q, true);
if (q->unplugged_fn)
q->unplugged_fn(q);
--
Jens Axboe
On Tue, Apr 12, 2011 at 02:58:46PM +0200, Jens Axboe wrote:
> On 2011-04-12 14:41, Dave Chinner wrote:
> > On Tue, Apr 12, 2011 at 02:28:31PM +0200, Jens Axboe wrote:
> >> On 2011-04-12 14:22, Dave Chinner wrote:
> >>> On Tue, Apr 12, 2011 at 10:36:30AM +0200, Jens Axboe wrote:
> >>>> On 2011-04-12 03:12, [email protected] wrote:
> >>>>> On Mon, Apr 11, 2011 at 02:48:45PM +0200, Jens Axboe wrote:
> >>>>>> Great, once you do that and XFS kills the blk_flush_plug() calls too,
> >>>>>> then we can remove that export and make it internal only.
> >>>>>
> >>>>> Linus pulled the tree, so they are gone now. Btw, there's still some
> >>>>> bits in the area that confuse me:
> >>>>
> >>>> Great!
> >>>>
> >>>>> - what's the point of the queue_sync_plugs? It has a lot of comment
> >>>>> that seem to pre-data the onstack plugging, but except for that
> >>>>> it's trivial wrapper around blk_flush_plug, with an argument
> >>>>> that is not used.
> >>>>
> >>>> There's really no point to it anymore. It's existance was due to the
> >>>> older revision that had to track write requests for serializaing around
> >>>> a barrier. I'll kill it, since we don't do that anymore.
> >>>>
> >>>>> - is there a good reason for the existance of __blk_flush_plug? You'd
> >>>>> get one additional instruction in the inlined version of
> >>>>> blk_flush_plug when opencoding, but avoid the need for chained
> >>>>> function calls.
> >>>>> - Why is having a plug in blk_flush_plug marked unlikely? Note that
> >>>>> unlikely is the static branch prediction hint to mark the case
> >>>>> extremly unlikely and is even used for hot/cold partitioning. But
> >>>>> when we call it we usually check beforehand if we actually have
> >>>>> plugs, so it's actually likely to happen.
> >>>>
> >>>> The existance and out-of-line is for the scheduler() hook. It should be
> >>>> an unlikely event to schedule with a plug held, normally the plug should
> >>>> have been explicitly unplugged before that happens.
> >>>
> >>> Though if it does, haven't you just added a significant amount of
> >>> depth to the worst case stack usage? I'm seeing this sort of thing
> >>> from io_schedule():
> >>>
> >>> Depth Size Location (40 entries)
> >>> ----- ---- --------
> >>> 0) 4256 16 mempool_alloc_slab+0x15/0x20
> >>> 1) 4240 144 mempool_alloc+0x63/0x160
> >>> 2) 4096 16 scsi_sg_alloc+0x4c/0x60
> >>> 3) 4080 112 __sg_alloc_table+0x66/0x140
> >>> 4) 3968 32 scsi_init_sgtable+0x33/0x90
> >>> 5) 3936 48 scsi_init_io+0x31/0xc0
> >>> 6) 3888 32 scsi_setup_fs_cmnd+0x79/0xe0
> >>> 7) 3856 112 sd_prep_fn+0x150/0xa90
> >>> 8) 3744 48 blk_peek_request+0x6a/0x1f0
> >>> 9) 3696 96 scsi_request_fn+0x60/0x510
> >>> 10) 3600 32 __blk_run_queue+0x57/0x100
> >>> 11) 3568 80 flush_plug_list+0x133/0x1d0
> >>> 12) 3488 32 __blk_flush_plug+0x24/0x50
> >>> 13) 3456 32 io_schedule+0x79/0x80
> >>>
> >>> (This is from a page fault on ext3 that is doing page cache
> >>> readahead and blocking on a locked buffer.)
> >>>
> >>> I've seen traces where mempool_alloc_slab enters direct reclaim
> >>> which adds another 1.5k of stack usage to this path. So I'm
> >>> extremely concerned that you've just reduced the stack available to
> >>> every thread by at least 2.5k of space...
> >>
> >> Yeah, that does not look great. If this turns out to be problematic, we
> >> can turn the queue runs from the unlikely case into out-of-line from
> >> kblockd.
> >>
> >> But this really isn't that new, you could enter the IO dispatch path
> >> when doing IO already (when submitting it). So we better be able to
> >> handle that.
> >
> > The problem I see is that IO is submitted when there's plenty of
> > stack available whould have previously been fine. However now it
> > hits the plug, and then later on after the thread consumes a lot
> > more stack it, say, waits for a completion. We then schedule, it
> > unplugs the queue and we add the IO stack to a place where there
> > isn't much space available.
> >
> > So effectively we are moving the places where stack is consumed
> > about, and it's complete unpredictable where that stack is going to
> > land now.
>
> Isn't that example fairly contrived?
I don't think so. e.g. in the XFS allocation path we do btree block
readahead, then go do the real work. The real work can end up with a
deeper stack before blocking on locks or completions unrelated to
the readahead, leading to schedule() being called and an unplug
being issued at that point. You might think it contrived, but if
you can't provide a guarantee that it can't happen then I have to
assume it will happen.
My concern is that we're already under stack space stress in the
writeback path, so anything that has the potential to increase it
significantly is a major worry from my point of view...
> If we ended up doing the IO
> dispatch before, then the only difference now is the stack usage of
> schedule() itself. Apart from that, as far as I can tell, there should
> not be much difference.
There's a difference between IO submission and IO dispatch. IO
submission is submit_bio thru to the plug; IO dispatch is from the
plug down to the disk. If they happen at the same place, there's no
problem. If IO dispatch is moved to schedule() via a plug....
> >> If it's a problem from the schedule()/io_schedule() path, then
> >> lets ensure that those are truly unlikely events so we can punt
> >> them to kblockd.
> >
> > Rather than wait for an explosion to be reported before doing this,
> > why not just punt unplugs to kblockd unconditionally?
>
> Supposedly it's faster to do it inline rather than punt the dispatch.
> But that may actually not be true, if you have multiple plugs going (and
> thus multiple contenders for the queue lock on dispatch). So lets play
> it safe and punt to kblockd, we can always revisit this later.
It's always best to play it safe when it comes to other people's
data....
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Tue, Apr 12, 2011 at 02:28:31PM +0200, Jens Axboe wrote:
> On 2011-04-12 14:22, Dave Chinner wrote:
> > On Tue, Apr 12, 2011 at 10:36:30AM +0200, Jens Axboe wrote:
> >> On 2011-04-12 03:12, [email protected] wrote:
> >>> On Mon, Apr 11, 2011 at 02:48:45PM +0200, Jens Axboe wrote:
> >>> function calls.
> >>> - Why is having a plug in blk_flush_plug marked unlikely? Note that
> >>> unlikely is the static branch prediction hint to mark the case
> >>> extremly unlikely and is even used for hot/cold partitioning. But
> >>> when we call it we usually check beforehand if we actually have
> >>> plugs, so it's actually likely to happen.
> >>
> >> The existance and out-of-line is for the scheduler() hook. It should be
> >> an unlikely event to schedule with a plug held, normally the plug should
> >> have been explicitly unplugged before that happens.
> >
> > Though if it does, haven't you just added a significant amount of
> > depth to the worst case stack usage? I'm seeing this sort of thing
> > from io_schedule():
> >
> > Depth Size Location (40 entries)
> > ----- ---- --------
> > 0) 4256 16 mempool_alloc_slab+0x15/0x20
> > 1) 4240 144 mempool_alloc+0x63/0x160
> > 2) 4096 16 scsi_sg_alloc+0x4c/0x60
> > 3) 4080 112 __sg_alloc_table+0x66/0x140
> > 4) 3968 32 scsi_init_sgtable+0x33/0x90
> > 5) 3936 48 scsi_init_io+0x31/0xc0
> > 6) 3888 32 scsi_setup_fs_cmnd+0x79/0xe0
> > 7) 3856 112 sd_prep_fn+0x150/0xa90
> > 8) 3744 48 blk_peek_request+0x6a/0x1f0
> > 9) 3696 96 scsi_request_fn+0x60/0x510
> > 10) 3600 32 __blk_run_queue+0x57/0x100
> > 11) 3568 80 flush_plug_list+0x133/0x1d0
> > 12) 3488 32 __blk_flush_plug+0x24/0x50
> > 13) 3456 32 io_schedule+0x79/0x80
> >
> > (This is from a page fault on ext3 that is doing page cache
> > readahead and blocking on a locked buffer.)
FYI, the next step in the allocation chain adds >900 bytes to that
stack:
$ cat /sys/kernel/debug/tracing/stack_trace
Depth Size Location (47 entries)
----- ---- --------
0) 5176 40 zone_statistics+0xad/0xc0
1) 5136 288 get_page_from_freelist+0x2cf/0x840
2) 4848 304 __alloc_pages_nodemask+0x121/0x930
3) 4544 48 kmem_getpages+0x62/0x160
4) 4496 96 cache_grow+0x308/0x330
5) 4400 80 cache_alloc_refill+0x21c/0x260
6) 4320 64 kmem_cache_alloc+0x1b7/0x1e0
7) 4256 16 mempool_alloc_slab+0x15/0x20
8) 4240 144 mempool_alloc+0x63/0x160
9) 4096 16 scsi_sg_alloc+0x4c/0x60
10) 4080 112 __sg_alloc_table+0x66/0x140
11) 3968 32 scsi_init_sgtable+0x33/0x90
12) 3936 48 scsi_init_io+0x31/0xc0
13) 3888 32 scsi_setup_fs_cmnd+0x79/0xe0
14) 3856 112 sd_prep_fn+0x150/0xa90
15) 3744 48 blk_peek_request+0x6a/0x1f0
16) 3696 96 scsi_request_fn+0x60/0x510
17) 3600 32 __blk_run_queue+0x57/0x100
18) 3568 80 flush_plug_list+0x133/0x1d0
19) 3488 32 __blk_flush_plug+0x24/0x50
20) 3456 32 io_schedule+0x79/0x80
That's close to 1800 bytes now, and that's not entering the reclaim
path. If I get one deeper than that, I'll be sure to post it. :)
Cheers,
Dave.
--
Dave Chinner
[email protected]
On 2011-04-12 15:31, Dave Chinner wrote:
> On Tue, Apr 12, 2011 at 02:58:46PM +0200, Jens Axboe wrote:
>> On 2011-04-12 14:41, Dave Chinner wrote:
>>> On Tue, Apr 12, 2011 at 02:28:31PM +0200, Jens Axboe wrote:
>>>> On 2011-04-12 14:22, Dave Chinner wrote:
>>>>> On Tue, Apr 12, 2011 at 10:36:30AM +0200, Jens Axboe wrote:
>>>>>> On 2011-04-12 03:12, [email protected] wrote:
>>>>>>> On Mon, Apr 11, 2011 at 02:48:45PM +0200, Jens Axboe wrote:
>>>>>>>> Great, once you do that and XFS kills the blk_flush_plug() calls too,
>>>>>>>> then we can remove that export and make it internal only.
>>>>>>>
>>>>>>> Linus pulled the tree, so they are gone now. Btw, there's still some
>>>>>>> bits in the area that confuse me:
>>>>>>
>>>>>> Great!
>>>>>>
>>>>>>> - what's the point of the queue_sync_plugs? It has a lot of comment
>>>>>>> that seem to pre-data the onstack plugging, but except for that
>>>>>>> it's trivial wrapper around blk_flush_plug, with an argument
>>>>>>> that is not used.
>>>>>>
>>>>>> There's really no point to it anymore. It's existance was due to the
>>>>>> older revision that had to track write requests for serializaing around
>>>>>> a barrier. I'll kill it, since we don't do that anymore.
>>>>>>
>>>>>>> - is there a good reason for the existance of __blk_flush_plug? You'd
>>>>>>> get one additional instruction in the inlined version of
>>>>>>> blk_flush_plug when opencoding, but avoid the need for chained
>>>>>>> function calls.
>>>>>>> - Why is having a plug in blk_flush_plug marked unlikely? Note that
>>>>>>> unlikely is the static branch prediction hint to mark the case
>>>>>>> extremly unlikely and is even used for hot/cold partitioning. But
>>>>>>> when we call it we usually check beforehand if we actually have
>>>>>>> plugs, so it's actually likely to happen.
>>>>>>
>>>>>> The existance and out-of-line is for the scheduler() hook. It should be
>>>>>> an unlikely event to schedule with a plug held, normally the plug should
>>>>>> have been explicitly unplugged before that happens.
>>>>>
>>>>> Though if it does, haven't you just added a significant amount of
>>>>> depth to the worst case stack usage? I'm seeing this sort of thing
>>>>> from io_schedule():
>>>>>
>>>>> Depth Size Location (40 entries)
>>>>> ----- ---- --------
>>>>> 0) 4256 16 mempool_alloc_slab+0x15/0x20
>>>>> 1) 4240 144 mempool_alloc+0x63/0x160
>>>>> 2) 4096 16 scsi_sg_alloc+0x4c/0x60
>>>>> 3) 4080 112 __sg_alloc_table+0x66/0x140
>>>>> 4) 3968 32 scsi_init_sgtable+0x33/0x90
>>>>> 5) 3936 48 scsi_init_io+0x31/0xc0
>>>>> 6) 3888 32 scsi_setup_fs_cmnd+0x79/0xe0
>>>>> 7) 3856 112 sd_prep_fn+0x150/0xa90
>>>>> 8) 3744 48 blk_peek_request+0x6a/0x1f0
>>>>> 9) 3696 96 scsi_request_fn+0x60/0x510
>>>>> 10) 3600 32 __blk_run_queue+0x57/0x100
>>>>> 11) 3568 80 flush_plug_list+0x133/0x1d0
>>>>> 12) 3488 32 __blk_flush_plug+0x24/0x50
>>>>> 13) 3456 32 io_schedule+0x79/0x80
>>>>>
>>>>> (This is from a page fault on ext3 that is doing page cache
>>>>> readahead and blocking on a locked buffer.)
>>>>>
>>>>> I've seen traces where mempool_alloc_slab enters direct reclaim
>>>>> which adds another 1.5k of stack usage to this path. So I'm
>>>>> extremely concerned that you've just reduced the stack available to
>>>>> every thread by at least 2.5k of space...
>>>>
>>>> Yeah, that does not look great. If this turns out to be problematic, we
>>>> can turn the queue runs from the unlikely case into out-of-line from
>>>> kblockd.
>>>>
>>>> But this really isn't that new, you could enter the IO dispatch path
>>>> when doing IO already (when submitting it). So we better be able to
>>>> handle that.
>>>
>>> The problem I see is that IO is submitted when there's plenty of
>>> stack available whould have previously been fine. However now it
>>> hits the plug, and then later on after the thread consumes a lot
>>> more stack it, say, waits for a completion. We then schedule, it
>>> unplugs the queue and we add the IO stack to a place where there
>>> isn't much space available.
>>>
>>> So effectively we are moving the places where stack is consumed
>>> about, and it's complete unpredictable where that stack is going to
>>> land now.
>>
>> Isn't that example fairly contrived?
>
> I don't think so. e.g. in the XFS allocation path we do btree block
> readahead, then go do the real work. The real work can end up with a
> deeper stack before blocking on locks or completions unrelated to
> the readahead, leading to schedule() being called and an unplug
> being issued at that point. You might think it contrived, but if
> you can't provide a guarantee that it can't happen then I have to
> assume it will happen.
If you ended up in lock_page() somewhere along the way, the path would
have been pretty much the same as it is now:
lock_page()
__lock_page()
__wait_on_bit_lock()
sync_page()
aops->sync_page();
block_sync_page()
__blk_run_backing_dev()
and the dispatch follows after that. If your schedules are only due to,
say, blocking on a mutex, then yes it'll be different. But is that
really the case?
I bet that worst case stack usage is exactly the same as before, and
that's the only metric we really care about.
> My concern is that we're already under stack space stress in the
> writeback path, so anything that has the potential to increase it
> significantly is a major worry from my point of view...
I agree on writeback being a worry, and that's why I made the change
(since it makes sense for other reasons, too). I just don't think we are
worse off than before.
>> If we ended up doing the IO
>> dispatch before, then the only difference now is the stack usage of
>> schedule() itself. Apart from that, as far as I can tell, there should
>> not be much difference.
>
> There's a difference between IO submission and IO dispatch. IO
> submission is submit_bio thru to the plug; IO dispatch is from the
> plug down to the disk. If they happen at the same place, there's no
> problem. If IO dispatch is moved to schedule() via a plug....
The IO submission can easily and non-deterministically turn into an IO
dispatch, so there's no real difference for the submitter. That was the
case before. With the explicit plug now, you _know_ that the IO
submission is only that and doesn't include IO dispatch. Not until you
schedule() or call blk_finish_plug(), both of which are events that you
can control.
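A minimal sketch of that pattern (not taken from the thread; submit_batch() and its arguments are invented for illustration, while the plug API and the 2.6.39-era submit_bio(rw, bio) signature are the real interfaces):

#include <linux/fs.h>
#include <linux/blkdev.h>

/*
 * Illustrative only: submit a batch of read bios under one on-stack plug.
 * Nothing is dispatched until the explicit blk_finish_plug() below (or an
 * intervening schedule(), per the discussion above).
 */
static void submit_batch(struct bio **bios, int nr)
{
        struct blk_plug plug;
        int i;

        blk_start_plug(&plug);                  /* requests collect on current->plug */
        for (i = 0; i < nr; i++)
                submit_bio(READ, bios[i]);      /* submission only, no dispatch yet */
        blk_finish_plug(&plug);                 /* explicit unplug: dispatch happens here */
}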
>>>> If it's a problem from the schedule()/io_schedule() path, then
>>>> lets ensure that those are truly unlikely events so we can punt
>>>> them to kblockd.
>>>
>>> Rather than wait for an explosion to be reported before doing this,
>>> why not just punt unplugs to kblockd unconditionally?
>>
>> Supposedly it's faster to do it inline rather than punt the dispatch.
>> But that may actually not be true, if you have multiple plugs going (and
>> thus multiple contenders for the queue lock on dispatch). So lets play
>> it safe and punt to kblockd, we can always revisit this later.
>
> It's always best to play it safe when it comes to other peoples
> data....
Certainly, but so far I see no real evidence that this is in fact any
safer.
--
Jens Axboe
On 2011-04-12 15:40, Dave Chinner wrote:
> On Tue, Apr 12, 2011 at 02:28:31PM +0200, Jens Axboe wrote:
>> On 2011-04-12 14:22, Dave Chinner wrote:
>>> On Tue, Apr 12, 2011 at 10:36:30AM +0200, Jens Axboe wrote:
>>>> On 2011-04-12 03:12, [email protected] wrote:
>>>>> On Mon, Apr 11, 2011 at 02:48:45PM +0200, Jens Axboe wrote:
>>>>> function calls.
>>>>> - Why is having a plug in blk_flush_plug marked unlikely? Note that
>>>>> unlikely is the static branch prediction hint to mark the case
>>>>> extremly unlikely and is even used for hot/cold partitioning. But
>>>>> when we call it we usually check beforehand if we actually have
>>>>> plugs, so it's actually likely to happen.
>>>>
>>>> The existance and out-of-line is for the scheduler() hook. It should be
>>>> an unlikely event to schedule with a plug held, normally the plug should
>>>> have been explicitly unplugged before that happens.
>>>
>>> Though if it does, haven't you just added a significant amount of
>>> depth to the worst case stack usage? I'm seeing this sort of thing
>>> from io_schedule():
>>>
>>> Depth Size Location (40 entries)
>>> ----- ---- --------
>>> 0) 4256 16 mempool_alloc_slab+0x15/0x20
>>> 1) 4240 144 mempool_alloc+0x63/0x160
>>> 2) 4096 16 scsi_sg_alloc+0x4c/0x60
>>> 3) 4080 112 __sg_alloc_table+0x66/0x140
>>> 4) 3968 32 scsi_init_sgtable+0x33/0x90
>>> 5) 3936 48 scsi_init_io+0x31/0xc0
>>> 6) 3888 32 scsi_setup_fs_cmnd+0x79/0xe0
>>> 7) 3856 112 sd_prep_fn+0x150/0xa90
>>> 8) 3744 48 blk_peek_request+0x6a/0x1f0
>>> 9) 3696 96 scsi_request_fn+0x60/0x510
>>> 10) 3600 32 __blk_run_queue+0x57/0x100
>>> 11) 3568 80 flush_plug_list+0x133/0x1d0
>>> 12) 3488 32 __blk_flush_plug+0x24/0x50
>>> 13) 3456 32 io_schedule+0x79/0x80
>>>
>>> (This is from a page fault on ext3 that is doing page cache
>>> readahead and blocking on a locked buffer.)
>
> FYI, the next step in the allocation chain adds >900 bytes to that
> stack:
>
> $ cat /sys/kernel/debug/tracing/stack_trace
> Depth Size Location (47 entries)
> ----- ---- --------
> 0) 5176 40 zone_statistics+0xad/0xc0
> 1) 5136 288 get_page_from_freelist+0x2cf/0x840
> 2) 4848 304 __alloc_pages_nodemask+0x121/0x930
> 3) 4544 48 kmem_getpages+0x62/0x160
> 4) 4496 96 cache_grow+0x308/0x330
> 5) 4400 80 cache_alloc_refill+0x21c/0x260
> 6) 4320 64 kmem_cache_alloc+0x1b7/0x1e0
> 7) 4256 16 mempool_alloc_slab+0x15/0x20
> 8) 4240 144 mempool_alloc+0x63/0x160
> 9) 4096 16 scsi_sg_alloc+0x4c/0x60
> 10) 4080 112 __sg_alloc_table+0x66/0x140
> 11) 3968 32 scsi_init_sgtable+0x33/0x90
> 12) 3936 48 scsi_init_io+0x31/0xc0
> 13) 3888 32 scsi_setup_fs_cmnd+0x79/0xe0
> 14) 3856 112 sd_prep_fn+0x150/0xa90
> 15) 3744 48 blk_peek_request+0x6a/0x1f0
> 16) 3696 96 scsi_request_fn+0x60/0x510
> 17) 3600 32 __blk_run_queue+0x57/0x100
> 18) 3568 80 flush_plug_list+0x133/0x1d0
> 19) 3488 32 __blk_flush_plug+0x24/0x50
> 20) 3456 32 io_schedule+0x79/0x80
>
> That's close to 1800 bytes now, and that's not entering the reclaim
> path. If i get one deeper than that, I'll be sure to post it. :)
Do you have traces from 2.6.38, or are you just doing them now?
The path you quote above should not go into reclaim, it's a GFP_ATOMIC
allocation.
--
Jens Axboe
On Tue, Apr 12, 2011 at 03:45:52PM +0200, Jens Axboe wrote:
> On 2011-04-12 15:31, Dave Chinner wrote:
> > On Tue, Apr 12, 2011 at 02:58:46PM +0200, Jens Axboe wrote:
> >> On 2011-04-12 14:41, Dave Chinner wrote:
> >> Isn't that example fairly contrived?
> >
> > I don't think so. e.g. in the XFS allocation path we do btree block
> > readahead, then go do the real work. The real work can end up with a
> > deeper stack before blocking on locks or completions unrelated to
> > the readahead, leading to schedule() being called and an unplug
> > being issued at that point. You might think it contrived, but if
> > you can't provide a guarantee that it can't happen then I have to
> > assume it will happen.
>
> If you ended up in lock_page() somewhere along the way, the path would
> have been pretty much the same as it is now:
>
> lock_page()
> __lock_page()
> __wait_on_bit_lock()
> sync_page()
> aops->sync_page();
> block_sync_page()
> __blk_run_backing_dev()
>
> and the dispatch follows after that. If your schedules are only due to,
> say, blocking on a mutex, then yes it'll be different. But is that
> really the case?
XFS metadata IO does not use the page cache anymore, so won't take
that path - no page locks are taken during read or write. Even
before that change contending on page locks was extremely rare as
XFS uses the buffer container for synchronisation.
AFAICT, we have nothing that will cause plugs to be flushed until
scheduling occurs. In many cases it will be at the same points as
before (the explicit flushes XFS had), but there are going to be new
ones....
Like this:
0) 5360 40 zone_statistics+0xad/0xc0
1) 5320 288 get_page_from_freelist+0x2cf/0x840
2) 5032 304 __alloc_pages_nodemask+0x121/0x930
3) 4728 48 kmem_getpages+0x62/0x160
4) 4680 96 cache_grow+0x308/0x330
5) 4584 80 cache_alloc_refill+0x21c/0x260
6) 4504 16 __kmalloc+0x230/0x240
7) 4488 176 virtqueue_add_buf_gfp+0x1f9/0x3e0
8) 4312 144 do_virtblk_request+0x1f3/0x400
9) 4168 32 __blk_run_queue+0x57/0x100
10) 4136 80 flush_plug_list+0x133/0x1d0
11) 4056 32 __blk_flush_plug+0x24/0x50
12) 4024 160 schedule+0x867/0x9f0
13) 3864 208 schedule_timeout+0x1f5/0x2c0
14) 3656 144 wait_for_common+0xe7/0x190
15) 3512 16 wait_for_completion+0x1d/0x20
16) 3496 48 xfs_buf_iowait+0x36/0xb0
17) 3448 32 _xfs_buf_read+0x98/0xa0
18) 3416 48 xfs_buf_read+0xa2/0x100
19) 3368 80 xfs_trans_read_buf+0x1db/0x680
......
This path adds roughly 500 bytes to the previous case of
immediate dispatch of the IO down through _xfs_buf_read()...
> I bet that worst case stack usage is exactly the same as before, and
> that's the only metric we really care about.
I've already demonstrated much worse stack usage with ext3 through
the page fault path via io_schedule(). io_schedule() never used to
dispatch IO and now it does. Similarly there are changes and
increases in XFS stack usage like above. IMO, worst case stack
usage is definitely increased by these changes.
> > My concern is that we're already under stack space stress in the
> > writeback path, so anything that has the potential to increase it
> > significantly is a major worry from my point of view...
>
> I agree on writeback being a worry, and that's why I made the change
> (since it makes sense for other reasons, too). I just don't think we are
> worse of than before.
We certainly are.
Hmmm, I just noticed a new cumulative stack usage path through
direct reclaim - via congestion_wait() -> io_schedule()....
> >> If we ended up doing the IO
> >> dispatch before, then the only difference now is the stack usage of
> >> schedule() itself. Apart from that, as far as I can tell, there should
> >> not be much difference.
> >
> > There's a difference between IO submission and IO dispatch. IO
> > submission is submit_bio thru to the plug; IO dispatch is from the
> > plug down to the disk. If they happen at the same place, there's no
> > problem. If IO dispatch is moved to schedule() via a plug....
>
> The IO submission can easily and non-deterministically turn into an IO
> dispatch, so there's no real difference for the submitter. That was the
> case before. With the explicit plug now, you _know_ that the IO
> submission is only that and doesn't include IO dispatch.
You're violently agreeing with me that you've changed where the IO
dispatch path is run from. ;)
> Not until you
> schedule() or call blk_finish_plug(), both of which are events that you
> can control.
Well, not really - now taking any sleeping lock or waiting on
anything can trigger a plug flush where previously you had to
explicitly issue them. I'm not saying what we had is better, just
that there are implicit flushes with your changes that are
inherently uncontrollable...
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Tue, Apr 12, 2011 at 02:58:46PM +0200, Jens Axboe wrote:
> Supposedly it's faster to do it inline rather than punt the dispatch.
> But that may actually not be true, if you have multiple plugs going (and
> thus multiple contenders for the queue lock on dispatch). So lets play
> it safe and punt to kblockd, we can always revisit this later.
Note that this can be optimized further by adding a new helper that just
queues up work on kblockd without taking the queue lock, e.g. adding a
new
void blk_run_queue_async(struct request_queue *q)
{
if (likely(!blk_queue_stopped(q)))
queue_delayed_work(kblockd_workqueue, &q->delay_work, 0);
}
And replacing all
__blk_run_queue(q, true);
callers with that, at which point they won't need the queuelock any
more.
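For instance, queue_unplugged() from earlier in the thread would then reduce to something like this (a sketch only, assuming the blk_run_queue_async() helper proposed above):

static void queue_unplugged(struct request_queue *q, unsigned int depth)
{
        trace_block_unplug_io(q, depth);
        blk_run_queue_async(q);         /* was: __blk_run_queue(q, true); */

        if (q->unplugged_fn)
                q->unplugged_fn(q);
}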
On 2011-04-12 18:44, [email protected] wrote:
> On Tue, Apr 12, 2011 at 02:58:46PM +0200, Jens Axboe wrote:
>> Supposedly it's faster to do it inline rather than punt the dispatch.
>> But that may actually not be true, if you have multiple plugs going (and
>> thus multiple contenders for the queue lock on dispatch). So lets play
>> it safe and punt to kblockd, we can always revisit this later.
>
> Note that this can be optimized further by adding a new helper that just
> queues up work on kblockd without taking the queue lock, e.g. adding a
> new
>
> void blk_run_queue_async(struct request_queue *q)
> {
> if (likely(!blk_queue_stopped(q)))
> queue_delayed_work(kblockd_workqueue, &q->delay_work, 0);
> }
>
> And replacing all
>
> __blk_run_queue(q, true);
>
> callers with that, at which point they won't need the queuelock any
> more.
I realize that, in fact it's already safe as long as you pass in 'true'
for __blk_run_queue(). Before I had rewritten it to move the running
out, so that makes the trick a little difficult. This afternoon I also
tested it and saw no noticeable difference, but I'll probably just do it
anyway as it makes sense.
--
Jens Axboe
On Tue, Apr 12, 2011 at 10:36:30AM +0200, Jens Axboe wrote:
> The existance and out-of-line is for the scheduler() hook. It should be
> an unlikely event to schedule with a plug held, normally the plug should
> have been explicitly unplugged before that happens.
I still don't think unlikely() is the right thing to do. The static
branch prediction hints cause a real massive slowdown if taken. For
things like this that happen during normal operation you're much better
off leaving the dynamic branch prediction in the CPU predicting what's
going on. And I don't think it's all that unlikely - e.g. for all the
metadata during readpages/writepages schedule/io_schedule will be
the unplugging point right now. I'll see if I can run an I/O workload
with Steve's likely/unlikely profiling turned on.
> > void __blk_flush_plug(struct task_struct *tsk, struct blk_plug *plug)
> > {
> > flush_plug_list(plug);
> > if (plug == tsk->plug)
> > tsk->plug = NULL;
> > tsk->plug = plug;
> > }
> >
> > it would seem much smarted to just call flush_plug_list directly.
> > In fact it seems like the tsk->plug is not nessecary at all and
> > all remaining __blk_flush_plug callers could be replaced with
> > flush_plug_list.
>
> It depends on whether this was an explicit unplug (eg
> blk_finish_plug()), or whether it was an implicit event (eg on
> schedule()). If we do it on schedule(), then we retain the plug after
> the flush. Otherwise we clear it.
blk_finish_plug doesn't go through this codepath.
Here is an untested patch showing how I think the area should look:
diff --git a/block/blk-core.c b/block/blk-core.c
index 90f22cc..6fa5ba1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2668,7 +2668,7 @@ static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
return !(rqa->q <= rqb->q);
}
-static void flush_plug_list(struct blk_plug *plug)
+void blk_flush_plug_list(struct blk_plug *plug)
{
struct request_queue *q;
unsigned long flags;
@@ -2716,29 +2716,16 @@ static void flush_plug_list(struct blk_plug *plug)
BUG_ON(!list_empty(&plug->list));
local_irq_restore(flags);
}
-
-static void __blk_finish_plug(struct task_struct *tsk, struct blk_plug *plug)
-{
- flush_plug_list(plug);
-
- if (plug == tsk->plug)
- tsk->plug = NULL;
-}
+EXPORT_SYMBOL_GPL(blk_flush_plug_list);
void blk_finish_plug(struct blk_plug *plug)
{
- if (plug)
- __blk_finish_plug(current, plug);
+ blk_flush_plug_list(plug);
+ if (plug == current->plug)
+ current->plug = NULL;
}
EXPORT_SYMBOL(blk_finish_plug);
-void __blk_flush_plug(struct task_struct *tsk, struct blk_plug *plug)
-{
- __blk_finish_plug(tsk, plug);
- tsk->plug = plug;
-}
-EXPORT_SYMBOL(__blk_flush_plug);
-
int __init blk_dev_init(void)
{
BUILD_BUG_ON(__REQ_NR_BITS > 8 *
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 32176cc..fa6a4e1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -862,14 +862,14 @@ struct blk_plug {
extern void blk_start_plug(struct blk_plug *);
extern void blk_finish_plug(struct blk_plug *);
-extern void __blk_flush_plug(struct task_struct *, struct blk_plug *);
+extern void blk_flush_plug_list(struct blk_plug *);
static inline void blk_flush_plug(struct task_struct *tsk)
{
struct blk_plug *plug = tsk->plug;
- if (unlikely(plug))
- __blk_flush_plug(tsk, plug);
+ if (plug)
+ blk_flush_plug_list(plug);
}
static inline bool blk_needs_flush_plug(struct task_struct *tsk)
On Tue, Apr 12, 2011 at 06:49:53PM +0200, Jens Axboe wrote:
> I realize that, in fact it's already safe as long as you pass in 'true'
> for __blk_run_queue(). Before I had rewritten it to move the running
> out, so that makes the trick a little difficult. This afternoon I also
> tested it and saw no noticable difference, but I'll probably just do it
> anyway as it makes sense.
We still need the lock for __elv_add_request, so we'll need to keep the
logic anyway. But splitting out the just-queue-to-kblockd case from
__blk_run_queue and giving the latter a sane prototype still sounds
like a good idea to me.
Btw, now that we don't call the request_fn directly any more and thus
can't block, can the unplugging be moved into the preempt notifiers?
On Tue, Apr 12, 2011 at 11:31:17PM +1000, Dave Chinner wrote:
> I don't think so. e.g. in the XFS allocation path we do btree block
> readahead, then go do the real work. The real work can end up with a
> deeper stack before blocking on locks or completions unrelated to
> the readahead, leading to schedule() being called and an unplug
> being issued at that point. You might think it contrived, but if
> you can't provide a guarantee that it can't happen then I have to
> assume it will happen.
In addition to the stack issue, which is a killer, this also has
latency implications. Before we could submit a synchronous metadata
read request inside readpage or writepage and kick it off to the disk
immediately, while now it won't get submitted until we block the next
time, i.e. have done some more work that could have been used for
doing I/O in the background. With the kblockd offload not only have
we spent more time but at the point where we finally kick it we
also need another context switch. It seems like we really need to
go through the filesystems and explicitly flush the plugging queue
for such cases. In fact a bio flag marking things as synchronous
metadata reads would help, but then again we need to clean up our
existing bio flags first..
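A hedged sketch of what such an explicit flush could look like on the filesystem side (the function and both parameters are hypothetical, and it assumes the bio's end_io handler completes 'done'):

#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/completion.h>

/*
 * Hypothetical: issue a synchronous metadata read and force the plugged
 * I/O out immediately instead of leaving it parked until the next schedule().
 */
static void read_meta_sync(struct bio *bio, struct completion *done)
{
        submit_bio(READ_SYNC, bio);
        blk_flush_plug(current);        /* kick any plugged I/O out now */
        wait_for_completion(done);      /* then sleep until the read completes */
}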
On 2011-04-12 18:54, [email protected] wrote:
> On Tue, Apr 12, 2011 at 06:49:53PM +0200, Jens Axboe wrote:
>> I realize that, in fact it's already safe as long as you pass in 'true'
>> for __blk_run_queue(). Before I had rewritten it to move the running
>> out, so that makes the trick a little difficult. This afternoon I also
>> tested it and saw no noticable difference, but I'll probably just do it
>> anyway as it makes sense.
>
> We still need the lock for __elv_add_request, so we'll need to keep the
> logic anyway. But splitting out the just queue to kblockd case from
> __blk_run_queue and giving the latter a sane prototype still sounds
> like a good idea to me.
>
> Btw, now that we don't call the request_fn directly any more and thus
> can't block, can the unplugging be moved into the preempt notifiers?
It was only partly the reason; there's still the issue of getting
notified on preempt (instead of just on schedule) and the runqueue lock
problem. And if we allow preempt, then we need to disable preemption
around all the plug logic.
--
Jens Axboe
On 2011-04-12 18:58, [email protected] wrote:
> On Tue, Apr 12, 2011 at 11:31:17PM +1000, Dave Chinner wrote:
>> I don't think so. e.g. in the XFS allocation path we do btree block
>> readahead, then go do the real work. The real work can end up with a
>> deeper stack before blocking on locks or completions unrelated to
>> the readahead, leading to schedule() being called and an unplug
>> being issued at that point. You might think it contrived, but if
>> you can't provide a guarantee that it can't happen then I have to
>> assume it will happen.
>
> In addition to the stack issue, which is a killer to this also has
> latency implications. Before we could submit a synchronous metadata
> read request inside readpage or writepage and kick it off to the disk
> immediately, while now it won't get submitted until we block the next
> time, i.e. have done some more work that could have been used for
> doing I/O in the background. With the kblockd offload not only have
> we spent more time but at the point where we finally kick it we
> also need another context switch. It seem like we really need to
> go through the filesystems and explicitly flush the plugging queue
> for such cases. In fact a bio flag marking things as synchronous
> metadata reads would help, but then again we need to clean up our
> existing bio flags first..
I think it would be a good idea to audit the SYNC cases, and if feasible
let that retain the 'immediate kick off' logic. If not, have some way to
signal that at least. Essentially allow some fine grained control of
what goes into the plug and what does not.
--
Jens Axboe
On Wed, 13 Apr 2011 00:34:52 +1000 Dave Chinner <[email protected]> wrote:
> On Tue, Apr 12, 2011 at 03:45:52PM +0200, Jens Axboe wrote:
> Not until you
> > schedule() or call blk_finish_plug(), both of which are events that you
> > can control.
>
> Well, not really - now taking any sleeping lock or waiting on
> anything can trigger a plug flush where previously you had to
> explicitly issue them. I'm not saying what we had is better, just
> that there are implicit flushes with your changes that are
> inherently uncontrollable...
It's not just sleeping locks - if preempt is enabled a schedule can happen at
any time - at any depth. I've seen a spin_unlock do it.
NeilBrown
On Tue, Apr 12, 2011 at 03:48:10PM +0200, Jens Axboe wrote:
> On 2011-04-12 15:40, Dave Chinner wrote:
> > On Tue, Apr 12, 2011 at 02:28:31PM +0200, Jens Axboe wrote:
> >> On 2011-04-12 14:22, Dave Chinner wrote:
> >>> On Tue, Apr 12, 2011 at 10:36:30AM +0200, Jens Axboe wrote:
> >>>> On 2011-04-12 03:12, [email protected] wrote:
> >>>>> On Mon, Apr 11, 2011 at 02:48:45PM +0200, Jens Axboe wrote:
> >>>>> function calls.
> >>>>> - Why is having a plug in blk_flush_plug marked unlikely? Note that
> >>>>> unlikely is the static branch prediction hint to mark the case
> >>>>> extremly unlikely and is even used for hot/cold partitioning. But
> >>>>> when we call it we usually check beforehand if we actually have
> >>>>> plugs, so it's actually likely to happen.
> >>>>
> >>>> The existance and out-of-line is for the scheduler() hook. It should be
> >>>> an unlikely event to schedule with a plug held, normally the plug should
> >>>> have been explicitly unplugged before that happens.
> >>>
> >>> Though if it does, haven't you just added a significant amount of
> >>> depth to the worst case stack usage? I'm seeing this sort of thing
> >>> from io_schedule():
> >>>
> >>> Depth Size Location (40 entries)
> >>> ----- ---- --------
> >>> 0) 4256 16 mempool_alloc_slab+0x15/0x20
> >>> 1) 4240 144 mempool_alloc+0x63/0x160
> >>> 2) 4096 16 scsi_sg_alloc+0x4c/0x60
> >>> 3) 4080 112 __sg_alloc_table+0x66/0x140
> >>> 4) 3968 32 scsi_init_sgtable+0x33/0x90
> >>> 5) 3936 48 scsi_init_io+0x31/0xc0
> >>> 6) 3888 32 scsi_setup_fs_cmnd+0x79/0xe0
> >>> 7) 3856 112 sd_prep_fn+0x150/0xa90
> >>> 8) 3744 48 blk_peek_request+0x6a/0x1f0
> >>> 9) 3696 96 scsi_request_fn+0x60/0x510
> >>> 10) 3600 32 __blk_run_queue+0x57/0x100
> >>> 11) 3568 80 flush_plug_list+0x133/0x1d0
> >>> 12) 3488 32 __blk_flush_plug+0x24/0x50
> >>> 13) 3456 32 io_schedule+0x79/0x80
> >>>
> >>> (This is from a page fault on ext3 that is doing page cache
> >>> readahead and blocking on a locked buffer.)
> >
> > FYI, the next step in the allocation chain adds >900 bytes to that
> > stack:
> >
> > $ cat /sys/kernel/debug/tracing/stack_trace
> > Depth Size Location (47 entries)
> > ----- ---- --------
> > 0) 5176 40 zone_statistics+0xad/0xc0
> > 1) 5136 288 get_page_from_freelist+0x2cf/0x840
> > 2) 4848 304 __alloc_pages_nodemask+0x121/0x930
> > 3) 4544 48 kmem_getpages+0x62/0x160
> > 4) 4496 96 cache_grow+0x308/0x330
> > 5) 4400 80 cache_alloc_refill+0x21c/0x260
> > 6) 4320 64 kmem_cache_alloc+0x1b7/0x1e0
> > 7) 4256 16 mempool_alloc_slab+0x15/0x20
> > 8) 4240 144 mempool_alloc+0x63/0x160
> > 9) 4096 16 scsi_sg_alloc+0x4c/0x60
> > 10) 4080 112 __sg_alloc_table+0x66/0x140
> > 11) 3968 32 scsi_init_sgtable+0x33/0x90
> > 12) 3936 48 scsi_init_io+0x31/0xc0
> > 13) 3888 32 scsi_setup_fs_cmnd+0x79/0xe0
> > 14) 3856 112 sd_prep_fn+0x150/0xa90
> > 15) 3744 48 blk_peek_request+0x6a/0x1f0
> > 16) 3696 96 scsi_request_fn+0x60/0x510
> > 17) 3600 32 __blk_run_queue+0x57/0x100
> > 18) 3568 80 flush_plug_list+0x133/0x1d0
> > 19) 3488 32 __blk_flush_plug+0x24/0x50
> > 20) 3456 32 io_schedule+0x79/0x80
> >
> > That's close to 1800 bytes now, and that's not entering the reclaim
> > path. If i get one deeper than that, I'll be sure to post it. :)
>
> Do you have traces from 2.6.38, or are you just doing them now?
I do stack checks like this all the time. I generally don't keep
them around, just pay attention to the path and depth. ext3 is used
for / on my test VMs, and has never shown up as the worst case stack
usage when running xfstests. As of the block plugging code, this
trace is the top stack user for the first ~130 tests, and often for
the entire test run on XFS....
> The path you quote above should not go into reclaim, it's a GFP_ATOMIC
> allocation.
Right. I'm still trying to produce a trace that shows more stack
usage in the block layer. It's random chance as to what pops up most
of the time. However, some of the stacks that are showing up in
2.6.39 are quite different from any I've ever seen before...
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Tue, Apr 12, 2011 at 2:08 PM, NeilBrown <[email protected]> wrote:
> On Wed, 13 Apr 2011 00:34:52 +1000 Dave Chinner <[email protected]> wrote:
>>
>> Well, not really - now taking any sleeping lock or waiting on
>> anything can trigger a plug flush where previously you had to
>> explicitly issue them. I'm not saying what we had is better, just
>> that there are implicit flushes with your changes that are
>> inherently uncontrollable...
>
> It's not just sleeping locks - if preempt is enabled a schedule can happen at
> any time - at any depth. I've seen a spin_unlock do it.
Hmm. I don't think we should flush IO in the preemption path. That
smells wrong on many levels, just one of them being the "any time, any
depth".
It also sounds really wrong from an IO pattern standpoint. The process
is actually still running, and the IO flushing _already_ does the
"only if it's going to sleep" test, but it actually does it _wrong_.
The "current->state" check doesn't make sense for a preemption event,
because it's not actually going to sleep there.
So a patch like the attached (UNTESTED!) sounds like the right thing to do.
Whether it makes any difference for any MD issues, who knows.. But
considering that the unplugging already used to test for "prev->state
!= TASK_RUNNING", this is absolutely the right thing to do - that old
test was just broken.
Linus
On Tue, 2011-04-12 at 19:23 -0700, Linus Torvalds wrote:
> kernel/sched.c | 20 ++++++++++----------
> 1 files changed, 10 insertions(+), 10 deletions(-)
>
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 48013633d792..a187c3fe027b 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -4111,20 +4111,20 @@ need_resched:
> try_to_wake_up_local(to_wakeup);
> }
> deactivate_task(rq, prev, DEQUEUE_SLEEP);
> +
> + /*
> + * If we are going to sleep and we have plugged IO queued, make
> + * sure to submit it to avoid deadlocks.
> + */
> + if (blk_needs_flush_plug(prev)) {
> + raw_spin_unlock(&rq->lock);
> + blk_flush_plug(prev);
> + raw_spin_lock(&rq->lock);
> + }
> }
> switch_count = &prev->nvcsw;
> }
>
> - /*
> - * If we are going to sleep and we have plugged IO queued, make
> - * sure to submit it to avoid deadlocks.
> - */
> - if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) {
> - raw_spin_unlock(&rq->lock);
> - blk_flush_plug(prev);
> - raw_spin_lock(&rq->lock);
> - }
> -
> pre_schedule(rq, prev);
>
> if (unlikely(!rq->nr_running))
Right, that cures the preemption problem. The reason I suggested placing
it where it was is that I'd like to keep all things that release
rq->lock in the middle of schedule() in one place, but I guess we can
cure that with some extra comments.
On 2011-04-13 13:12, Peter Zijlstra wrote:
> On Tue, 2011-04-12 at 19:23 -0700, Linus Torvalds wrote:
>> kernel/sched.c | 20 ++++++++++----------
>> 1 files changed, 10 insertions(+), 10 deletions(-)
>>
>> diff --git a/kernel/sched.c b/kernel/sched.c
>> index 48013633d792..a187c3fe027b 100644
>> --- a/kernel/sched.c
>> +++ b/kernel/sched.c
>> @@ -4111,20 +4111,20 @@ need_resched:
>> try_to_wake_up_local(to_wakeup);
>> }
>> deactivate_task(rq, prev, DEQUEUE_SLEEP);
>> +
>> + /*
>> + * If we are going to sleep and we have plugged IO queued, make
>> + * sure to submit it to avoid deadlocks.
>> + */
>> + if (blk_needs_flush_plug(prev)) {
>> + raw_spin_unlock(&rq->lock);
>> + blk_flush_plug(prev);
>> + raw_spin_lock(&rq->lock);
>> + }
>> }
>> switch_count = &prev->nvcsw;
>> }
>>
>> - /*
>> - * If we are going to sleep and we have plugged IO queued, make
>> - * sure to submit it to avoid deadlocks.
>> - */
>> - if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) {
>> - raw_spin_unlock(&rq->lock);
>> - blk_flush_plug(prev);
>> - raw_spin_lock(&rq->lock);
>> - }
>> -
>> pre_schedule(rq, prev);
>>
>> if (unlikely(!rq->nr_running))
>
> Right, that cures the preemption problem. The reason I suggested placing
> it where it was is that I'd like to keep all things that release
> rq->lock in the middle of schedule() in one place, but I guess we can
> cure that with some extra comments.
We definitely only want to do it on going to sleep, not preempt events.
So if you are fine with this change, then let's please do that.
Linus, I've got a few other things queued up in the area, I'll add this
and send them off soon. Or feel free to add this one yourself, since you
already did it.
--
Jens Axboe
On Wed, 2011-04-13 at 13:23 +0200, Jens Axboe wrote:
> We definitely only want to do it on going to sleep, not preempt events.
> So if you are fine with this change, then lets please do that.
Here's the Acked-by: Peter Zijlstra <[email protected]>, that goes
with it ;-)
> Linus, I've got a few other things queued up in the area, I'll add this
> and send them off soon. Or feel free to add this one yourself, since you
> already did it.
Right, please send it onwards or have Linus commit it himself and I'll
cook up a patch clarifying the rq->lock'ing mess around there.
On Wed, Apr 13, 2011 at 4:23 AM, Jens Axboe <[email protected]> wrote:
>
> Linus, I've got a few other things queued up in the area, I'll add this
> and send them off soon. Or feel free to add this one yourself, since you
> already did it.
Ok, I committed it with Peter's and your acks.
And if you already put it in your git tree too, git will merge it.
Linus
On 2011-04-13 17:13, Linus Torvalds wrote:
> On Wed, Apr 13, 2011 at 4:23 AM, Jens Axboe <[email protected]> wrote:
>>
>> Linus, I've got a few other things queued up in the area, I'll add this
>> and send them off soon. Or feel free to add this one yourself, since you
>> already did it.
>
> Ok, I committed it with Peter's and your acks.
Great, thanks.
> And if you already put it in your git tree too, git will merge it.
I did not, I had a feeling you'd merge this one.
--
Jens Axboe
Btw, "block: move queue run on unplug to kblockd" currently moves
the __blk_run_queue call to kblockd unconditionally currently. But
I'm not sure that's correct - if we do an explicit blk_finish_plug
there's no point in forcing the context switch.
On 2011-04-15 06:26, [email protected] wrote:
> Btw, "block: move queue run on unplug to kblockd" currently moves
> the __blk_run_queue call to kblockd unconditionally currently. But
> I'm not sure that's correct - if we do an explicit blk_finish_plug
> there's no point in forcing the context switch.
It's correct, but yes it's not optimal for the explicit unplug. Well I
think it really depends - for the single sync case, it's not ideal to
punt to kblockd. But if you have a bunch of threads doing IO, you
probably DO want to punt it to kblockd to avoid too many threads
hammering on the queue lock at the same time. Would need testing to be
sure; the below would be a way to accomplish that.
diff --git a/block/blk-core.c b/block/blk-core.c
index b598fa7..995e995 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2662,16 +2662,16 @@ static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
return !(rqa->q <= rqb->q);
}
-static void queue_unplugged(struct request_queue *q, unsigned int depth)
+static void queue_unplugged(struct request_queue *q, unsigned int depth, bool run_from_wq)
{
trace_block_unplug_io(q, depth);
- __blk_run_queue(q, true);
+ __blk_run_queue(q, run_from_wq);
if (q->unplugged_fn)
q->unplugged_fn(q);
}
-void blk_flush_plug_list(struct blk_plug *plug)
+void blk_flush_plug_list(struct blk_plug *plug, bool run_from_wq)
{
struct request_queue *q;
unsigned long flags;
@@ -2706,7 +2706,7 @@ void blk_flush_plug_list(struct blk_plug *plug)
BUG_ON(!rq->q);
if (rq->q != q) {
if (q) {
- queue_unplugged(q, depth);
+ queue_unplugged(q, depth, run_from_wq);
spin_unlock(q->queue_lock);
}
q = rq->q;
@@ -2727,7 +2727,7 @@ void blk_flush_plug_list(struct blk_plug *plug)
}
if (q) {
- queue_unplugged(q, depth);
+ queue_unplugged(q, depth, run_from_wq);
spin_unlock(q->queue_lock);
}
@@ -2737,7 +2737,7 @@ EXPORT_SYMBOL(blk_flush_plug_list);
void blk_finish_plug(struct blk_plug *plug)
{
- blk_flush_plug_list(plug);
+ blk_flush_plug_list(plug, false);
if (plug == current->plug)
current->plug = NULL;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ffe48ff..1c76506 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -865,14 +865,14 @@ struct blk_plug {
extern void blk_start_plug(struct blk_plug *);
extern void blk_finish_plug(struct blk_plug *);
-extern void blk_flush_plug_list(struct blk_plug *);
+extern void blk_flush_plug_list(struct blk_plug *, bool);
static inline void blk_flush_plug(struct task_struct *tsk)
{
struct blk_plug *plug = tsk->plug;
if (plug)
- blk_flush_plug_list(plug);
+ blk_flush_plug_list(plug, true);
}
static inline bool blk_needs_flush_plug(struct task_struct *tsk)
--
Jens Axboe
On Mon, 11 Apr 2011 14:11:58 +0200 Jens Axboe <[email protected]> wrote:
> > Yes. But I need to know when to release the requests that I have stored.
> > I need to know when ->write_pages or ->read_pages or whatever has finished
> > submitting a pile of pages so that I can start processing the request that I
> > have put aside. So I need a callback from blk_finish_plug.
>
> OK fair enough, I'll add your callback patch.
>
But you didn't, did you? You added a completely different patch which is
completely pointless.
If you don't like my patch I would really prefer you said so rather than
silently replace it with something completely different (and broken).
I'll try to explain again.
md does not use __make_request. At all.
md does not use 'struct request'. At all.
The 'list' in 'struct blk_plug' is a list of 'struct request'.
Therefore md cannot put anything useful on the list in 'struct blk_plug'.
So when blk_flush_plug_list calls queue_unplugged() on a queue that belonged
to a request found on the blk_plug list, that queue cannot possibly ever be
for an 'md' device (because no 'struct request' ever belongs to an md device,
because md does not use 'struct request').
So your patch (commit f75664570d8b) doesn't help MD at all.
For md, I need to attach something to blk_plug which somehow identifies an md
device, so that blk_finish_plug can get to that device and let it unplug.
The most sensible thing to have is a completely generic callback. That way
different block devices (which choose not to use __make_request) can attach
different sorts of things to blk_plug.
So can we please have my original patch applied? (Revised version using
list_splice_init included below).
Or if not, a clear explanation of why not?
Thanks,
NeilBrown
From 6a2aa888b855fd298c174bcee130cf43db0b3f7b Mon Sep 17 00:00:00 2001
From: NeilBrown <[email protected]>
Date: Mon, 18 Apr 2011 08:15:45 +1000
Subject: [PATCH] Enhance new plugging support to support general callbacks.
md/raid requires an unplug callback, but as it does not use
requests the current code cannot provide one.
So allow arbitrary callbacks to be attached to the blk_plug.
Cc: Jens Axboe <[email protected]>
Signed-off-by: NeilBrown <[email protected]>
---
block/blk-core.c | 20 ++++++++++++++++++++
include/linux/blkdev.h | 7 ++++++-
2 files changed, 26 insertions(+), 1 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 78b7b0c..c2b8006 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2638,6 +2638,7 @@ void blk_start_plug(struct blk_plug *plug)
plug->magic = PLUG_MAGIC;
INIT_LIST_HEAD(&plug->list);
+ INIT_LIST_HEAD(&plug->cb_list);
plug->should_sort = 0;
/*
@@ -2742,9 +2743,28 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
}
EXPORT_SYMBOL(blk_flush_plug_list);
+static void flush_plug_callbacks(struct blk_plug *plug)
+{
+ LIST_HEAD(callbacks);
+
+ if (list_empty(&plug->cb_list))
+ return;
+
+ list_splice_init(&plug->cb_list, &callbacks);
+
+ while (!list_empty(&callbacks)) {
+ struct blk_plug_cb *cb = list_first_entry(&callbacks,
+ struct blk_plug_cb,
+ list);
+ list_del(&cb->list);
+ cb->callback(cb);
+ }
+}
+
void blk_finish_plug(struct blk_plug *plug)
{
blk_flush_plug_list(plug, false);
+ flush_plug_callbacks(plug);
if (plug == current->plug)
current->plug = NULL;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ec0357d..f3f7879 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -860,8 +860,13 @@ extern void blk_put_queue(struct request_queue *);
struct blk_plug {
unsigned long magic;
struct list_head list;
+ struct list_head cb_list;
unsigned int should_sort;
};
+struct blk_plug_cb {
+ struct list_head list;
+ void (*callback)(struct blk_plug_cb *);
+};
extern void blk_start_plug(struct blk_plug *);
extern void blk_finish_plug(struct blk_plug *);
@@ -887,7 +892,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
{
struct blk_plug *plug = tsk->plug;
- return plug && !list_empty(&plug->list);
+ return plug && (!list_empty(&plug->list) || !list_empty(&plug->cb_list));
}
/*
--
1.7.3.4
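To make the intended use concrete, here is a hedged sketch (not part of the patch) of how a make_request-based driver could hook into this interface; struct my_dev, my_dev_release_pending() and the function names are invented stand-ins for the driver's own state:

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/sched.h>

struct my_dev;                                  /* driver-private device state */
void my_dev_release_pending(struct my_dev *dev);/* dispatches I/O held back while plugged */

struct my_plug_cb {
        struct blk_plug_cb cb;                  /* embedded; recovered via container_of() */
        struct my_dev *dev;
};

static void my_unplug(struct blk_plug_cb *cb)
{
        struct my_plug_cb *mcb = container_of(cb, struct my_plug_cb, cb);

        my_dev_release_pending(mcb->dev);
        kfree(mcb);
}

/* Called from the driver's make_request function while it is holding back I/O.
 * (A real driver would also avoid queueing duplicate callbacks per plug.) */
static void my_note_plugged_io(struct my_dev *dev)
{
        struct blk_plug *plug = current->plug;
        struct my_plug_cb *mcb;

        if (!plug)
                return;                         /* no plug active */

        mcb = kmalloc(sizeof(*mcb), GFP_ATOMIC);
        if (!mcb)
                return;

        mcb->cb.callback = my_unplug;
        mcb->dev = dev;
        list_add(&mcb->cb.list, &plug->cb_list);/* run by blk_finish_plug() */
}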
On Mon, 18 Apr 2011 08:19:22 +1000 NeilBrown <[email protected]> wrote:
> So can we please have my original patch applied? (Revised version using
> list_splice_init included below).
I hadn't adjusted that one properly for the recent code shuffling.
This one is actually tested...
Thanks,
NeilBrown
From 325b1c12b6165002022bd7b599f95c0331491cb3 Mon Sep 17 00:00:00 2001
From: NeilBrown <[email protected]>
Date: Mon, 18 Apr 2011 14:06:05 +1000
Subject: [PATCH] Enhance new plugging support to support general callbacks.
md/raid requires an unplug callback, but as it does not use
requests the current code cannot provide one.
So allow arbitrary callbacks to be attached to the blk_plug.
Cc: Jens Axboe <[email protected]>
Signed-off-by: NeilBrown <[email protected]>
---
block/blk-core.c | 20 ++++++++++++++++++++
include/linux/blkdev.h | 7 ++++++-
2 files changed, 26 insertions(+), 1 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 78b7b0c..77edf05 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2638,6 +2638,7 @@ void blk_start_plug(struct blk_plug *plug)
plug->magic = PLUG_MAGIC;
INIT_LIST_HEAD(&plug->list);
+ INIT_LIST_HEAD(&plug->cb_list);
plug->should_sort = 0;
/*
@@ -2678,6 +2679,24 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
q->unplugged_fn(q);
}
+static void flush_plug_callbacks(struct blk_plug *plug)
+{
+ LIST_HEAD(callbacks);
+
+ if (list_empty(&plug->cb_list))
+ return;
+
+ list_splice_init(&plug->cb_list, &callbacks);
+
+ while (!list_empty(&callbacks)) {
+ struct blk_plug_cb *cb = list_first_entry(&callbacks,
+ struct blk_plug_cb,
+ list);
+ list_del(&cb->list);
+ cb->callback(cb);
+ }
+}
+
void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
struct request_queue *q;
@@ -2688,6 +2707,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
BUG_ON(plug->magic != PLUG_MAGIC);
+ flush_plug_callbacks(plug);
if (list_empty(&plug->list))
return;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ec0357d..f3f7879 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -860,8 +860,13 @@ extern void blk_put_queue(struct request_queue *);
struct blk_plug {
unsigned long magic;
struct list_head list;
+ struct list_head cb_list;
unsigned int should_sort;
};
+struct blk_plug_cb {
+ struct list_head list;
+ void (*callback)(struct blk_plug_cb *);
+};
extern void blk_start_plug(struct blk_plug *);
extern void blk_finish_plug(struct blk_plug *);
@@ -887,7 +892,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
{
struct blk_plug *plug = tsk->plug;
- return plug && !list_empty(&plug->list);
+ return plug && (!list_empty(&plug->list) || !list_empty(&plug->cb_list));
}
/*
--
1.7.3.4
On 2011-04-18 00:19, NeilBrown wrote:
> On Mon, 11 Apr 2011 14:11:58 +0200 Jens Axboe <[email protected]> wrote:
>
>>> Yes. But I need to know when to release the requests that I have stored.
>>> I need to know when ->write_pages or ->read_pages or whatever has finished
>>> submitting a pile of pages so that I can start processing the request that I
>>> have put aside. So I need a callback from blk_finish_plug.
>>
>> OK fair enough, I'll add your callback patch.
>>
>
> But you didn't did you? You added a completely different patch which is
> completely pointless.
> If you don't like my patch I would really prefer you said so rather than
> silently replace it with something completely different (and broken).
First of all, you were CC'ed on all that discussion, yet didn't speak up
until now. This was last week. Secondly, please change your tone.
> I'll try to explain again.
>
> md does not use __make_request. At all.
> md does not use 'struct request'. At all.
>
> The 'list' in 'struct blk_plug' is a list of 'struct request'.
I'm well aware of these facts, but thanks for bringing it up.
> Therefore md cannot put anything useful on the list in 'struct blk_plug'.
>
> So when blk_flush_plug_list calls queue_unplugged() on a queue that belonged
> to a request found on the blk_plug list, that queue cannot possibly ever be
> for an 'md' device (because no 'struct request' ever belongs to an md device,
> because md doesn't not use 'struct request').
>
> So your patch (commit f75664570d8b) doesn't help MD at all.
>
> For md, I need to attach something to blk_plug which somehow identifies an md
> device, so that blk_finish_plug can get to that device and let it unplug.
> The most sensible thing to have is a completely generic callback. That way
> different block devices (which choose not to use __make_request) can attach
> different sorts of things to blk_plug.
>
> So can we please have my original patch applied? (Revised version using
> list_splice_init included below).
>
> Or if not, a clear explanation of why not?
So correct me if I'm wrong here, but the _only_ real difference between
this patch and the current code in the tree is the check of the callback
list indicating a need to flush the callbacks. And that's definitely an
oversight. It should be functionally equivalent if md would just flag
this need to get a callback, e.g. instead of queueing a callback on the
list, just set plug->need_unplug from md and have
blk_needs_flush_plug() do:
return plug && (!list_empty(&plug->list) || plug->need_unplug);
instead. Something like the below, completely untested.
diff --git a/block/blk-core.c b/block/blk-core.c
index 78b7b0c..e1f5635 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1305,12 +1305,12 @@ get_rq:
*/
if (list_empty(&plug->list))
trace_block_plug(q);
- else if (!plug->should_sort) {
+ else if (!(plug->flags & BLK_PLUG_F_SORT)) {
struct request *__rq;
__rq = list_entry_rq(plug->list.prev);
if (__rq->q != q)
- plug->should_sort = 1;
+ plug->flags |= BLK_PLUG_F_SORT;
}
/*
* Debug flag, kill later
@@ -2638,7 +2638,7 @@ void blk_start_plug(struct blk_plug *plug)
plug->magic = PLUG_MAGIC;
INIT_LIST_HEAD(&plug->list);
- plug->should_sort = 0;
+ plug->flags = 0;
/*
* If this is a nested plug, don't actually assign it. It will be
@@ -2693,9 +2693,9 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
list_splice_init(&plug->list, &list);
- if (plug->should_sort) {
+ if (plug->flags & BLK_PLUG_F_SORT) {
list_sort(NULL, &list, plug_rq_cmp);
- plug->should_sort = 0;
+ plug->flags &= ~BLK_PLUG_F_SORT;
}
q = NULL;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ec0357d..1a0b76b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -860,7 +860,12 @@ extern void blk_put_queue(struct request_queue *);
struct blk_plug {
unsigned long magic;
struct list_head list;
- unsigned int should_sort;
+ unsigned int flags;
+};
+
+enum {
+ BLK_PLUG_F_SORT = 1,
+ BLK_PLUG_F_NEED_UNPLUG = 2,
};
extern void blk_start_plug(struct blk_plug *);
@@ -887,7 +892,8 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
{
struct blk_plug *plug = tsk->plug;
- return plug && !list_empty(&plug->list);
+ return plug && (!list_empty(&plug->list) ||
+ (plug->flags & BLK_PLUG_F_NEED_UNPLUG));
}
/*
--
Jens Axboe
On Mon, 18 Apr 2011 08:38:24 +0200 Jens Axboe <[email protected]> wrote:
> On 2011-04-18 00:19, NeilBrown wrote:
> > On Mon, 11 Apr 2011 14:11:58 +0200 Jens Axboe <[email protected]> wrote:
> >
> >>> Yes. But I need to know when to release the requests that I have stored.
> >>> I need to know when ->write_pages or ->read_pages or whatever has finished
> >>> submitting a pile of pages so that I can start processing the request that I
> >>> have put aside. So I need a callback from blk_finish_plug.
> >>
> >> OK fair enough, I'll add your callback patch.
> >>
> >
> > But you didn't, did you? You added a completely different patch which is
> > completely pointless.
> > If you don't like my patch I would really prefer you said so rather than
> > silently replace it with something completely different (and broken).
>
> First of all, you were CC'ed on all that discussion, yet didn't speak up
> until now. This was last week. Secondly, please change your tone.
Yes, I was CC'ed on a discussion. In that discussion it was never mentioned
that you had completely changed the patch I sent you, and it never contained
the new patch in-line for review. Nothing that was discussed was
particularly relevant to md's needs so there was nothing to speak up about.
Yes- there were 'git pull' requests and I could have done a pull myself to
review the code but there seemed to be no urgency because you had already
agreed to apply my patch.
When I did finally pull the patches (after all the other issues had settled
down and I had time to finish off the RAID side) I found ... what I found.
I apologise for my tone, but I was very frustrated.
>
> > I'll try to explain again.
> >
> > md does not use __make_request. At all.
> > md does not use 'struct request'. At all.
> >
> > The 'list' in 'struct blk_plug' is a list of 'struct request'.
>
> I'm well aware of these facts, but thanks for bringing it up.
>
> > Therefore md cannot put anything useful on the list in 'struct blk_plug'.
> >
> > So when blk_flush_plug_list calls queue_unplugged() on a queue that belonged
> > to a request found on the blk_plug list, that queue cannot possibly ever be
> > for an 'md' device (because no 'struct request' ever belongs to an md device,
> > because md does not use 'struct request').
> >
> > So your patch (commit f75664570d8b) doesn't help MD at all.
> >
> > For md, I need to attach something to blk_plug which somehow identifies an md
> > device, so that blk_finish_plug can get to that device and let it unplug.
> > The most sensible thing to have is a completely generic callback. That way
> > different block devices (which choose not to use __make_request) can attach
> > different sorts of things to blk_plug.
> >
> > So can we please have my original patch applied? (Revised version using
> > list_splice_init included below).
> >
> > Or if not, a clear explanation of why not?
>
> So correct me if I'm wrong here, but the _only_ real difference between
> this patch and the current code in the tree, is the checking of the
> callback list indicating a need to flush the callbacks. And that's
> definitely an oversight. It should be functionally equivalent if md
> would just flag this need to get a callback, eg instead of queueing a
> callback on the list, set plug->need_unplug from md and have
> blk_needs_flush_plug() do:
>
> return plug && (!list_empty(&plug->list) || plug->need_unplug);
>
> instead. Something like the below, completely untested.
>
No, that is not the only real difference.
The real difference is that in the current code, md has no way to register
anything with a blk_plug because you can only register a 'struct request' on a
blk_plug, and md doesn't make any use of 'struct request'.
As I said in the Email you quote above:
> > Therefore md cannot put anything useful on the list in 'struct blk_plug'.
That is the heart of the problem.
NeilBrown
On 2011-04-18 09:25, NeilBrown wrote:
> On Mon, 18 Apr 2011 08:38:24 +0200 Jens Axboe <[email protected]> wrote:
>
>> On 2011-04-18 00:19, NeilBrown wrote:
>>> On Mon, 11 Apr 2011 14:11:58 +0200 Jens Axboe <[email protected]> wrote:
>>>
>>>>> Yes. But I need to know when to release the requests that I have stored.
>>>>> I need to know when ->write_pages or ->read_pages or whatever has finished
>>>>> submitting a pile of pages so that I can start processing the request that I
>>>>> have put aside. So I need a callback from blk_finish_plug.
>>>>
>>>> OK fair enough, I'll add your callback patch.
>>>>
>>>
>>> But you didn't, did you? You added a completely different patch which is
>>> completely pointless.
>>> If you don't like my patch I would really prefer you said so rather than
>>> silently replace it with something completely different (and broken).
>>
>> First of all, you were CC'ed on all that discussion, yet didn't speak up
>> until now. This was last week. Secondly, please change your tone.
>
> Yes, I was CC'ed on a discussion. In that discussion it was never mentioned
> that you had completely changed the patch I sent you, and it never contained
> the new patch in-line for review. Nothing that was discussed was
> particularly relevant to md's needs so there was nothing to speak up about.
>
> Yes- there were 'git pull' requests and I could have done a pull myself to
> review the code but there seemed to be no urgency because you had already
> agreed to apply my patch.
> When I did finally pull the patches (after all the other issues had settled
> down and I had time to finish off the RAID side) I found ... what I found.
>
> I apologise for my tone, but I was very frustrated.
>
>>
>>> I'll try to explain again.
>>>
>>> md does not use __make_request. At all.
>>> md does not use 'struct request'. At all.
>>>
>>> The 'list' in 'struct blk_plug' is a list of 'struct request'.
>>
>> I'm well aware of these facts, but thanks for bringing it up.
>>
>>> Therefore md cannot put anything useful on the list in 'struct blk_plug'.
>>>
>>> So when blk_flush_plug_list calls queue_unplugged() on a queue that belonged
>>> to a request found on the blk_plug list, that queue cannot possibly ever be
>>> for an 'md' device (because no 'struct request' ever belongs to an md device,
>>> because md does not use 'struct request').
>>>
>>> So your patch (commit f75664570d8b) doesn't help MD at all.
>>>
>>> For md, I need to attach something to blk_plug which somehow identifies an md
>>> device, so that blk_finish_plug can get to that device and let it unplug.
>>> The most sensible thing to have is a completely generic callback. That way
>>> different block devices (which choose not to use __make_request) can attach
>>> different sorts of things to blk_plug.
>>>
>>> So can we please have my original patch applied? (Revised version using
>>> list_splice_init included below).
>>>
>>> Or if not, a clear explanation of why not?
>>
>> So correct me if I'm wrong here, but the _only_ real difference between
>> this patch and the current code in the tree, is the checking of the
>> callback list indicating a need to flush the callbacks. And that's
>> definitely an oversight. It should be functionally equivalent if md
>> would just flag this need to get a callback, eg instead of queueing a
>> callback on the list, set plug->need_unplug from md and have
>> blk_needs_flush_plug() do:
>>
>> return plug && (!list_empty(&plug->list) || plug->need_unplug);
>>
>> instead. Something like the below, completely untested.
>>
>
> No, that is not the only real difference.
>
> The real difference is that in the current code, md has no way to register
> anything with a blk_plug because you can only register a 'struct request' on a
> blk_plug, and md doesn't make any use of 'struct request'.
>
> As I said in the Email you quote above:
>
>>> Therefore md cannot put anything useful on the list in 'struct blk_plug'.
>
> That is the heart of the problem.
Hmm, I don't really see a way to avoid the list in that case. You really
do need some way to queue items, a single callback or flag or pointer
will not suffice.
I've added the patch and removed the (now) useless ->unplugged_fn
callback. I suggest you base your md changes on top of my for-linus
branch and tell me when you are confident it looks good, then I'll pull
in your MD changes and submit them later today.
OK with you?
--
Jens Axboe
[[NOTE to dm-devel people - one of the patches here removes some
now-unused code from dm-raid.c plus a declaration from device-mapper.h ]]
On Mon, 18 Apr 2011 10:10:18 +0200 Jens Axboe <[email protected]> wrote:
> On 2011-04-18 09:25, NeilBrown wrote:
> >>> Therefore md cannot put anything useful on the list in 'struct blk_plug'.
> >
> > That is the heart of the problem.
>
> Hmm, I don't really see a way to avoid the list in that case. You really
> do need some way to queue items, a single callback or flag or pointer
> will not suffice.
>
> I've added the patch and removed the (now) useless ->unplugged_fn
> callback. I suggest you base your md changes on top of my for-linus
> branch and tell me when you are confident it looks good, then I'll pull
> in your MD changes and submit them later today.
>
> OK with you?
>
Yes, that's perfect. Thanks.
All of my plugging-related patches are now in a 'for-jens' branch:
The following changes since commit 99e22598e9a8e0a996d69c8c0f6b7027cb57720a:
block: drop queue lock before calling __blk_run_queue() for kblockd punt (2011-04-18 09:59:55 +0200)
are available in the git repository at:
git://neil.brown.name/md for-jens
NeilBrown (6):
md: use new plugging interface for RAID IO.
md/dm - remove remains of plug_fn callback.
md - remove old plugging code.
md: provide generic support for handling unplug callbacks.
md: incorporate new plugging into raid5.
md: fix up raid1/raid10 unplugging.
drivers/md/dm-raid.c | 8 ----
drivers/md/md.c | 87 +++++++++++++++++++++-------------------
drivers/md/md.h | 26 ++----------
drivers/md/raid1.c | 29 +++++++-------
drivers/md/raid10.c | 27 ++++++-------
drivers/md/raid5.c | 61 ++++++++++++----------------
drivers/md/raid5.h | 2 -
include/linux/device-mapper.h | 1 -
8 files changed, 103 insertions(+), 138 deletions(-)
Thanks,
NeilBrown
On 2011-04-18 10:33, NeilBrown wrote:
>
>
> [[NOTE to dm-devel people - one of the patches here removes some
> now-unused code from dm-raid.c plus a declaration from device-mapper.h ]]
>
>
> On Mon, 18 Apr 2011 10:10:18 +0200 Jens Axboe <[email protected]> wrote:
>
>> On 2011-04-18 09:25, NeilBrown wrote:
>
>>>>> Therefore md cannot put anything useful on the list in 'struct blk_plug'.
>>>
>>> That is the heart of the problem.
>>
>> Hmm, I don't really see a way to avoid the list in that case. You really
>> do need some way to queue items, a single callback or flag or pointer
>> will not suffice.
>>
>> I've added the patch and removed the (now) useless ->unplugged_fn
>> callback. I suggest you base your md changes on top of my for-linus
>> branch and tell me when you are confident it looks good, then I'll pull
>> in your MD changes and submit them later today.
>>
>> OK with you?
>>
>
> Yes, that's perfect. Thanks.
>
> All of my plugging-related patches are now in a 'for-jens' branch:
>
> The following changes since commit 99e22598e9a8e0a996d69c8c0f6b7027cb57720a:
>
> block: drop queue lock before calling __blk_run_queue() for kblockd punt (2011-04-18 09:59:55 +0200)
>
> are available in the git repository at:
> git://neil.brown.name/md for-jens
>
> NeilBrown (6):
> md: use new plugging interface for RAID IO.
> md/dm - remove remains of plug_fn callback.
> md - remove old plugging code.
> md: provide generic support for handling unplug callbacks.
> md: incorporate new plugging into raid5.
> md: fix up raid1/raid10 unplugging.
>
> drivers/md/dm-raid.c | 8 ----
> drivers/md/md.c | 87 +++++++++++++++++++++-------------------
> drivers/md/md.h | 26 ++----------
> drivers/md/raid1.c | 29 +++++++-------
> drivers/md/raid10.c | 27 ++++++-------
> drivers/md/raid5.c | 61 ++++++++++++----------------
> drivers/md/raid5.h | 2 -
> include/linux/device-mapper.h | 1 -
> 8 files changed, 103 insertions(+), 138 deletions(-)
Great, thanks a lot Neil! It's pulled in now, will send the request to
Linus today.
--
Jens Axboe
Btw, I really start to wonder if the request level is the right place
to do this on-stack plugging. Wouldn't it be better to just plug
bios in the on-stack queue? That way we could also stop doing the
special case merging when adding to the plug list, and leave all the
merging / I/O schedule logic in the __make_request path. Probably
not .39 material, but worth a prototype?
Also what this discussion brought up is that the block layer data
structures are highly confusing. Using a small subset of the
request_queue also for make_request based drivers just doesn't make
sense. It seems like we should try to migrate the required state
to struct gendisk, and submit I/O through a block_device_ops.submit
method, leaving the request_queue as an internal abstraction for
the request based drivers.
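
To make that proposal a little more concrete, here is a purely illustrative sketch. block_device_operations has no .submit member in any tree; that member and submit_bio_via_disk are invented here only to visualise routing bio submission through the gendisk instead of a request_queue.

#include <linux/blkdev.h>
#include <linux/bio.h>

struct block_device_operations_proposed {
        int (*submit)(struct gendisk *disk, struct bio *bio);
        /* existing members (open, release, ioctl, ...) would stay as they are */
};

static void submit_bio_via_disk(struct gendisk *disk, struct bio *bio,
                                const struct block_device_operations_proposed *ops)
{
        /* make_request-style drivers would handle the bio directly ... */
        if (ops->submit) {
                ops->submit(disk, bio);
                return;
        }

        /* ... while request-based drivers keep using their request_queue */
        generic_make_request(bio);
}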
On 04/18/2011 11:19 AM, [email protected] wrote:
> Btw, I really start to wonder if the request level is the right place
> to do this on-stack plugging. Wouldn't it be better to just plug
> bios in the on-stack queue? That way we could also stop doing the
> special case merging when adding to the plug list, and leave all the
> merging / I/O schedule logic in the __make_request path. Probably
> not .39 material, but worth a prototype?
>
> Also what this discussion brought up is that the block layer data
> structures are highly confusing. Using a small subset of the
> request_queue also for make_request based drivers just doesn't make
> sense. It seems like we should try to migrate the required state
> to struct gendisk, and submit I/O through a block_device_ops.submit
> method, leaving the request_queue as an internal abstraction for
> the request based drivers.
>
Good point.
It would also help us with the device-mapper redesign agk and myself
discussed at LSF. Having a block_device_ops.submit function would
allow us to remap the actual request queue generically; and we would
even be able to address more than one request queue, which sounds
awfully similar to what Jens is trying to do ...
Cheers,
Hannes
--
Dr. Hannes Reinecke zSeries & Storage
[email protected] +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Markus Rex, HRB 16746 (AG Nürnberg)
On 2011-04-18 11:19, [email protected] wrote:
> Btw, I really start to wonder if the request level is the right place
> to do this on-stack plugging. Wouldn't it be better to just plug
> bios in the on-stack queue? That way we could also stop doing the
> special case merging when adding to the plug list, and leave all the
> merging / I/O schedule logic in the __make_request path. Probably
> not .39 material, but worth a prototype?
>
> Also what this discussion brought up is that the block layer data
> structures are highly confusing. Using a small subset of the
> request_queue also for make_request based drivers just doesn't make
> sense. It seems like we should try to migrate the required state
> to struct gendisk, and submit I/O through a block_device_ops.submit
> method, leaving the request_queue as an internal abstraction for
> the request based drivers.
Partially agree, I've never really liked the two methods we have where
the light version was originally meant for stacked devices but
gets used elsewhere now too. It also causes IO scheduling problems, and
then you get things like request based dm to work around that.
But the idea is really to move towards more localized, private queueing;
the multiqueue setup will really apply well there too. I'm trying to
flesh out the design of that; ideally it would be nice to unify the
different bits we have now.
But agree on pulling the stacked bits into some lower part, like the
gendisk. It would clean that up nicely.
--
Jens Axboe
On 2011-04-18 11:40, Hannes Reinecke wrote:
> On 04/18/2011 11:19 AM, [email protected] wrote:
>> Btw, I really start to wonder if the request level is the right place
>> to do this on-stack plugging. Wouldn't it be better to just plug
>> bios in the on-stack queue? That way we could also stop doing the
>> special case merging when adding to the plug list, and leave all the
>> merging / I/O schedule logic in the __make_request path. Probably
>> not .39 material, but worth a prototype?
>>
>> Also what this discussion brought up is that the block layer data
>> structures are highly confusing. Using a small subset of the
>> request_queue also for make_request based drivers just doesn't make
>> sense. It seems like we should try to migrate the required state
>> to struct gendisk, and submit I/O through a block_device_ops.submit
>> method, leaving the request_queue as an internal abstraction for
>> the request based drivers.
>>
> Good point.
> It would also help us with the device-mapper redesign agk and myself
> discussed at LSF. Having a block_device_ops.submit function would
> allow us to remap the actual request queue generically; and we would
> even be able to address more than one request queue, which sounds
> awfully similar to what Jens is trying to do ...
The multiqueue bits would still have one request_queue, but multiple
queueing structures (I called those blk_queue_ctx, iirc).
--
Jens Axboe
> NeilBrown (6):
> md: use new plugging interface for RAID IO.
> md/dm - remove remains of plug_fn callback.
> md - remove old plugging code.
> md: provide generic support for handling unplug callbacks.
> md: incorporate new plugging into raid5.
> md: fix up raid1/raid10 unplugging.
Looking over more of the unplugging left over, is there a reason to
keep the unplug_work bits in CFQ? They seem to rather counter the
current scheme (and it is the last user of kblockd outside of
blk-core.c)
> md: provide generic support for handling unplug callbacks.
This looks like some horribly ugly code to me. The real fix is to do
the plugging in the block layer for bios instead of requests. The
effect should be about the same, except that merging will become a
little easier as all bios will be on the list now when calling into
__make_request or its equivalent, and even better if we extend the
list sort callback to also sort by the start block it will actually
simplify the merge algorithm a lot as it only needs to do front merges
and no back merges for the on-stack merging.
In addition it should also allow for much more optimal queue_lock
roundtrips - we can keep it locked at the end of what's currently
__make_request to have it available for the next bio that's been
on the list. If it either can be merged now that we have the lock
and/or we optimize get_request_wait not to sleep in the fast path
we could get down to a single queue_lock roundtrip for each unplug.
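
For reference, a sketch of the sort extension being suggested, written against the existing request-based plug list in block/blk-core.c (a bio-based plug would compare bi_sector the same way). The blk_rq_pos() comparison is the assumed addition, not code from the tree.

/* plug_rq_cmp() as it might look with the extra ordering by start sector */
static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
{
        struct request *rqa = container_of(a, struct request, queuelist);
        struct request *rqb = container_of(b, struct request, queuelist);

        if (rqa->q != rqb->q)
                return rqa->q < rqb->q ? -1 : 1;

        /* same queue: order by start sector so merging only scans one way */
        if (blk_rq_pos(rqa) != blk_rq_pos(rqb))
                return blk_rq_pos(rqa) < blk_rq_pos(rqb) ? -1 : 1;

        return 0;
}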
On Mon, 18 Apr 2011 17:30:48 -0400 "[email protected]" <[email protected]>
wrote:
> > md: provide generic support for handling unplug callbacks.
>
> This looks like some horribly ugly code to me. The real fix is to do
> the plugging in the block layer for bios instead of requests. The
> effect should be about the same, except that merging will become a
> little easier as all bios will be on the list now when calling into
> __make_request or its equivalent, and even better if we extend the
> list sort callback to also sort by the start block it will actually
> simplify the merge algorithm a lot as it only needs to do front merges
> and no back merges for the on-stack merging.
>
> In addition it should also allow for much more optimal queue_lock
> roundtrips - we can keep it locked at the end of what's currently
> __make_request to have it available for the next bio that's been
> on the list. If it either can be merged now that we have the lock
> and/or we optimize get_request_wait not to sleep in the fast path
> we could get down to a single queue_lock roundtrip for each unplug.
Does the following match your thinking? I'm trying to work towards a more
concrete understanding...
- We change the ->make_request_fn interface so that it takes a list of
bios rather than a single bio - linked on ->bi_next.
These bios must all have the same ->bi_bdev. They *might* be sorted
by bi_sector (that needs to be decided).
- generic_make_request currently queues bios if there is already an active
request (this limits recursion). We enhance this to also queue bios
when code calls blk_start_plug.
In effect, generic_make_request becomes:
        if (current->plug)
                blk_add_to_plug(current->plug, bio);
        else {
                struct blk_plug plug;
                blk_start_plug(&plug);
                __generic_make_request(bio);
                blk_finish_plug(&plug);
        }
- __generic_make_request would sort the list of bios by bi_bdev (and maybe
bi_sector) and pass them along to the different ->make_request_fn
functions.
As there are likely to be only a few different bi_bdev values (often 1) but
hopefully lots and lots of bios it might be more efficient to do a linear
bucket sort based on bi_bdev, and only sort those buckets on bi_sector if
required.
Then make_request_fn handlers can expect to get lots of bios at once, can
optimise their handling as seems appropriate, and not require any further
plugging.
Is that at all close to what you are thinking?
NeilBrown
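
A rough sketch of the linear bucket sort described above, assuming the proposed list-taking ->make_request_fn interface (bios chained on ->bi_next, all for one bdev per call). The bi_next/bi_bdev field names follow the 2.6.39-era struct bio; everything else, including submit_bio_list_by_bdev, is invented for illustration.

#include <linux/bio.h>
#include <linux/blkdev.h>

#define MAX_PLUG_BDEVS 8        /* usually only one or two devices in practice */

struct bio_bucket {
        struct block_device *bdev;
        struct bio *head, *tail;
};

static void submit_bio_list_by_bdev(struct bio *list)
{
        struct bio_bucket buckets[MAX_PLUG_BDEVS];
        int nr = 0, i;

        while (list) {
                struct bio *bio = list;

                list = bio->bi_next;
                bio->bi_next = NULL;

                /* linear scan is fine: the number of distinct bdevs is tiny */
                for (i = 0; i < nr; i++)
                        if (buckets[i].bdev == bio->bi_bdev)
                                break;

                if (i == nr) {
                        if (nr == MAX_PLUG_BDEVS) {
                                /* bucket overflow: hand this bio over on its own */
                                struct request_queue *q = bdev_get_queue(bio->bi_bdev);

                                q->make_request_fn(q, bio);
                                continue;
                        }
                        buckets[nr].bdev = bio->bi_bdev;
                        buckets[nr].head = buckets[nr].tail = NULL;
                        nr++;
                }

                if (buckets[i].tail)
                        buckets[i].tail->bi_next = bio;
                else
                        buckets[i].head = bio;
                buckets[i].tail = bio;
        }

        for (i = 0; i < nr; i++) {
                struct request_queue *q = bdev_get_queue(buckets[i].bdev);

                /* optionally sort buckets[i].head by bi_sector before this call */
                q->make_request_fn(q, buckets[i].head);
        }
}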
On Tue, Apr 19, 2011 at 08:38:13AM +1000, NeilBrown wrote:
> Is that at all close to what you are thinking?
Yes, pretty much like that.
On Mon, Apr 18, 2011 at 05:23:06PM -0400, [email protected] wrote:
> > NeilBrown (6):
> > md: use new plugging interface for RAID IO.
> > md/dm - remove remains of plug_fn callback.
> > md - remove old plugging code.
> > md: provide generic support for handling unplug callbacks.
> > md: incorporate new plugging into raid5.
> > md: fix up raid1/raid10 unplugging.
>
> Looking over more of the unplugging left over, is there a reason to
> keep the unplug_work bits in CFQ? They seem to rather counter the
> current scheme (and it is the last user of kblockd outside of
> blk-core.c)
Jens, Vivek:
can you take a look at whether cfq_schedule_dispatch is still needed in
the new unplugging world order? It's the only kblockd user outside the
block core that's still left, and it seems rather odd to me at least.
On Fri, Apr 22, 2011 at 11:39:08AM -0400, [email protected] wrote:
> On Mon, Apr 18, 2011 at 05:23:06PM -0400, [email protected] wrote:
> > > NeilBrown (6):
> > > md: use new plugging interface for RAID IO.
> > > md/dm - remove remains of plug_fn callback.
> > > md - remove old plugging code.
> > > md: provide generic support for handling unplug callbacks.
> > > md: incorporate new plugging into raid5.
> > > md: fix up raid1/raid10 unplugging.
> >
> > Looking over more of the unplugging left over, is there a reason to
> > keep the unplug_work bits in CFQ? They seem to rather counter the
> > current scheme (and it is the last user of kblockd outside of
> > blk-core.c)
>
> Jens, Vivek:
>
> can you take a look at whether cfq_schedule_dispatch is still needed in
> the new unplugging world order? It's the only kblockd user outside the
> block core that's still left, and it seems rather odd to me at least.
I guess cfq_schedule_dispatch() will still be required. One use case is
that CFQ might not dispatch requests to the driver even if it has one (idling on
cfqq) and once the timer fires, it still needs to be able to kick the queue
and dispatch requests.
To me this sounds independent of the plugging logic. Or am I missing something?
Thanks
Vivek
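
For context, roughly what the pieces Vivek mentions look like, paraphrased from memory of 2.6.39-era cfq-iosched.c rather than quoted from the tree (field names and the __blk_run_queue() signature were in flux during this cycle, so details may differ): cfq_schedule_dispatch() punts to kblockd, whose worker grabs the queue lock and kicks the queue.

/* lives inside cfq-iosched.c, where struct cfq_data is defined */
static void cfq_kick_queue(struct work_struct *work)
{
        struct cfq_data *cfqd =
                container_of(work, struct cfq_data, unplug_work);
        struct request_queue *q = cfqd->queue;

        spin_lock_irq(q->queue_lock);
        __blk_run_queue(q);             /* single-arg form, as in 2.6.39 final */
        spin_unlock_irq(q->queue_lock);
}

static void cfq_schedule_dispatch(struct cfq_data *cfqd)
{
        /* e.g. the idle timer expired while requests were being held back */
        if (cfqd->busy_queues)
                kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
}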
On Fri, Apr 22, 2011 at 12:01:10PM -0400, Vivek Goyal wrote:
> On Fri, Apr 22, 2011 at 11:39:08AM -0400, [email protected] wrote:
> > On Mon, Apr 18, 2011 at 05:23:06PM -0400, [email protected] wrote:
> > > > NeilBrown (6):
> > > > md: use new plugging interface for RAID IO.
> > > > md/dm - remove remains of plug_fn callback.
> > > > md - remove old plugging code.
> > > > md: provide generic support for handling unplug callbacks.
> > > > md: incorporate new plugging into raid5.
> > > > md: fix up raid1/raid10 unplugging.
> > >
> > > Looking over more of the unplugging left over, is there a reason to
> > > keep the unplug_work bits in CFQ? They seem to rather counter the
> > > current scheme (and it is the last user of kblockd outside of
> > > blk-core.c)
> >
> > Jens, Vivek:
> >
> > can you take a look at whether cfq_schedule_dispatch is still needed in
> > the new unplugging world order? It's the only kblockd user outside the
> > block core that's still left, and it seems rather odd to me at least.
>
> I guess cfq_schedule_dispatch() will still be required. One use case is
> that CFQ might not dispatch requests to the driver even if it has one (idling on
> cfqq) and once the timer fires, it still needs to be able to kick the queue
> and dispatch requests.
>
> To me this sounds independent of the plugging logic. Or am I missing something?
I guess your question probably was whether we still need cfqd->unplug_work
and cfq_kick_queue(), and whether these can be replaced by the delayed_work
mechanism. I would think that we should be able to. Will write a patch and
test it.
Thanks
Vivek