__disk_unblock_events() will call queue_delayed_work() with a '0' argument
under a spin lock. This might cause the queue_work item to be executed
immediately, and run into a deadlock in disk_check_events() waiting for
the lock to be released.
This patch converts the 'blocked' counter into an atomic variable, so we don't
need to hold a spinlock anymore when scheduling the workqueue function.
Signed-off-by: Hannes Reinecke <[email protected]>
---
block/genhd.c | 36 +++++++++++++-----------------------
1 file changed, 13 insertions(+), 23 deletions(-)
diff --git a/block/genhd.c b/block/genhd.c
index 9f8cb7beaad1..07e70f0c9c25 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1379,7 +1379,7 @@ struct disk_events {
spinlock_t lock;
struct mutex block_mutex; /* protects blocking */
- int block; /* event blocking depth */
+ atomic_t block; /* event blocking depth */
unsigned int pending; /* events already sent out */
unsigned int clearing; /* events being cleared */
@@ -1439,8 +1439,6 @@ static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
void disk_block_events(struct gendisk *disk)
{
struct disk_events *ev = disk->ev;
- unsigned long flags;
- bool cancel;
if (!ev)
return;
@@ -1451,11 +1449,7 @@ void disk_block_events(struct gendisk *disk)
*/
mutex_lock(&ev->block_mutex);
- spin_lock_irqsave(&ev->lock, flags);
- cancel = !ev->block++;
- spin_unlock_irqrestore(&ev->lock, flags);
-
- if (cancel)
+ if (atomic_inc_return(&ev->block) == 1)
cancel_delayed_work_sync(&disk->ev->dwork);
mutex_unlock(&ev->block_mutex);
@@ -1467,23 +1461,19 @@ static void __disk_unblock_events(struct gendisk *disk, bool check_now)
unsigned long intv;
unsigned long flags;
+ if (atomic_dec_return(&ev->block) <= 0) {
+ mutex_unlock(&ev->block_mutex);
+ return;
+ }
spin_lock_irqsave(&ev->lock, flags);
-
- if (WARN_ON_ONCE(ev->block <= 0))
- goto out_unlock;
-
- if (--ev->block)
- goto out_unlock;
-
intv = disk_events_poll_jiffies(disk);
+ spin_unlock_irqrestore(&ev->lock, flags);
if (check_now)
queue_delayed_work(system_freezable_power_efficient_wq,
&ev->dwork, 0);
else if (intv)
queue_delayed_work(system_freezable_power_efficient_wq,
&ev->dwork, intv);
-out_unlock:
- spin_unlock_irqrestore(&ev->lock, flags);
}
/**
@@ -1523,10 +1513,10 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask)
spin_lock_irq(&ev->lock);
ev->clearing |= mask;
- if (!ev->block)
+ spin_unlock_irq(&ev->lock);
+ if (!atomic_read(&ev->block))
mod_delayed_work(system_freezable_power_efficient_wq,
&ev->dwork, 0);
- spin_unlock_irq(&ev->lock);
}
/**
@@ -1638,11 +1628,11 @@ static void disk_check_events(struct disk_events *ev,
*clearing_ptr &= ~clearing;
intv = disk_events_poll_jiffies(disk);
- if (!ev->block && intv)
+ spin_unlock_irq(&ev->lock);
+ if (!atomic_read(&ev->block) && intv)
queue_delayed_work(system_freezable_power_efficient_wq,
&ev->dwork, intv);
- spin_unlock_irq(&ev->lock);
/*
* Tell userland about new events. Only the events listed in
@@ -1807,7 +1797,7 @@ static void disk_alloc_events(struct gendisk *disk)
ev->disk = disk;
spin_lock_init(&ev->lock);
mutex_init(&ev->block_mutex);
- ev->block = 1;
+ atomic_set(&ev->block, 1);
ev->poll_msecs = -1;
INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
@@ -1851,6 +1841,6 @@ static void disk_del_events(struct gendisk *disk)
static void disk_release_events(struct gendisk *disk)
{
/* the block count should be 1 from disk_del_events() */
- WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
+ WARN_ON_ONCE(disk->ev && atomic_read(&disk->ev->block) != 1);
kfree(disk->ev);
}
--
2.29.2
On Tue, Jun 01, 2021 at 01:01:45PM +0200, Hannes Reinecke wrote:
> __disk_unblock_events() will call queue_delayed_work() with a '0' argument
> under a spin lock. This might cause the queue_work item to be executed
> immediately, and run into a deadlock in disk_check_events() waiting for
> the lock to be released.
Do you have lockdep warning on this 'deadlock'?
Executed immediately doesn't mean the work fn is run in the current
task context, and it is actually run in one wq worker(task) context, so
__queue_work() usually wakes up one wq worker for running the work fn,
then there shouldn't be the 'deadlock' you mentioned.
Thanks,
Ming
On 6/1/21 3:25 PM, Ming Lei wrote:
> On Tue, Jun 01, 2021 at 01:01:45PM +0200, Hannes Reinecke wrote:
>> __disk_unblock_events() will call queue_delayed_work() with a '0' argument
>> under a spin lock. This might cause the queue_work item to be executed
>> immediately, and run into a deadlock in disk_check_events() waiting for
>> the lock to be released.
>
> Do you have lockdep warning on this 'deadlock'?
>
> Executed immediately doesn't mean the work fn is run in the current
> task context, and it is actually run in one wq worker(task) context, so
> __queue_work() usually wakes up one wq worker for running the work fn,
> then there shouldn't be the 'deadlock' you mentioned.
>
That's what I thought, too, but then we have a customer report
complaining about a stuck installation, and this kernel message:
> [ 990.305908] INFO: task init:1 blocked for more than 491 seconds.
> [ 990.311904] Not tainted 5.3.18-22-default #1
> [ 990.316682] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
disables this message.
> [ 990.324483] init D 0 1 0 0x00000000
> [ 990.329950] Call Trace:
> [ 990.332396] __schedule+0x28a/0x6d0
> [ 990.335876] ? work_busy+0x80/0x80
> [ 990.339267] schedule+0x2f/0xa0
> [ 990.342399] schedule_timeout+0x1dd/0x300
> [ 990.346399] ? check_preempt_curr+0x29/0x80
> [ 990.350569] ? ttwu_do_wakeup+0x19/0x150
> [ 990.354480] ? work_busy+0x80/0x80
> [ 990.357869] wait_for_completion+0xba/0x140
> [ 990.362040] ? wake_up_q+0xa0/0xa0
> [ 990.365430] __flush_work+0x177/0x1d0
> [ 990.369080] ? worker_detach_from_pool+0xa0/0xa0
> [ 990.373682] __cancel_work_timer+0x12b/0x1b0
> [ 990.377940] ? exact_lock+0xd/0x20
> [ 990.381329] ? kobj_lookup+0x113/0x160
> [ 990.385067] disk_block_events+0x78/0x90
> [ 990.388979] __blkdev_get+0x6d/0x7e0
> [ 990.392542] ? blkdev_get_by_dev+0x40/0x40
> [ 990.396627] do_dentry_open+0x1ea/0x380
> [ 990.400450] path_openat+0x2fc/0x1520
> [ 990.404103] do_filp_open+0x9b/0x110
> [ 990.407667] ? kmem_cache_alloc+0x3d/0x250
> [ 990.411749] ? do_sys_open+0x1bd/0x250
> [ 990.415486] do_sys_open+0x1bd/0x250
> [ 990.419052] do_syscall_64+0x5b/0x1e0
> [ 990.422701] entry_SYSCALL_64_after_hwframe+0x44/0xa9
Which does vanish with this patch.
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
[email protected] +49 911 74053 688
SUSE Software Solutions Germany GmbH, 90409 Nürnberg
GF: F. Imendörffer, HRB 36809 (AG Nürnberg)
Greeting,
FYI, we noticed the following commit (built with gcc-9):
commit: 8107f5f540ff2700d9b300343ccbd3368970cf40 ("[PATCH] block/genhd: use atomic_t for disk_event->block")
url: https://github.com/0day-ci/linux/commits/Hannes-Reinecke/block-genhd-use-atomic_t-for-disk_event-block/20210601-190226
base: https://git.kernel.org/cgit/linux/kernel/git/axboe/linux-block.git for-next
in testcase: boot
on test machine: qemu-system-i386 -enable-kvm -cpu SandyBridge -smp 2 -m 4G
caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):
+-----------------------------------------------+------------+------------+
| | 3fc19ec343 | 8107f5f540 |
+-----------------------------------------------+------------+------------+
| boot_successes | 10 | 0 |
| boot_failures | 0 | 6 |
| WARNING:bad_unlock_balance_detected | 0 | 6 |
| is_trying_to_release_lock(&ev->block_mutex)at | 0 | 6 |
| calltrace:__disk_unblock_events | 0 | 6 |
+-----------------------------------------------+------------+------------+
If you fix the issue, kindly add following tag
Reported-by: kernel test robot <[email protected]>
[ 65.041606] WARNING: bad unlock balance detected!
[ 65.042098] 5.13.0-rc3-00039-g8107f5f540ff #1 Not tainted
[ 65.042660] -------------------------------------
[ 65.043157] kworker/u4:5/139 is trying to release lock (&ev->block_mutex) at:
[ 65.043913] __disk_unblock_events (kbuild/src/consumer/block/genhd.c:1477)
[ 65.044130] but there are no more locks to release!
[ 65.044130]
[ 65.044130] other info that might help us debug this:
[ 65.044130] 2 locks held by kworker/u4:5/139:
[ 65.044130] #0: 410078b8 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work (kbuild/src/consumer/kernel/workqueue.c:615 kbuild/src/consumer/kernel/workqueue.c:643 kbuild/src/consumer/kernel/workqueue.c:2246)
[ 65.044130] #1: 4264df34 ((work_completion)(&entry->work)){+.+.}-{0:0}, at: process_one_work (kbuild/src/consumer/kernel/workqueue.c:615 kbuild/src/consumer/kernel/workqueue.c:643 kbuild/src/consumer/kernel/workqueue.c:2246)
[ 65.044130]
[ 65.044130] stack backtrace:
[ 65.044130] CPU: 1 PID: 139 Comm: kworker/u4:5 Not tainted 5.13.0-rc3-00039-g8107f5f540ff #1
[ 65.044130] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
[ 65.044130] Workqueue: events_unbound async_run_entry_fn
[ 65.044130] Call Trace:
[ 65.044130] ? dump_stack (kbuild/src/consumer/lib/dump_stack.c:122)
[ 65.044130] ? __disk_unblock_events (kbuild/src/consumer/block/genhd.c:1477)
[ 65.052205] ? print_unlock_imbalance_bug.cold (kbuild/src/consumer/kernel/locking/lockdep.c:4955 kbuild/src/consumer/kernel/locking/lockdep.c:4931)
[ 65.052205] ? __disk_unblock_events (kbuild/src/consumer/block/genhd.c:1477)
[ 65.052205] ? lock_release (kbuild/src/consumer/kernel/locking/lockdep.c:5192 kbuild/src/consumer/kernel/locking/lockdep.c:5532)
[ 65.052205] ? __mutex_unlock_slowpath (kbuild/src/consumer/arch/x86/include/asm/atomic.h:29 kbuild/src/consumer/include/asm-generic/atomic-instrumented.h:28 kbuild/src/consumer/include/asm-generic/atomic-long.h:523 kbuild/src/consumer/kernel/locking/mutex.c:1238)
[ 65.052205] ? mutex_unlock (kbuild/src/consumer/kernel/locking/mutex.c:741)
[ 65.052205] ? __disk_unblock_events (kbuild/src/consumer/block/genhd.c:1477)
[ 65.052205] ? __device_add_disk (kbuild/src/consumer/block/genhd.c:565)
[ 65.052205] ? device_add_disk (kbuild/src/consumer/block/genhd.c:572)
[ 65.052205] ? do_floppy_init (kbuild/src/consumer/drivers/block/floppy.c:4704)
[ 65.052205] ? ktime_get (kbuild/src/consumer/kernel/time/timekeeping.c:831)
[ 65.052205] ? floppy_async_init (kbuild/src/consumer/drivers/block/floppy.c:4742)
[ 65.052205] ? async_run_entry_fn (kbuild/src/consumer/kernel/async.c:134)
[ 65.052205] ? process_one_work (kbuild/src/consumer/arch/x86/include/asm/atomic.h:29 kbuild/src/consumer/include/asm-generic/atomic-instrumented.h:28 kbuild/src/consumer/include/linux/jump_label.h:254 kbuild/src/consumer/include/linux/jump_label.h:264 kbuild/src/consumer/include/trace/events/workqueue.h:108 kbuild/src/consumer/kernel/workqueue.c:2280)
[ 65.052205] ? worker_thread (kbuild/src/consumer/include/linux/list.h:282 kbuild/src/consumer/kernel/workqueue.c:2422)
[ 65.052205] ? kthread (kbuild/src/consumer/kernel/kthread.c:313)
[ 65.052205] ? process_one_work (kbuild/src/consumer/kernel/workqueue.c:2364)
[ 65.052205] ? kthread_create_worker_on_cpu (kbuild/src/consumer/kernel/kthread.c:266)
[ 65.052205] ? ret_from_fork (kbuild/src/consumer/arch/x86/entry/entry_32.S:775)
[ 65.066846] loop: module loaded
[ 65.078122] null_blk: module loaded
[ 65.078673] Silicon Labs C2 port support v. 0.51.0 - (C) 2007 Rodolfo Giometti
[ 65.079489] c2port c2port0: C2 port uc added
[ 65.079958] c2port c2port0: uc flash has 30 blocks x 512 bytes (15360 bytes total)
[ 65.081010] usbcore: registered new interface driver rtsx_usb
[ 65.082303] usbcore: registered new interface driver pn533_usb
[ 65.082978] usbcore: registered new interface driver nfcmrvl
[ 65.083597] NCI uart driver 'nfcmrvl_uart [0]' registered
[ 65.084302] Uniform Multi-Platform E-IDE driver
[ 65.085233] ide_generic: please use "probe_mask=0x3f" module parameter for probing all legacy ISA IDE ports
[ 65.086270] ide-cd driver 5.00
[ 65.087896] SSFDC read-only Flash Translation layer
[ 65.088484] mtdoops: mtd device (mtddev=name/number) must be supplied
[ 65.089317] platform physmap-flash.0: failed to claim resource 0: [mem 0x08000000-0x07ffffff]
[ 65.090255] SBC-GXx flash: IO:0x258-0x259 MEM:0xdc000-0xdffff
[ 65.090961] slram: not enough parameters.
[ 65.091621] Driver 'dfl-spi-altera' was unable to register with bus_type 'dfl' because the bus was not initialized.
[ 65.095780] wireguard: allowedips self-tests: pass
[ 65.113505] wireguard: nonce counter self-tests: pass
[ 65.691142] wireguard: ratelimiter self-tests: pass
[ 65.691780] wireguard: WireGuard 1.0.0 loaded. See http://www.wireguard.com for information.
[ 65.692614] wireguard: Copyright (C) 2015-2019 Jason A. Donenfeld <[email protected]>. All Rights Reserved.
[ 65.693599] eql: Equalizer2002: Simon Janes ([email protected]) and David S. Miller ([email protected])
[ 65.694895] MACsec IEEE 802.1AE
[ 65.696576] libphy: Fixed MDIO Bus: probed
[ 65.698059] usbcore: registered new interface driver mvusb_mdio
[ 65.698765] tun: Universal TUN/TAP device driver, 1.6
[ 65.699721] e1000: Intel(R) PRO/1000 Network Driver
[ 65.700291] e1000: Copyright (c) 1999-2006 Intel Corporation.
[ 65.870690] ACPI: _SB_.LNKC: Enabled at IRQ 11
[ 66.238500] e1000 0000:00:03.0 eth0: (PCI:33MHz:32-bit) 52:54:00:12:34:56
[ 66.239265] e1000 0000:00:03.0 eth0: Intel(R) PRO/1000 Network Connection
[ 66.240099] mkiss: AX.25 Multikiss, Hans Albas PE1AYX
[ 66.240719] AX.25: 6pack driver, Revision: 0.3.0
[ 66.241245] AX.25: bpqether driver version 004
[ 66.241704] baycom_ser_fdx: (C) 1996-2000 Thomas Sailer, HB9JNX/AE4WA
[ 66.241704] baycom_ser_fdx: version 0.10
[ 66.243814] hdlcdrv: (C) 1996-2000 Thomas Sailer HB9JNX/AE4WA
[ 66.244511] hdlcdrv: version 0.8
[ 66.244882] baycom_ser_hdx: (C) 1996-2000 Thomas Sailer, HB9JNX/AE4WA
[ 66.244882] baycom_ser_hdx: version 0.10
[ 66.246962] baycom_par: (C) 1996-2000 Thomas Sailer, HB9JNX/AE4WA
[ 66.246962] baycom_par: version 0.9
[ 66.249490] PPP generic driver version 2.4.2
[ 66.250116] PPP BSD Compression module registered
[ 66.250727] PPP MPPE Compression module registered
[ 66.251273] hdlc: HDLC support module revision 1.22
[ 66.251874] airo(): Probing for PCI adapters
[ 66.252564] airo(): Finished probing for PCI adapters
[ 66.253144] usbcore: registered new interface driver usb8xxx
[ 66.253752] libertas_spi: Libertas SPI driver
[ 66.254320] usbcore: registered new interface driver zd1201
[ 66.255021] usbcore: registered new interface driver rndis_wlan
[ 66.255744] usbcore: registered new interface driver kaweth
[ 66.256395] pegasus: v0.9.3 (2013/04/25), Pegasus/Pegasus II USB Ethernet driver
[ 66.257252] usbcore: registered new interface driver pegasus
[ 66.257925] usbcore: registered new interface driver rtl8150
[ 66.258615] usbcore: registered new interface driver r8152
[ 66.259237] hso: drivers/net/usb/hso.c: Option Wireless
[ 66.259929] usbcore: registered new interface driver hso
[ 66.260559] usbcore: registered new interface driver cdc_ether
[ 66.261214] usbcore: registered new interface driver cdc_eem
[ 66.261836] usbcore: registered new interface driver dm9601
To reproduce:
# build kernel
cd linux
cp config-5.13.0-rc3-00039-g8107f5f540ff .config
make HOSTCC=gcc-9 CC=gcc-9 ARCH=i386 olddefconfig prepare modules_prepare bzImage
git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
bin/lkp qemu -k <bzImage> job-script # job-script is attached in this email
---
0DAY/LKP+ Test Infrastructure Open Source Technology Center
https://lists.01.org/hyperkitty/list/[email protected] Intel Corporation
Thanks,
Oliver Sang