Currently, the kobject_del for kobjs of mq, hctx and ctx is invoked
under sysfs_lock, lock inversion will come up when other one is
acessing the associated sysfs file and trying to acquire the
sysfs_lock. To fix it, use mutex_trylock in blk_mq_sysfs_ops and
blk_mq_hw_sysfs_ops, if the lock in on contending, return -EAGAIN.
Signed-off-by: Jianchao Wang <[email protected]>
---
block/blk-mq-sysfs.c | 52 ++++++++++++++++++++++++++++++++--------------------
1 file changed, 32 insertions(+), 20 deletions(-)
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index aafb442..210804f 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -53,11 +53,14 @@ static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr,
if (!entry->show)
return -EIO;
- res = -ENOENT;
- mutex_lock(&q->sysfs_lock);
- if (!blk_queue_dying(q))
- res = entry->show(ctx, page);
- mutex_unlock(&q->sysfs_lock);
+ res = -EAGAIN;
+ if (mutex_trylock(&q->sysfs_lock)) {
+ if (!blk_queue_dying(q))
+ res = entry->show(ctx, page);
+ else
+ res = -ENOENT;
+ mutex_unlock(&q->sysfs_lock);
+ }
return res;
}
@@ -76,11 +79,14 @@ static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr,
if (!entry->store)
return -EIO;
- res = -ENOENT;
- mutex_lock(&q->sysfs_lock);
- if (!blk_queue_dying(q))
- res = entry->store(ctx, page, length);
- mutex_unlock(&q->sysfs_lock);
+ res = -EAGAIN;
+ if (mutex_trylock(&q->sysfs_lock)) {
+ if (!blk_queue_dying(q))
+ res = entry->store(ctx, page, length);
+ else
+ res = -ENOENT;
+ mutex_unlock(&q->sysfs_lock);
+ }
return res;
}
@@ -99,11 +105,14 @@ static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj,
if (!entry->show)
return -EIO;
- res = -ENOENT;
- mutex_lock(&q->sysfs_lock);
- if (!blk_queue_dying(q))
- res = entry->show(hctx, page);
- mutex_unlock(&q->sysfs_lock);
+ res = -EAGAIN;
+ if (mutex_trylock(&q->sysfs_lock)) {
+ if (!blk_queue_dying(q))
+ res = entry->show(hctx, page);
+ else
+ res = -ENOENT;
+ mutex_unlock(&q->sysfs_lock);
+ }
return res;
}
@@ -123,11 +132,14 @@ static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj,
if (!entry->store)
return -EIO;
- res = -ENOENT;
- mutex_lock(&q->sysfs_lock);
- if (!blk_queue_dying(q))
- res = entry->store(hctx, page, length);
- mutex_unlock(&q->sysfs_lock);
+ res = -EAGAIN;
+ if (mutex_trylock(&q->sysfs_lock)) {
+ if (!blk_queue_dying(q))
+ res = entry->store(hctx, page, length);
+ else
+ res = -ENOENT;
+ mutex_unlock(&q->sysfs_lock);
+ }
return res;
}
--
2.7.4
On Tue, 2018-06-19 at 15:00 +0800, Jianchao Wang wrote:
> Currently, the kobject_del for kobjs of mq, hctx and ctx is invoked
> under sysfs_lock, lock inversion will come up when other one is
> acessing the associated sysfs file and trying to acquire the
> sysfs_lock. To fix it, use mutex_trylock in blk_mq_sysfs_ops and
> blk_mq_hw_sysfs_ops, if the lock in on contending, return -EAGAIN.
Is this a theoretical issue or something you actually ran into? Which lock
other than sysfs_lock do you think is involved in the lock inversion?
Bart.
Hi Bart
On 06/19/2018 11:20 PM, Bart Van Assche wrote:
> On Tue, 2018-06-19 at 15:00 +0800, Jianchao Wang wrote:
>> Currently, the kobject_del for kobjs of mq, hctx and ctx is invoked
>> under sysfs_lock, lock inversion will come up when other one is
>> acessing the associated sysfs file and trying to acquire the
>> sysfs_lock. To fix it, use mutex_trylock in blk_mq_sysfs_ops and
>> blk_mq_hw_sysfs_ops, if the lock in on contending, return -EAGAIN.
>
> Is this a theoretical issue or something you actually ran into? Which lock
> other than sysfs_lock do you think is involved in the lock inversion?
>
It is very easy to reproduce with following scripts.
script 0
while true
do
modprobe null_blk queue_mode=2 shared_tags=1
sleep 0.1
rmmod null_blk
sleep 0.1
done
script 1
file0="/sys/block/nullb0/mq/0/nr_tags"
file1="/sys/block/nullb0/mq/0/cpu0/rq_list"
while true;
do
if [ -e $file0 ];then
cat $file0
fi
if [ -e $file1 ];then
cat $file1
fi
done
Here is the hung task log:
[ 246.752087] INFO: task rmmod:12789 blocked for more than 30 seconds.
[ 246.752801] Not tainted 4.18.0-rc1 #88
[ 246.753458] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 246.754192] rmmod D 0 12789 3142 0x00000080
[ 246.754951] Call Trace:
[ 246.755715] ? __schedule+0x3f9/0xae0
[ 246.756546] schedule+0x3c/0x90
[ 246.757440] __kernfs_remove+0x1d0/0x2b0
[ 246.757644] ? wait_woken+0xb0/0xb0
[ 246.757850] kernfs_remove+0x1f/0x30
[ 246.758059] kobject_del+0x13/0x40
[ 246.758271] blk_mq_unregister_dev+0x4f/0xb0
[ 246.758488] blk_unregister_queue+0x71/0x100
[ 246.758709] del_gendisk+0x139/0x280
[ 246.758936] null_del_dev+0x40/0xf0 [null_blk]
[ 246.759165] null_exit+0x50/0xbec [null_blk]
[ 246.759397] __x64_sys_delete_module+0x12e/0x1d0
[ 246.759636] do_syscall_64+0x5a/0x1a0
[ 246.759876] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 246.760129] RIP: 0033:0x7fc518522927
[ 246.760481] Code: Bad RIP value.
[ 246.760736] RSP: 002b:00007ffee4c69b68 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
[ 246.761005] RAX: ffffffffffffffda RBX: 00007ffee4c69bc8 RCX: 00007fc518522927
[ 246.761309] RDX: 000000000000000a RSI: 0000000000000800 RDI: 000055783881b248
[ 246.761612] RBP: 000055783881b1e0 R08: 0000000000000000 R09: 1999999999999999
[ 246.761902] R10: 0000000000000000 R11: 0000000000000206 R12: 00007ffee4c69d90
[ 246.762199] R13: 00007ffee4c6b774 R14: 000055783881a010 R15: 000055783881b1e0
[ 246.762503] INFO: task cat:12790 blocked for more than 30 seconds.
[ 246.762812] Not tainted 4.18.0-rc1 #88
[ 246.763124] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 246.763453] cat D 0 12790 3141 0x00000080
[ 246.763789] Call Trace:
[ 246.764130] ? __schedule+0x3f9/0xae0
[ 246.764552] schedule+0x3c/0x90
[ 246.764895] schedule_preempt_disabled+0x14/0x20
[ 246.765244] __mutex_lock+0x41c/0x990
[ 246.765595] ? blk_mq_hw_sysfs_show+0x35/0x80
[ 246.765950] ? preempt_count_sub+0x92/0xd0
[ 246.766311] ? blk_mq_hw_sysfs_show+0x35/0x80
[ 246.766675] blk_mq_hw_sysfs_show+0x35/0x80
[ 246.767043] sysfs_kf_seq_show+0xad/0x100
[ 246.767416] seq_read+0xa5/0x410
[ 246.767790] __vfs_read+0x23/0x160
[ 246.768172] vfs_read+0xa0/0x140
[ 246.768627] ksys_read+0x45/0xa0
[ 246.769008] do_syscall_64+0x5a/0x1a0
[ 246.769391] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 246.769798] RIP: 0033:0x7f743e39e260
[ 246.770216] Code: Bad RIP value.
Thanks
Jianchao
>
>
>
On Wed, 2018-06-20 at 10:09 +0800, jianchao.wang wrote:
> It is very easy to reproduce with following scripts.
>
> script 0
> while true
> do
> modprobe null_blk queue_mode=2 shared_tags=1
> sleep 0.1
> rmmod null_blk
> sleep 0.1
> done
>
> script 1
> file0="/sys/block/nullb0/mq/0/nr_tags"
> file1="/sys/block/nullb0/mq/0/cpu0/rq_list"
> while true;
> do
> if [ -e $file0 ];then
> cat $file0
> fi
> if [ -e $file1 ];then
> cat $file1
> fi
> done
Hello Jianchao,
Thanks for having shared a reproducer. However, the approach of the patch you
posted doesn't seem like the right approach to me. I propose to proceed as
follows:
* Convert the reproducer into a blktests test and submit is as a patch to the
blktests project (https://github.com/osandov/blktests).
* Remove the mutex_lock/unlock(&sysfs_lock) calls from blk_cleanup_queue().
These calls are useless. Block drivers are required to call del_gendisk()
before calling blk_cleanup_queue(). That means that sysfs attributes are
removed synchronously before blk_cleanup_queue() is called. The following
statement in blk_cleanup_queue() verifies that:
WARN_ON_ONCE(q->kobj.state_in_sysfs);
BTW, this also means that the blk_queue_dying() checks in various show and
store methods are superfluous.
* Introduce a new mutex to serialize blk_mq_register_dev() and
blk_mq_sysfs_register() and blk_mq_sysfs_unregister(). I think it is wrong
that these functions use sysfs_mutex.
* Document the purpose of sysfs_mutex in include/linux/blkdev.h, namely to
serialize the sysfs .show() and .store() callback functions and also to
serialize elevator changes.
Thanks,
Bart.
Hi Bart
On 06/21/2018 12:18 AM, Bart Van Assche wrote:
> On Wed, 2018-06-20 at 10:09 +0800, jianchao.wang wrote:
>> It is very easy to reproduce with following scripts.
>>
>> script 0
>> while true
>> do
>> modprobe null_blk queue_mode=2 shared_tags=1
>> sleep 0.1
>> rmmod null_blk
>> sleep 0.1
>> done
>>
>> script 1
>> file0="/sys/block/nullb0/mq/0/nr_tags"
>> file1="/sys/block/nullb0/mq/0/cpu0/rq_list"
>> while true;
>> do
>> if [ -e $file0 ];then
>> cat $file0
>> fi
>> if [ -e $file1 ];then
>> cat $file1
>> fi
>> done
>
> Hello Jianchao,
>
> Thanks for having shared a reproducer. However, the approach of the patch you
> posted doesn't seem like the right approach to me. I propose to proceed as
> follows:
> * Convert the reproducer into a blktests test and submit is as a patch to the
> blktests project (https://urldefense.proofpoint.com/v2/url?u=https-3A__github.com_osandov_blktests&d=DwIGaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=7WdAxUBeiTUTCy8v-7zXyr4qk7sx26ATvfo6QSTvZyQ&m=_TjwODcX-0o-OZfrVTFMpvbS0QbYxoya17aHM9pZAXw&s=wdVpTkE6PJX3GXvjXNwEup3opdflyCGY324CAYXSixY&e=).
> * Remove the mutex_lock/unlock(&sysfs_lock) calls from blk_cleanup_queue().
> These calls are useless. Block drivers are required to call del_gendisk()
> before calling blk_cleanup_queue(). That means that sysfs attributes are
> removed synchronously before blk_cleanup_queue() is called. The following
> statement in blk_cleanup_queue() verifies that:
> WARN_ON_ONCE(q->kobj.state_in_sysfs);
> BTW, this also means that the blk_queue_dying() checks in various show and
> store methods are superfluous.
> * Introduce a new mutex to serialize blk_mq_register_dev() and
> blk_mq_sysfs_register() and blk_mq_sysfs_unregister(). I think it is wrong
> that these functions use sysfs_mutex.
> * Document the purpose of sysfs_mutex in include/linux/blkdev.h, namely to
> serialize the sysfs .show() and .store() callback functions and also to
> serialize elevator changes.
>
Really appreciate your kindly and detailed directive.
I will post the V2 version based on your suggestions later.
Thanks
Jianchao
>
>
>
Hi Bart
On 06/21/2018 04:23 PM, jianchao.wang wrote:
> Hi Bart
>
> On 06/21/2018 12:18 AM, Bart Van Assche wrote:
>> On Wed, 2018-06-20 at 10:09 +0800, jianchao.wang wrote:
>>> It is very easy to reproduce with following scripts.
>>>
>>> script 0
>>> while true
>>> do
>>> modprobe null_blk queue_mode=2 shared_tags=1
>>> sleep 0.1
>>> rmmod null_blk
>>> sleep 0.1
>>> done
>>>
>>> script 1
>>> file0="/sys/block/nullb0/mq/0/nr_tags"
>>> file1="/sys/block/nullb0/mq/0/cpu0/rq_list"
>>> while true;
>>> do
>>> if [ -e $file0 ];then
>>> cat $file0
>>> fi
>>> if [ -e $file1 ];then
>>> cat $file1
>>> fi
>>> done
>>
>> Hello Jianchao,
>>
>> Thanks for having shared a reproducer. However, the approach of the patch you
>> posted doesn't seem like the right approach to me. I propose to proceed as
>> follows:
>> * Convert the reproducer into a blktests test and submit is as a patch to the
>> blktests project (https://urldefense.proofpoint.com/v2/url?u=https-3A__github.com_osandov_blktests&d=DwIGaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=7WdAxUBeiTUTCy8v-7zXyr4qk7sx26ATvfo6QSTvZyQ&m=_TjwODcX-0o-OZfrVTFMpvbS0QbYxoya17aHM9pZAXw&s=wdVpTkE6PJX3GXvjXNwEup3opdflyCGY324CAYXSixY&e=).
>> * Remove the mutex_lock/unlock(&sysfs_lock) calls from blk_cleanup_queue().
>> These calls are useless. Block drivers are required to call del_gendisk()
>> before calling blk_cleanup_queue(). That means that sysfs attributes are
>> removed synchronously before blk_cleanup_queue() is called. The following
>> statement in blk_cleanup_queue() verifies that:
>> WARN_ON_ONCE(q->kobj.state_in_sysfs);
>> BTW, this also means that the blk_queue_dying() checks in various show and
>> store methods are superfluous.
>> * Introduce a new mutex to serialize blk_mq_register_dev() and
>> blk_mq_sysfs_register() and blk_mq_sysfs_unregister(). I think it is wrong
>> that these functions use sysfs_mutex.
>> * Document the purpose of sysfs_mutex in include/linux/blkdev.h, namely to
>> serialize the sysfs .show() and .store() callback functions and also to
>> serialize elevator changes.
>>
>
> Really appreciate your kindly and detailed directive.
> I will post the V2 version based on your suggestions later.
>
V2 patch has been posted, would you please take a look at on them ?
https://marc.info/?l=linux-block&m=152965143412927&w=2
In addition, it is hard to add test case for this issue in blktests,
because it is hard to decide how many times or how long time to run
the test two scripts. If too long, it may delay the whole test, if
too short, the issue may cannot be exposed.
Thanks
Jianchao
On 06/25/18 20:47, jianchao.wang wrote:
> V2 patch has been posted, would you please take a look at on them ?
> https://marc.info/?l=linux-block&m=152965143412927&w=2
Hello Jianchao,
Patches 1/3 and 2/3 of that series look fine to me but I don't like the
approach of patch 3/3 so please do not expect a review from me for that
patch.
Thanks,
Bart.
On 06/28/2018 09:11 AM, jianchao.wang wrote:
> Hi Bart
>
> On 06/26/2018 11:36 PM, Bart Van Assche wrote:
>> On 06/25/18 20:47, jianchao.wang wrote:
>>> V2 patch has been posted, would you please take a look at on them ?
>>> https://urldefense.proofpoint.com/v2/url?u=https-3A__marc.info_-3Fl-3Dlinux-2Dblock-26m-3D152965143412927-26w-3D2&d=DwICaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=7WdAxUBeiTUTCy8v-7zXyr4qk7sx26ATvfo6QSTvZyQ&m=FPLHVLVF0V8ZCvwZJx7vHy9v01iAQogJ_IbvseGhElo&s=tQ0ZiK7VrV4j-6K_WEfiWAyxogOrC8hOm6coS9nAmYI&e=
>>
>> Hello Jianchao,
>>
>> Patches 1/3 and 2/3 of that series look fine to me but I don't like the approach of patch 3/3 so please do not expect a review from me for that patch.
>>
>
> Would you please explain the defects of the patch 3/3 ?
We still need some way to serialize the blk_register/unregister_queue, blk_mq_sysfs_register/unregister and sysfs queue, mq, hctx entries' show and store method. So I didn't introduce another mutex as your suggestions.
>
> Thanks
> Jianchao
>
Hi Bart
On 06/26/2018 11:36 PM, Bart Van Assche wrote:
> On 06/25/18 20:47, jianchao.wang wrote:
>> V2 patch has been posted, would you please take a look at on them ?
>> https://urldefense.proofpoint.com/v2/url?u=https-3A__marc.info_-3Fl-3Dlinux-2Dblock-26m-3D152965143412927-26w-3D2&d=DwICaQ&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=7WdAxUBeiTUTCy8v-7zXyr4qk7sx26ATvfo6QSTvZyQ&m=FPLHVLVF0V8ZCvwZJx7vHy9v01iAQogJ_IbvseGhElo&s=tQ0ZiK7VrV4j-6K_WEfiWAyxogOrC8hOm6coS9nAmYI&e=
>
> Hello Jianchao,
>
> Patches 1/3 and 2/3 of that series look fine to me but I don't like the approach of patch 3/3 so please do not expect a review from me for that patch.
>
Would you please explain the defects of the patch 3/3 ?
Thanks
Jianchao