From: Bian Yu <[email protected]>
When raid5 hit a fresh badblock, this badblock will flagged as unack
badblock until md_update_sb is called.
But md_stop/reboot/md_set_readonly will avoid raid5d call md_update_sb
in md_check_recovery, the badblock will always be unack, so raid5d
thread enter a infinite loop and never can unregister sync_thread
that cause deadlock.
To solve this, before md_stop_writes call md_unregister_thread, set
MD_STOPPING_WRITES on mddev->flags. In raid5.c analyse_stripe judge
MD_STOPPING_WRITES bit on mddev->flags, if setted don't block rdev
to wait md_update_sb. so raid5d thread can be stopped.
I can reproduce it by using follow way:
When raid5 array is recovering and hit a fresh badblock, then shutdown array at once.
[ 480.850203] Not tainted 3.11.0-next-20130906+ #4
[ 480.852344] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 480.854380] [<ffffffff8153b324>] md_do_sync+0x7e4/0xe60
[ 480.854386] [<ffffffff81747fcb>] ? _raw_spin_unlock_irq+0x2b/0x40
[ 480.854395] [<ffffffff81536fa0>] ? md_unregister_thread+0x90/0x90
[ 480.854400] [<ffffffff810a716d>] ? trace_hardirqs_on+0xd/0x10
[ 480.854405] [<ffffffff815370bf>] md_thread+0x11f/0x170
[ 480.854410] [<ffffffff81536fa0>] ? md_unregister_thread+0x90/0x90
[ 480.854415] [<ffffffff8106d956>] kthread+0xd6/0xe0
[ 480.854423] [<ffffffff8106d880>] ? __init_kthread_worker+0x70/0x70
[ 480.854428] [<ffffffff8174ff2c>] ret_from_fork+0x7c/0xb0
[ 480.854432] [<ffffffff8106d880>] ? __init_kthread_worker+0x70/0x70
[ 480.854435] no locks held by md0_resync/3246.
[ 480.854437] INFO: task mdadm:3257 blocked for more than 120 seconds.
[ 480.854438] Not tainted 3.11.0-next-20130906+ #4
[ 480.854439] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 480.854442] mdadm D 0000000000000000 5024 3257 3209 0x00000080
[ 480.854445] ffff880138c37b18 0000000000000046 00000000ffffffff ffff880037d3b120
[ 480.854447] ffff88013a038720 ffff88013a038000 0000000000013500 ffff880138c37fd8
[ 480.854449] ffff880138c36010 0000000000013500 0000000000013500 ffff880138c37fd8
[ 480.854449] Call Trace:
[ 480.854452] [<ffffffff81745fa4>] schedule+0x24/0x70
[ 480.854453] [<ffffffff81742725>] schedule_timeout+0x175/0x200
[ 480.854455] [<ffffffff810a6d30>] ? mark_held_locks+0x80/0x130
[ 480.854457] [<ffffffff81747fcb>] ? _raw_spin_unlock_irq+0x2b/0x40
[ 480.854459] [<ffffffff810a709d>] ? trace_hardirqs_on_caller+0xfd/0x1c0
[ 480.854461] [<ffffffff810a716d>] ? trace_hardirqs_on+0xd/0x10
[ 480.854463] [<ffffffff81746600>] wait_for_completion+0xa0/0x110
[ 480.854465] [<ffffffff8107d160>] ? try_to_wake_up+0x300/0x300
[ 480.854467] [<ffffffff8106ddbc>] kthread_stop+0x4c/0xe0
[ 480.854468] [<ffffffff81536f5e>] md_unregister_thread+0x4e/0x90
[ 480.854470] [<ffffffff8153965d>] md_reap_sync_thread+0x1d/0x140
[ 480.854472] [<ffffffff815397ab>] __md_stop_writes+0x2b/0x80
[ 480.854473] [<ffffffff8153c911>] do_md_stop+0x91/0x4d0
[ 480.854475] [<ffffffff8153e8a7>] ? md_ioctl+0xf7/0x15c0
[ 480.854477] [<ffffffff810a716d>] ? trace_hardirqs_on+0xd/0x10
[ 480.854479] [<ffffffff8153f6a9>] md_ioctl+0xef9/0x15c0
[ 480.854481] [<ffffffff8113145d>] ? handle_mm_fault+0x17d/0x920
Signed-off-by: Bian Yu <[email protected]>
---
drivers/md/md.c | 2 ++
drivers/md/md.h | 4 ++++
drivers/md/raid5.c | 3 ++-
3 files changed, 8 insertions(+), 1 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index adf4d7e..54ef71f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5278,6 +5278,7 @@ static void md_clean(struct mddev *mddev)
static void __md_stop_writes(struct mddev *mddev)
{
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ set_bit(MD_STOPPING_WRITES, &mddev->flags);
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
@@ -5294,6 +5295,7 @@ static void __md_stop_writes(struct mddev *mddev)
mddev->in_sync = 1;
md_update_sb(mddev, 1);
}
+ clear_bit(MD_STOPPING_WRITES, &mddev->flags);
}
void md_stop_writes(struct mddev *mddev)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 608050c..a24ae1d 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -214,6 +214,10 @@ struct mddev {
#define MD_STILL_CLOSED 4 /* If set, then array has not been opened since
* md_ioctl checked on it.
*/
+#define MD_STOPPING_WRITES 5 /* If set, raid5 shouldn't set unacknowledged
+ * badblock blocked in analyse_stripe to avoid
+ * infinite loop.
+ */
int suspended;
atomic_t active_io;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f9972e2..ff1aecf 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3446,7 +3446,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (rdev) {
is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
&first_bad, &bad_sectors);
- if (s->blocked_rdev == NULL
+ if (!test_bit(MD_STOPPING_WRITES, &conf->mddev->flags)
+ && s->blocked_rdev == NULL
&& (test_bit(Blocked, &rdev->flags)
|| is_bad < 0)) {
if (is_bad < 0)
--
1.7.1
On Mon, 2 Sep 2013 01:09:55 -0400 [email protected] wrote:
> From: Bian Yu <[email protected]>
>
> When raid5 hit a fresh badblock, this badblock will flagged as unack
> badblock until md_update_sb is called.
> But md_stop/reboot/md_set_readonly will avoid raid5d call md_update_sb
> in md_check_recovery, the badblock will always be unack, so raid5d
> thread enter a infinite loop and never can unregister sync_thread
> that cause deadlock.
>
> To solve this, before md_stop_writes call md_unregister_thread, set
> MD_STOPPING_WRITES on mddev->flags. In raid5.c analyse_stripe judge
> MD_STOPPING_WRITES bit on mddev->flags, if setted don't block rdev
> to wait md_update_sb. so raid5d thread can be stopped.
>
> I can reproduce it by using follow way:
> When raid5 array is recovering and hit a fresh badblock, then shutdown array at once.
>
> [ 480.850203] Not tainted 3.11.0-next-20130906+ #4
> [ 480.852344] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 480.854380] [<ffffffff8153b324>] md_do_sync+0x7e4/0xe60
> [ 480.854386] [<ffffffff81747fcb>] ? _raw_spin_unlock_irq+0x2b/0x40
> [ 480.854395] [<ffffffff81536fa0>] ? md_unregister_thread+0x90/0x90
> [ 480.854400] [<ffffffff810a716d>] ? trace_hardirqs_on+0xd/0x10
> [ 480.854405] [<ffffffff815370bf>] md_thread+0x11f/0x170
> [ 480.854410] [<ffffffff81536fa0>] ? md_unregister_thread+0x90/0x90
> [ 480.854415] [<ffffffff8106d956>] kthread+0xd6/0xe0
> [ 480.854423] [<ffffffff8106d880>] ? __init_kthread_worker+0x70/0x70
> [ 480.854428] [<ffffffff8174ff2c>] ret_from_fork+0x7c/0xb0
> [ 480.854432] [<ffffffff8106d880>] ? __init_kthread_worker+0x70/0x70
> [ 480.854435] no locks held by md0_resync/3246.
> [ 480.854437] INFO: task mdadm:3257 blocked for more than 120 seconds.
> [ 480.854438] Not tainted 3.11.0-next-20130906+ #4
> [ 480.854439] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 480.854442] mdadm D 0000000000000000 5024 3257 3209 0x00000080
> [ 480.854445] ffff880138c37b18 0000000000000046 00000000ffffffff ffff880037d3b120
> [ 480.854447] ffff88013a038720 ffff88013a038000 0000000000013500 ffff880138c37fd8
> [ 480.854449] ffff880138c36010 0000000000013500 0000000000013500 ffff880138c37fd8
> [ 480.854449] Call Trace:
> [ 480.854452] [<ffffffff81745fa4>] schedule+0x24/0x70
> [ 480.854453] [<ffffffff81742725>] schedule_timeout+0x175/0x200
> [ 480.854455] [<ffffffff810a6d30>] ? mark_held_locks+0x80/0x130
> [ 480.854457] [<ffffffff81747fcb>] ? _raw_spin_unlock_irq+0x2b/0x40
> [ 480.854459] [<ffffffff810a709d>] ? trace_hardirqs_on_caller+0xfd/0x1c0
> [ 480.854461] [<ffffffff810a716d>] ? trace_hardirqs_on+0xd/0x10
> [ 480.854463] [<ffffffff81746600>] wait_for_completion+0xa0/0x110
> [ 480.854465] [<ffffffff8107d160>] ? try_to_wake_up+0x300/0x300
> [ 480.854467] [<ffffffff8106ddbc>] kthread_stop+0x4c/0xe0
> [ 480.854468] [<ffffffff81536f5e>] md_unregister_thread+0x4e/0x90
> [ 480.854470] [<ffffffff8153965d>] md_reap_sync_thread+0x1d/0x140
> [ 480.854472] [<ffffffff815397ab>] __md_stop_writes+0x2b/0x80
> [ 480.854473] [<ffffffff8153c911>] do_md_stop+0x91/0x4d0
> [ 480.854475] [<ffffffff8153e8a7>] ? md_ioctl+0xf7/0x15c0
> [ 480.854477] [<ffffffff810a716d>] ? trace_hardirqs_on+0xd/0x10
> [ 480.854479] [<ffffffff8153f6a9>] md_ioctl+0xef9/0x15c0
> [ 480.854481] [<ffffffff8113145d>] ? handle_mm_fault+0x17d/0x920
>
> Signed-off-by: Bian Yu <[email protected]>
> ---
> drivers/md/md.c | 2 ++
> drivers/md/md.h | 4 ++++
> drivers/md/raid5.c | 3 ++-
> 3 files changed, 8 insertions(+), 1 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index adf4d7e..54ef71f 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -5278,6 +5278,7 @@ static void md_clean(struct mddev *mddev)
> static void __md_stop_writes(struct mddev *mddev)
> {
> set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
> + set_bit(MD_STOPPING_WRITES, &mddev->flags);
> if (mddev->sync_thread) {
> set_bit(MD_RECOVERY_INTR, &mddev->recovery);
> md_reap_sync_thread(mddev);
> @@ -5294,6 +5295,7 @@ static void __md_stop_writes(struct mddev *mddev)
> mddev->in_sync = 1;
> md_update_sb(mddev, 1);
> }
> + clear_bit(MD_STOPPING_WRITES, &mddev->flags);
> }
>
> void md_stop_writes(struct mddev *mddev)
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 608050c..a24ae1d 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -214,6 +214,10 @@ struct mddev {
> #define MD_STILL_CLOSED 4 /* If set, then array has not been opened since
> * md_ioctl checked on it.
> */
> +#define MD_STOPPING_WRITES 5 /* If set, raid5 shouldn't set unacknowledged
> + * badblock blocked in analyse_stripe to avoid
> + * infinite loop.
> + */
>
> int suspended;
> atomic_t active_io;
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index f9972e2..ff1aecf 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -3446,7 +3446,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
> if (rdev) {
> is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
> &first_bad, &bad_sectors);
> - if (s->blocked_rdev == NULL
> + if (!test_bit(MD_STOPPING_WRITES, &conf->mddev->flags)
> + && s->blocked_rdev == NULL
> && (test_bit(Blocked, &rdev->flags)
> || is_bad < 0)) {
> if (is_bad < 0)
Thanks for including the extra details in the patch, but it wasn't only that
I didn't think it should be needed (I was wrong there). I also don't like
the patch. It isn't at all clear to me that it will do the right thing.
There is a reason that we block until the bad block lists has been updated,
and to just not block because it is inconvenient just doesn't seem right.
I would rather change the sequencing so that the badblock list can be updated
at this point.
Something like that patch below, but I'm not 100% sure of it yet and I'm
about to go on leave so I'm not sure I'll be able to commit to it for a while.
If you could review and test I would appreciate it.
Thanks,
NeilBrown
diff --git a/drivers/md/md.c b/drivers/md/md.c
index adf4d7e..e4d78c0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3170,6 +3170,9 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
return -EACCES;
rv = mddev ? mddev_lock(mddev): -EBUSY;
if (!rv) {
+ mutex_lock(&mddev->open_mutex);
+ clear_bit(MD_STILL_CLOSED, &mddev->flags);
+ mutex_unlock(&mddev->open_mutex);
if (rdev->mddev == NULL)
rv = -EBUSY;
else
@@ -4764,6 +4767,9 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
flush_workqueue(md_misc_wq);
rv = mddev_lock(mddev);
if (!rv) {
+ mutex_lock(&mddev->open_mutex);
+ clear_bit(MD_STILL_CLOSED, &mddev->flags);
+ mutex_unlock(&mddev->open_mutex);
rv = entry->store(mddev, page, length);
mddev_unlock(mddev);
}
@@ -5279,8 +5285,10 @@ static void __md_stop_writes(struct mddev *mddev)
{
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (mddev->sync_thread) {
+ mddev_unlock(&mddev);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
+ mddev_lock(&mddev);
}
del_timer_sync(&mddev->safemode_timer);
@@ -5337,7 +5345,9 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
err = -EBUSY;
goto out;
}
- if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
+ if (mddev->pers)
+ __md_stop_writes(mddev);
+ if (!test_bit(MD_STILL_CLOSED, &mddev->flags)) {
/* Someone opened the device since we flushed it
* so page cache could be dirty and it is too late
* to flush. So abort
@@ -5346,8 +5356,6 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
return -EBUSY;
}
if (mddev->pers) {
- __md_stop_writes(mddev);
-
err = -ENXIO;
if (mddev->ro==1)
goto out;
@@ -5379,7 +5387,9 @@ static int do_md_stop(struct mddev * mddev, int mode,
mutex_unlock(&mddev->open_mutex);
return -EBUSY;
}
- if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
+ if (mddev->pers)
+ __md_stop_writes(mddev);
+ if (!test_bit(MD_STILL_CLOSED, &mddev->flags)) {
/* Someone opened the device since we flushed it
* so page cache could be dirty and it is too late
* to flush. So abort
@@ -5391,7 +5401,6 @@ static int do_md_stop(struct mddev * mddev, int mode,
if (mddev->ro)
set_disk_ro(disk, 0);
- __md_stop_writes(mddev);
__md_stop(mddev);
mddev->queue->merge_bvec_fn = NULL;
mddev->queue->backing_dev_info.congested_fn = NULL;