From: Yu Kuai <[email protected]>
commit 5792a2856a63 ("[PATCH] md: avoid a deadlock when removing a device
from an md array via sysfs") delay the deleting of rdev, however, this
introduce a window that rdev can be added again while the deleting is
not done yet, and sysfs will complain about duplicate filename.
Follow up patches try to fix this problem by flush workqueue, however,
flush_rdev_wq() is just dead code, the progress in
md_kick_rdev_from_array():
1) list_del_rcu(&rdev->same_set);
2) synchronize_rcu();
3) queue_work(md_rdev_misc_wq, &rdev->del_work);
So in flush_rdev_wq(), if rdev is found in the list, work_pending() can
never pass, in the meantime, if work is queued, then rdev can never be
found in the list.
flush_rdev_wq() can be replaced by flush_workqueue() directly, however,
this approach is not good:
- the workqueue is global, this synchronization for all raid disks is
not necessary.
- flush_workqueue can't be called under 'reconfig_mutex', there is still
a small window between flush_workqueue() and mddev_lock() that other
context can queue new work, hence the problem is not solved completely.
sysfs already have apis to support delete itself through writer, and
these apis, specifically sysfs_break/unbreak_active_protection(), is used
to support deleting rdev synchronously. Therefore, the above commit can be
reverted, and sysfs duplicate filename can be avoided.
A new mdadm regression test is proposed as well.
Link: https://lore.kernel.org/linux-raid/[email protected]/
Fixes: 5792a2856a63 ("[PATCH] md: avoid a deadlock when removing a device from an md array via sysfs")
Signed-off-by: Yu Kuai <[email protected]>
---
Changes in v2:
- rebase from the latest md-next branch
drivers/md/md.c | 84 +++++++++++++++++++++++++------------------------
drivers/md/md.h | 8 +++++
2 files changed, 51 insertions(+), 41 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7455bc9d8498..cafb457d614c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -92,6 +92,7 @@ static struct workqueue_struct *md_rdev_misc_wq;
static int remove_and_add_spares(struct mddev *mddev,
struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);
+static void export_rdev(struct md_rdev *rdev);
/*
* Default number of read corrections we'll attempt on an rdev
@@ -643,9 +644,11 @@ void mddev_init(struct mddev *mddev)
{
mutex_init(&mddev->open_mutex);
mutex_init(&mddev->reconfig_mutex);
+ mutex_init(&mddev->delete_mutex);
mutex_init(&mddev->bitmap_info.mutex);
INIT_LIST_HEAD(&mddev->disks);
INIT_LIST_HEAD(&mddev->all_mddevs);
+ INIT_LIST_HEAD(&mddev->deleting);
timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
atomic_set(&mddev->active, 1);
atomic_set(&mddev->openers, 0);
@@ -747,6 +750,23 @@ static void mddev_free(struct mddev *mddev)
static const struct attribute_group md_redundancy_group;
+static void md_free_rdev(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+ struct md_rdev *tmp;
+
+ if (list_empty_careful(&mddev->deleting))
+ return;
+
+ mutex_lock(&mddev->delete_mutex);
+ list_for_each_entry_safe(rdev, tmp, &mddev->deleting, same_set) {
+ list_del_init(&rdev->same_set);
+ kobject_del(&rdev->kobj);
+ export_rdev(rdev);
+ }
+ mutex_unlock(&mddev->delete_mutex);
+}
+
void mddev_unlock(struct mddev *mddev)
{
if (mddev->to_remove) {
@@ -788,6 +808,8 @@ void mddev_unlock(struct mddev *mddev)
} else
mutex_unlock(&mddev->reconfig_mutex);
+ md_free_rdev(mddev);
+
/* As we've dropped the mutex we need a spinlock to
* make sure the thread doesn't disappear
*/
@@ -2428,13 +2450,6 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
return err;
}
-static void rdev_delayed_delete(struct work_struct *ws)
-{
- struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
- kobject_del(&rdev->kobj);
- kobject_put(&rdev->kobj);
-}
-
void md_autodetect_dev(dev_t dev);
static void export_rdev(struct md_rdev *rdev)
@@ -2452,6 +2467,8 @@ static void export_rdev(struct md_rdev *rdev)
static void md_kick_rdev_from_array(struct md_rdev *rdev)
{
+ struct mddev *mddev = rdev->mddev;
+
bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
list_del_rcu(&rdev->same_set);
pr_debug("md: unbind<%pg>\n", rdev->bdev);
@@ -2465,15 +2482,17 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev)
rdev->sysfs_unack_badblocks = NULL;
rdev->sysfs_badblocks = NULL;
rdev->badblocks.count = 0;
- /* We need to delay this, otherwise we can deadlock when
- * writing to 'remove' to "dev/state". We also need
- * to delay it due to rcu usage.
- */
+
synchronize_rcu();
- INIT_WORK(&rdev->del_work, rdev_delayed_delete);
- kobject_get(&rdev->kobj);
- queue_work(md_rdev_misc_wq, &rdev->del_work);
- export_rdev(rdev);
+
+ /*
+ * kobject_del() will wait for all in progress writers to be done, where
+ * reconfig_mutex is held, hence it can't be called under
+ * reconfig_mutex and it's delayed to mddev_unlock().
+ */
+ mutex_lock(&mddev->delete_mutex);
+ list_add(&rdev->same_set, &mddev->deleting);
+ mutex_unlock(&mddev->delete_mutex);
}
static void export_array(struct mddev *mddev)
@@ -3541,6 +3560,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
{
struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
+ struct kernfs_node *kn = NULL;
ssize_t rv;
struct mddev *mddev = rdev->mddev;
@@ -3548,6 +3568,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
return -EIO;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
+
+ if (entry->store == state_store && cmd_match(page, "remove"))
+ kn = sysfs_break_active_protection(kobj, attr);
+
rv = mddev ? mddev_lock(mddev) : -ENODEV;
if (!rv) {
if (rdev->mddev == NULL)
@@ -3556,6 +3580,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
rv = entry->store(rdev, page, length);
mddev_unlock(mddev);
}
+
+ if (kn)
+ sysfs_unbreak_active_protection(kn);
+
return rv;
}
@@ -4479,20 +4507,6 @@ null_show(struct mddev *mddev, char *page)
return -EINVAL;
}
-/* need to ensure rdev_delayed_delete() has completed */
-static void flush_rdev_wq(struct mddev *mddev)
-{
- struct md_rdev *rdev;
-
- rcu_read_lock();
- rdev_for_each_rcu(rdev, mddev)
- if (work_pending(&rdev->del_work)) {
- flush_workqueue(md_rdev_misc_wq);
- break;
- }
- rcu_read_unlock();
-}
-
static ssize_t
new_dev_store(struct mddev *mddev, const char *buf, size_t len)
{
@@ -4520,7 +4534,6 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
minor != MINOR(dev))
return -EOVERFLOW;
- flush_rdev_wq(mddev);
err = mddev_lock(mddev);
if (err)
return err;
@@ -5590,7 +5603,6 @@ struct mddev *md_alloc(dev_t dev, char *name)
* removed (mddev_delayed_delete).
*/
flush_workqueue(md_misc_wq);
- flush_workqueue(md_rdev_misc_wq);
mutex_lock(&disks_mutex);
mddev = mddev_alloc(dev);
@@ -7553,9 +7565,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
}
- if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK)
- flush_rdev_wq(mddev);
-
if (cmd == HOT_REMOVE_DISK)
/* need to ensure recovery thread has run */
wait_event_interruptible_timeout(mddev->sb_wait,
@@ -9618,10 +9627,6 @@ static int __init md_init(void)
if (!md_misc_wq)
goto err_misc_wq;
- md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0);
- if (!md_rdev_misc_wq)
- goto err_rdev_misc_wq;
-
ret = __register_blkdev(MD_MAJOR, "md", md_probe);
if (ret < 0)
goto err_md;
@@ -9640,8 +9645,6 @@ static int __init md_init(void)
err_mdp:
unregister_blkdev(MD_MAJOR, "md");
err_md:
- destroy_workqueue(md_rdev_misc_wq);
-err_rdev_misc_wq:
destroy_workqueue(md_misc_wq);
err_misc_wq:
destroy_workqueue(md_wq);
@@ -9937,7 +9940,6 @@ static __exit void md_exit(void)
}
spin_unlock(&all_mddevs_lock);
- destroy_workqueue(md_rdev_misc_wq);
destroy_workqueue(md_misc_wq);
destroy_workqueue(md_wq);
}
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1eec65cf783c..4d191db831da 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -531,6 +531,14 @@ struct mddev {
unsigned int good_device_nr; /* good device num within cluster raid */
unsigned int noio_flag; /* for memalloc scope API */
+ /*
+ * Temporarily store rdev that will be finally removed when
+ * reconfig_mutex is unlocked.
+ */
+ struct list_head deleting;
+ /* Protect the deleting list */
+ struct mutex delete_mutex;
+
bool has_superblocks:1;
bool fail_last_dev:1;
bool serialize_policy:1;
--
2.39.2
Dear Yu,
Thank you for your patch. Just a few nits, that can be ignored.
Am 22.05.23 um 15:32 schrieb Yu Kuai:
> From: Yu Kuai <[email protected]>
>
> commit 5792a2856a63 ("[PATCH] md: avoid a deadlock when removing a device
I’d start with capital letter: Commit ….
> from an md array via sysfs") delay the deleting of rdev, however, this
1. delay*s*
2. s/deleting/deletion/
> introduce a window that rdev can be added again while the deleting is
1. introduce*s*
2. s/deleting/deletion/
> not done yet, and sysfs will complain about duplicate filename.
>
> Follow up patches try to fix this problem by flush workqueue, however,
flush*ing* the work queue
> flush_rdev_wq() is just dead code, the progress in
> md_kick_rdev_from_array():
… is:
> 1) list_del_rcu(&rdev->same_set);
> 2) synchronize_rcu();
> 3) queue_work(md_rdev_misc_wq, &rdev->del_work);
>
> So in flush_rdev_wq(), if rdev is found in the list, work_pending() can
> never pass, in the meantime, if work is queued, then rdev can never be
> found in the list.
>
> flush_rdev_wq() can be replaced by flush_workqueue() directly, however,
> this approach is not good:
> - the workqueue is global, this synchronization for all raid disks is
> not necessary.
The work queue …
> - flush_workqueue can't be called under 'reconfig_mutex', there is still
> a small window between flush_workqueue() and mddev_lock() that other
> context can queue new work, hence the problem is not solved completely.
context*s*
> sysfs already have apis to support delete itself through writer, and
1. s/have/has/
2. deleting
> these apis, specifically sysfs_break/unbreak_active_protection(), is used
s/is/are/
> to support deleting rdev synchronously. Therefore, the above commit can be
> reverted, and sysfs duplicate filename can be avoided.
>
> A new mdadm regression test is proposed as well.
It’s not included, right? Then I’d remove the sentence, or write: … is
going to be proposed …
Kind regards,
Paul
> Link: https://lore.kernel.org/linux-raid/[email protected]/
> Fixes: 5792a2856a63 ("[PATCH] md: avoid a deadlock when removing a device from an md array via sysfs")
> Signed-off-by: Yu Kuai <[email protected]>
> ---
> Changes in v2:
> - rebase from the latest md-next branch
>
> drivers/md/md.c | 84 +++++++++++++++++++++++++------------------------
> drivers/md/md.h | 8 +++++
> 2 files changed, 51 insertions(+), 41 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 7455bc9d8498..cafb457d614c 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -92,6 +92,7 @@ static struct workqueue_struct *md_rdev_misc_wq;
> static int remove_and_add_spares(struct mddev *mddev,
> struct md_rdev *this);
> static void mddev_detach(struct mddev *mddev);
> +static void export_rdev(struct md_rdev *rdev);
>
> /*
> * Default number of read corrections we'll attempt on an rdev
> @@ -643,9 +644,11 @@ void mddev_init(struct mddev *mddev)
> {
> mutex_init(&mddev->open_mutex);
> mutex_init(&mddev->reconfig_mutex);
> + mutex_init(&mddev->delete_mutex);
> mutex_init(&mddev->bitmap_info.mutex);
> INIT_LIST_HEAD(&mddev->disks);
> INIT_LIST_HEAD(&mddev->all_mddevs);
> + INIT_LIST_HEAD(&mddev->deleting);
> timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
> atomic_set(&mddev->active, 1);
> atomic_set(&mddev->openers, 0);
> @@ -747,6 +750,23 @@ static void mddev_free(struct mddev *mddev)
>
> static const struct attribute_group md_redundancy_group;
>
> +static void md_free_rdev(struct mddev *mddev)
> +{
> + struct md_rdev *rdev;
> + struct md_rdev *tmp;
> +
> + if (list_empty_careful(&mddev->deleting))
> + return;
> +
> + mutex_lock(&mddev->delete_mutex);
> + list_for_each_entry_safe(rdev, tmp, &mddev->deleting, same_set) {
> + list_del_init(&rdev->same_set);
> + kobject_del(&rdev->kobj);
> + export_rdev(rdev);
> + }
> + mutex_unlock(&mddev->delete_mutex);
> +}
> +
> void mddev_unlock(struct mddev *mddev)
> {
> if (mddev->to_remove) {
> @@ -788,6 +808,8 @@ void mddev_unlock(struct mddev *mddev)
> } else
> mutex_unlock(&mddev->reconfig_mutex);
>
> + md_free_rdev(mddev);
> +
> /* As we've dropped the mutex we need a spinlock to
> * make sure the thread doesn't disappear
> */
> @@ -2428,13 +2450,6 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
> return err;
> }
>
> -static void rdev_delayed_delete(struct work_struct *ws)
> -{
> - struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
> - kobject_del(&rdev->kobj);
> - kobject_put(&rdev->kobj);
> -}
> -
> void md_autodetect_dev(dev_t dev);
>
> static void export_rdev(struct md_rdev *rdev)
> @@ -2452,6 +2467,8 @@ static void export_rdev(struct md_rdev *rdev)
>
> static void md_kick_rdev_from_array(struct md_rdev *rdev)
> {
> + struct mddev *mddev = rdev->mddev;
> +
> bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
> list_del_rcu(&rdev->same_set);
> pr_debug("md: unbind<%pg>\n", rdev->bdev);
> @@ -2465,15 +2482,17 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev)
> rdev->sysfs_unack_badblocks = NULL;
> rdev->sysfs_badblocks = NULL;
> rdev->badblocks.count = 0;
> - /* We need to delay this, otherwise we can deadlock when
> - * writing to 'remove' to "dev/state". We also need
> - * to delay it due to rcu usage.
> - */
> +
> synchronize_rcu();
> - INIT_WORK(&rdev->del_work, rdev_delayed_delete);
> - kobject_get(&rdev->kobj);
> - queue_work(md_rdev_misc_wq, &rdev->del_work);
> - export_rdev(rdev);
> +
> + /*
> + * kobject_del() will wait for all in progress writers to be done, where
> + * reconfig_mutex is held, hence it can't be called under
> + * reconfig_mutex and it's delayed to mddev_unlock().
> + */
> + mutex_lock(&mddev->delete_mutex);
> + list_add(&rdev->same_set, &mddev->deleting);
> + mutex_unlock(&mddev->delete_mutex);
> }
>
> static void export_array(struct mddev *mddev)
> @@ -3541,6 +3560,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
> {
> struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
> struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
> + struct kernfs_node *kn = NULL;
> ssize_t rv;
> struct mddev *mddev = rdev->mddev;
>
> @@ -3548,6 +3568,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
> return -EIO;
> if (!capable(CAP_SYS_ADMIN))
> return -EACCES;
> +
> + if (entry->store == state_store && cmd_match(page, "remove"))
> + kn = sysfs_break_active_protection(kobj, attr);
> +
> rv = mddev ? mddev_lock(mddev) : -ENODEV;
> if (!rv) {
> if (rdev->mddev == NULL)
> @@ -3556,6 +3580,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
> rv = entry->store(rdev, page, length);
> mddev_unlock(mddev);
> }
> +
> + if (kn)
> + sysfs_unbreak_active_protection(kn);
> +
> return rv;
> }
>
> @@ -4479,20 +4507,6 @@ null_show(struct mddev *mddev, char *page)
> return -EINVAL;
> }
>
> -/* need to ensure rdev_delayed_delete() has completed */
> -static void flush_rdev_wq(struct mddev *mddev)
> -{
> - struct md_rdev *rdev;
> -
> - rcu_read_lock();
> - rdev_for_each_rcu(rdev, mddev)
> - if (work_pending(&rdev->del_work)) {
> - flush_workqueue(md_rdev_misc_wq);
> - break;
> - }
> - rcu_read_unlock();
> -}
> -
> static ssize_t
> new_dev_store(struct mddev *mddev, const char *buf, size_t len)
> {
> @@ -4520,7 +4534,6 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
> minor != MINOR(dev))
> return -EOVERFLOW;
>
> - flush_rdev_wq(mddev);
> err = mddev_lock(mddev);
> if (err)
> return err;
> @@ -5590,7 +5603,6 @@ struct mddev *md_alloc(dev_t dev, char *name)
> * removed (mddev_delayed_delete).
> */
> flush_workqueue(md_misc_wq);
> - flush_workqueue(md_rdev_misc_wq);
>
> mutex_lock(&disks_mutex);
> mddev = mddev_alloc(dev);
> @@ -7553,9 +7565,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
>
> }
>
> - if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK)
> - flush_rdev_wq(mddev);
> -
> if (cmd == HOT_REMOVE_DISK)
> /* need to ensure recovery thread has run */
> wait_event_interruptible_timeout(mddev->sb_wait,
> @@ -9618,10 +9627,6 @@ static int __init md_init(void)
> if (!md_misc_wq)
> goto err_misc_wq;
>
> - md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0);
> - if (!md_rdev_misc_wq)
> - goto err_rdev_misc_wq;
> -
> ret = __register_blkdev(MD_MAJOR, "md", md_probe);
> if (ret < 0)
> goto err_md;
> @@ -9640,8 +9645,6 @@ static int __init md_init(void)
> err_mdp:
> unregister_blkdev(MD_MAJOR, "md");
> err_md:
> - destroy_workqueue(md_rdev_misc_wq);
> -err_rdev_misc_wq:
> destroy_workqueue(md_misc_wq);
> err_misc_wq:
> destroy_workqueue(md_wq);
> @@ -9937,7 +9940,6 @@ static __exit void md_exit(void)
> }
> spin_unlock(&all_mddevs_lock);
>
> - destroy_workqueue(md_rdev_misc_wq);
> destroy_workqueue(md_misc_wq);
> destroy_workqueue(md_wq);
> }
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 1eec65cf783c..4d191db831da 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -531,6 +531,14 @@ struct mddev {
> unsigned int good_device_nr; /* good device num within cluster raid */
> unsigned int noio_flag; /* for memalloc scope API */
>
> + /*
> + * Temporarily store rdev that will be finally removed when
> + * reconfig_mutex is unlocked.
> + */
> + struct list_head deleting;
> + /* Protect the deleting list */
> + struct mutex delete_mutex;
> +
> bool has_superblocks:1;
> bool fail_last_dev:1;
> bool serialize_policy:1;
On Mon, May 22, 2023 at 6:35 AM Yu Kuai <[email protected]> wrote:
>
> From: Yu Kuai <[email protected]>
>
> commit 5792a2856a63 ("[PATCH] md: avoid a deadlock when removing a device
> from an md array via sysfs") delay the deleting of rdev, however, this
> introduce a window that rdev can be added again while the deleting is
> not done yet, and sysfs will complain about duplicate filename.
>
> Follow up patches try to fix this problem by flush workqueue, however,
> flush_rdev_wq() is just dead code, the progress in
> md_kick_rdev_from_array():
>
> 1) list_del_rcu(&rdev->same_set);
> 2) synchronize_rcu();
> 3) queue_work(md_rdev_misc_wq, &rdev->del_work);
>
> So in flush_rdev_wq(), if rdev is found in the list, work_pending() can
> never pass, in the meantime, if work is queued, then rdev can never be
> found in the list.
>
> flush_rdev_wq() can be replaced by flush_workqueue() directly, however,
> this approach is not good:
> - the workqueue is global, this synchronization for all raid disks is
> not necessary.
> - flush_workqueue can't be called under 'reconfig_mutex', there is still
> a small window between flush_workqueue() and mddev_lock() that other
> context can queue new work, hence the problem is not solved completely.
>
> sysfs already have apis to support delete itself through writer, and
> these apis, specifically sysfs_break/unbreak_active_protection(), is used
> to support deleting rdev synchronously. Therefore, the above commit can be
> reverted, and sysfs duplicate filename can be avoided.
>
> A new mdadm regression test is proposed as well.
>
> Link: https://lore.kernel.org/linux-raid/[email protected]/
> Fixes: 5792a2856a63 ("[PATCH] md: avoid a deadlock when removing a device from an md array via sysfs")
> Signed-off-by: Yu Kuai <[email protected]>
We got build time warning with this patch:
drivers/md/md.c:90:33: warning: ‘md_rdev_misc_wq’ defined but not used
[-Wunused-variable]
static struct workqueue_struct *md_rdev_misc_wq;
^~~~~~~~~~~~~~~
Thanks,
Song
> ---
> Changes in v2:
> - rebase from the latest md-next branch
>
> drivers/md/md.c | 84 +++++++++++++++++++++++++------------------------
> drivers/md/md.h | 8 +++++
> 2 files changed, 51 insertions(+), 41 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 7455bc9d8498..cafb457d614c 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -92,6 +92,7 @@ static struct workqueue_struct *md_rdev_misc_wq;
> static int remove_and_add_spares(struct mddev *mddev,
> struct md_rdev *this);
> static void mddev_detach(struct mddev *mddev);
> +static void export_rdev(struct md_rdev *rdev);
>
> /*
> * Default number of read corrections we'll attempt on an rdev
> @@ -643,9 +644,11 @@ void mddev_init(struct mddev *mddev)
> {
> mutex_init(&mddev->open_mutex);
> mutex_init(&mddev->reconfig_mutex);
> + mutex_init(&mddev->delete_mutex);
> mutex_init(&mddev->bitmap_info.mutex);
> INIT_LIST_HEAD(&mddev->disks);
> INIT_LIST_HEAD(&mddev->all_mddevs);
> + INIT_LIST_HEAD(&mddev->deleting);
> timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
> atomic_set(&mddev->active, 1);
> atomic_set(&mddev->openers, 0);
> @@ -747,6 +750,23 @@ static void mddev_free(struct mddev *mddev)
>
> static const struct attribute_group md_redundancy_group;
>
> +static void md_free_rdev(struct mddev *mddev)
> +{
> + struct md_rdev *rdev;
> + struct md_rdev *tmp;
> +
> + if (list_empty_careful(&mddev->deleting))
> + return;
> +
> + mutex_lock(&mddev->delete_mutex);
> + list_for_each_entry_safe(rdev, tmp, &mddev->deleting, same_set) {
> + list_del_init(&rdev->same_set);
> + kobject_del(&rdev->kobj);
> + export_rdev(rdev);
> + }
> + mutex_unlock(&mddev->delete_mutex);
> +}
> +
> void mddev_unlock(struct mddev *mddev)
> {
> if (mddev->to_remove) {
> @@ -788,6 +808,8 @@ void mddev_unlock(struct mddev *mddev)
> } else
> mutex_unlock(&mddev->reconfig_mutex);
>
> + md_free_rdev(mddev);
> +
> /* As we've dropped the mutex we need a spinlock to
> * make sure the thread doesn't disappear
> */
> @@ -2428,13 +2450,6 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
> return err;
> }
>
> -static void rdev_delayed_delete(struct work_struct *ws)
> -{
> - struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
> - kobject_del(&rdev->kobj);
> - kobject_put(&rdev->kobj);
> -}
> -
> void md_autodetect_dev(dev_t dev);
>
> static void export_rdev(struct md_rdev *rdev)
> @@ -2452,6 +2467,8 @@ static void export_rdev(struct md_rdev *rdev)
>
> static void md_kick_rdev_from_array(struct md_rdev *rdev)
> {
> + struct mddev *mddev = rdev->mddev;
> +
> bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
> list_del_rcu(&rdev->same_set);
> pr_debug("md: unbind<%pg>\n", rdev->bdev);
> @@ -2465,15 +2482,17 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev)
> rdev->sysfs_unack_badblocks = NULL;
> rdev->sysfs_badblocks = NULL;
> rdev->badblocks.count = 0;
> - /* We need to delay this, otherwise we can deadlock when
> - * writing to 'remove' to "dev/state". We also need
> - * to delay it due to rcu usage.
> - */
> +
> synchronize_rcu();
> - INIT_WORK(&rdev->del_work, rdev_delayed_delete);
> - kobject_get(&rdev->kobj);
> - queue_work(md_rdev_misc_wq, &rdev->del_work);
> - export_rdev(rdev);
> +
> + /*
> + * kobject_del() will wait for all in progress writers to be done, where
> + * reconfig_mutex is held, hence it can't be called under
> + * reconfig_mutex and it's delayed to mddev_unlock().
> + */
> + mutex_lock(&mddev->delete_mutex);
> + list_add(&rdev->same_set, &mddev->deleting);
> + mutex_unlock(&mddev->delete_mutex);
> }
>
> static void export_array(struct mddev *mddev)
> @@ -3541,6 +3560,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
> {
> struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
> struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
> + struct kernfs_node *kn = NULL;
> ssize_t rv;
> struct mddev *mddev = rdev->mddev;
>
> @@ -3548,6 +3568,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
> return -EIO;
> if (!capable(CAP_SYS_ADMIN))
> return -EACCES;
> +
> + if (entry->store == state_store && cmd_match(page, "remove"))
> + kn = sysfs_break_active_protection(kobj, attr);
> +
> rv = mddev ? mddev_lock(mddev) : -ENODEV;
> if (!rv) {
> if (rdev->mddev == NULL)
> @@ -3556,6 +3580,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
> rv = entry->store(rdev, page, length);
> mddev_unlock(mddev);
> }
> +
> + if (kn)
> + sysfs_unbreak_active_protection(kn);
> +
> return rv;
> }
>
> @@ -4479,20 +4507,6 @@ null_show(struct mddev *mddev, char *page)
> return -EINVAL;
> }
>
> -/* need to ensure rdev_delayed_delete() has completed */
> -static void flush_rdev_wq(struct mddev *mddev)
> -{
> - struct md_rdev *rdev;
> -
> - rcu_read_lock();
> - rdev_for_each_rcu(rdev, mddev)
> - if (work_pending(&rdev->del_work)) {
> - flush_workqueue(md_rdev_misc_wq);
> - break;
> - }
> - rcu_read_unlock();
> -}
> -
> static ssize_t
> new_dev_store(struct mddev *mddev, const char *buf, size_t len)
> {
> @@ -4520,7 +4534,6 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
> minor != MINOR(dev))
> return -EOVERFLOW;
>
> - flush_rdev_wq(mddev);
> err = mddev_lock(mddev);
> if (err)
> return err;
> @@ -5590,7 +5603,6 @@ struct mddev *md_alloc(dev_t dev, char *name)
> * removed (mddev_delayed_delete).
> */
> flush_workqueue(md_misc_wq);
> - flush_workqueue(md_rdev_misc_wq);
>
> mutex_lock(&disks_mutex);
> mddev = mddev_alloc(dev);
> @@ -7553,9 +7565,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
>
> }
>
> - if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK)
> - flush_rdev_wq(mddev);
> -
> if (cmd == HOT_REMOVE_DISK)
> /* need to ensure recovery thread has run */
> wait_event_interruptible_timeout(mddev->sb_wait,
> @@ -9618,10 +9627,6 @@ static int __init md_init(void)
> if (!md_misc_wq)
> goto err_misc_wq;
>
> - md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0);
> - if (!md_rdev_misc_wq)
> - goto err_rdev_misc_wq;
> -
> ret = __register_blkdev(MD_MAJOR, "md", md_probe);
> if (ret < 0)
> goto err_md;
> @@ -9640,8 +9645,6 @@ static int __init md_init(void)
> err_mdp:
> unregister_blkdev(MD_MAJOR, "md");
> err_md:
> - destroy_workqueue(md_rdev_misc_wq);
> -err_rdev_misc_wq:
> destroy_workqueue(md_misc_wq);
> err_misc_wq:
> destroy_workqueue(md_wq);
> @@ -9937,7 +9940,6 @@ static __exit void md_exit(void)
> }
> spin_unlock(&all_mddevs_lock);
>
> - destroy_workqueue(md_rdev_misc_wq);
> destroy_workqueue(md_misc_wq);
> destroy_workqueue(md_wq);
> }
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 1eec65cf783c..4d191db831da 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -531,6 +531,14 @@ struct mddev {
> unsigned int good_device_nr; /* good device num within cluster raid */
> unsigned int noio_flag; /* for memalloc scope API */
>
> + /*
> + * Temporarily store rdev that will be finally removed when
> + * reconfig_mutex is unlocked.
> + */
> + struct list_head deleting;
> + /* Protect the deleting list */
> + struct mutex delete_mutex;
> +
> bool has_superblocks:1;
> bool fail_last_dev:1;
> bool serialize_policy:1;
> --
> 2.39.2
>
Hi,
在 2023/05/23 3:02, Song Liu 写道:
> On Mon, May 22, 2023 at 6:35 AM Yu Kuai <[email protected]> wrote:
>>
>> From: Yu Kuai <[email protected]>
>>
>> commit 5792a2856a63 ("[PATCH] md: avoid a deadlock when removing a device
>> from an md array via sysfs") delay the deleting of rdev, however, this
>> introduce a window that rdev can be added again while the deleting is
>> not done yet, and sysfs will complain about duplicate filename.
>>
>> Follow up patches try to fix this problem by flush workqueue, however,
>> flush_rdev_wq() is just dead code, the progress in
>> md_kick_rdev_from_array():
>>
>> 1) list_del_rcu(&rdev->same_set);
>> 2) synchronize_rcu();
>> 3) queue_work(md_rdev_misc_wq, &rdev->del_work);
>>
>> So in flush_rdev_wq(), if rdev is found in the list, work_pending() can
>> never pass, in the meantime, if work is queued, then rdev can never be
>> found in the list.
>>
>> flush_rdev_wq() can be replaced by flush_workqueue() directly, however,
>> this approach is not good:
>> - the workqueue is global, this synchronization for all raid disks is
>> not necessary.
>> - flush_workqueue can't be called under 'reconfig_mutex', there is still
>> a small window between flush_workqueue() and mddev_lock() that other
>> context can queue new work, hence the problem is not solved completely.
>>
>> sysfs already have apis to support delete itself through writer, and
>> these apis, specifically sysfs_break/unbreak_active_protection(), is used
>> to support deleting rdev synchronously. Therefore, the above commit can be
>> reverted, and sysfs duplicate filename can be avoided.
>>
>> A new mdadm regression test is proposed as well.
>>
>> Link: https://lore.kernel.org/linux-raid/[email protected]/
>> Fixes: 5792a2856a63 ("[PATCH] md: avoid a deadlock when removing a device from an md array via sysfs")
>> Signed-off-by: Yu Kuai <[email protected]>
>
> We got build time warning with this patch:
>
> drivers/md/md.c:90:33: warning: ‘md_rdev_misc_wq’ defined but not used
> [-Wunused-variable]
> static struct workqueue_struct *md_rdev_misc_wq;
Will fix this in the next version,
Thanks,
Kuai
> ^~~~~~~~~~~~~~~
>
> Thanks,
> Song
>
>> ---
>> Changes in v2:
>> - rebase from the latest md-next branch
>>
>> drivers/md/md.c | 84 +++++++++++++++++++++++++------------------------
>> drivers/md/md.h | 8 +++++
>> 2 files changed, 51 insertions(+), 41 deletions(-)
>>
>> diff --git a/drivers/md/md.c b/drivers/md/md.c
>> index 7455bc9d8498..cafb457d614c 100644
>> --- a/drivers/md/md.c
>> +++ b/drivers/md/md.c
>> @@ -92,6 +92,7 @@ static struct workqueue_struct *md_rdev_misc_wq;
>> static int remove_and_add_spares(struct mddev *mddev,
>> struct md_rdev *this);
>> static void mddev_detach(struct mddev *mddev);
>> +static void export_rdev(struct md_rdev *rdev);
>>
>> /*
>> * Default number of read corrections we'll attempt on an rdev
>> @@ -643,9 +644,11 @@ void mddev_init(struct mddev *mddev)
>> {
>> mutex_init(&mddev->open_mutex);
>> mutex_init(&mddev->reconfig_mutex);
>> + mutex_init(&mddev->delete_mutex);
>> mutex_init(&mddev->bitmap_info.mutex);
>> INIT_LIST_HEAD(&mddev->disks);
>> INIT_LIST_HEAD(&mddev->all_mddevs);
>> + INIT_LIST_HEAD(&mddev->deleting);
>> timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
>> atomic_set(&mddev->active, 1);
>> atomic_set(&mddev->openers, 0);
>> @@ -747,6 +750,23 @@ static void mddev_free(struct mddev *mddev)
>>
>> static const struct attribute_group md_redundancy_group;
>>
>> +static void md_free_rdev(struct mddev *mddev)
>> +{
>> + struct md_rdev *rdev;
>> + struct md_rdev *tmp;
>> +
>> + if (list_empty_careful(&mddev->deleting))
>> + return;
>> +
>> + mutex_lock(&mddev->delete_mutex);
>> + list_for_each_entry_safe(rdev, tmp, &mddev->deleting, same_set) {
>> + list_del_init(&rdev->same_set);
>> + kobject_del(&rdev->kobj);
>> + export_rdev(rdev);
>> + }
>> + mutex_unlock(&mddev->delete_mutex);
>> +}
>> +
>> void mddev_unlock(struct mddev *mddev)
>> {
>> if (mddev->to_remove) {
>> @@ -788,6 +808,8 @@ void mddev_unlock(struct mddev *mddev)
>> } else
>> mutex_unlock(&mddev->reconfig_mutex);
>>
>> + md_free_rdev(mddev);
>> +
>> /* As we've dropped the mutex we need a spinlock to
>> * make sure the thread doesn't disappear
>> */
>> @@ -2428,13 +2450,6 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
>> return err;
>> }
>>
>> -static void rdev_delayed_delete(struct work_struct *ws)
>> -{
>> - struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
>> - kobject_del(&rdev->kobj);
>> - kobject_put(&rdev->kobj);
>> -}
>> -
>> void md_autodetect_dev(dev_t dev);
>>
>> static void export_rdev(struct md_rdev *rdev)
>> @@ -2452,6 +2467,8 @@ static void export_rdev(struct md_rdev *rdev)
>>
>> static void md_kick_rdev_from_array(struct md_rdev *rdev)
>> {
>> + struct mddev *mddev = rdev->mddev;
>> +
>> bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
>> list_del_rcu(&rdev->same_set);
>> pr_debug("md: unbind<%pg>\n", rdev->bdev);
>> @@ -2465,15 +2482,17 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev)
>> rdev->sysfs_unack_badblocks = NULL;
>> rdev->sysfs_badblocks = NULL;
>> rdev->badblocks.count = 0;
>> - /* We need to delay this, otherwise we can deadlock when
>> - * writing to 'remove' to "dev/state". We also need
>> - * to delay it due to rcu usage.
>> - */
>> +
>> synchronize_rcu();
>> - INIT_WORK(&rdev->del_work, rdev_delayed_delete);
>> - kobject_get(&rdev->kobj);
>> - queue_work(md_rdev_misc_wq, &rdev->del_work);
>> - export_rdev(rdev);
>> +
>> + /*
>> + * kobject_del() will wait for all in progress writers to be done, where
>> + * reconfig_mutex is held, hence it can't be called under
>> + * reconfig_mutex and it's delayed to mddev_unlock().
>> + */
>> + mutex_lock(&mddev->delete_mutex);
>> + list_add(&rdev->same_set, &mddev->deleting);
>> + mutex_unlock(&mddev->delete_mutex);
>> }
>>
>> static void export_array(struct mddev *mddev)
>> @@ -3541,6 +3560,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
>> {
>> struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
>> struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
>> + struct kernfs_node *kn = NULL;
>> ssize_t rv;
>> struct mddev *mddev = rdev->mddev;
>>
>> @@ -3548,6 +3568,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
>> return -EIO;
>> if (!capable(CAP_SYS_ADMIN))
>> return -EACCES;
>> +
>> + if (entry->store == state_store && cmd_match(page, "remove"))
>> + kn = sysfs_break_active_protection(kobj, attr);
>> +
>> rv = mddev ? mddev_lock(mddev) : -ENODEV;
>> if (!rv) {
>> if (rdev->mddev == NULL)
>> @@ -3556,6 +3580,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
>> rv = entry->store(rdev, page, length);
>> mddev_unlock(mddev);
>> }
>> +
>> + if (kn)
>> + sysfs_unbreak_active_protection(kn);
>> +
>> return rv;
>> }
>>
>> @@ -4479,20 +4507,6 @@ null_show(struct mddev *mddev, char *page)
>> return -EINVAL;
>> }
>>
>> -/* need to ensure rdev_delayed_delete() has completed */
>> -static void flush_rdev_wq(struct mddev *mddev)
>> -{
>> - struct md_rdev *rdev;
>> -
>> - rcu_read_lock();
>> - rdev_for_each_rcu(rdev, mddev)
>> - if (work_pending(&rdev->del_work)) {
>> - flush_workqueue(md_rdev_misc_wq);
>> - break;
>> - }
>> - rcu_read_unlock();
>> -}
>> -
>> static ssize_t
>> new_dev_store(struct mddev *mddev, const char *buf, size_t len)
>> {
>> @@ -4520,7 +4534,6 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
>> minor != MINOR(dev))
>> return -EOVERFLOW;
>>
>> - flush_rdev_wq(mddev);
>> err = mddev_lock(mddev);
>> if (err)
>> return err;
>> @@ -5590,7 +5603,6 @@ struct mddev *md_alloc(dev_t dev, char *name)
>> * removed (mddev_delayed_delete).
>> */
>> flush_workqueue(md_misc_wq);
>> - flush_workqueue(md_rdev_misc_wq);
>>
>> mutex_lock(&disks_mutex);
>> mddev = mddev_alloc(dev);
>> @@ -7553,9 +7565,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
>>
>> }
>>
>> - if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK)
>> - flush_rdev_wq(mddev);
>> -
>> if (cmd == HOT_REMOVE_DISK)
>> /* need to ensure recovery thread has run */
>> wait_event_interruptible_timeout(mddev->sb_wait,
>> @@ -9618,10 +9627,6 @@ static int __init md_init(void)
>> if (!md_misc_wq)
>> goto err_misc_wq;
>>
>> - md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0);
>> - if (!md_rdev_misc_wq)
>> - goto err_rdev_misc_wq;
>> -
>> ret = __register_blkdev(MD_MAJOR, "md", md_probe);
>> if (ret < 0)
>> goto err_md;
>> @@ -9640,8 +9645,6 @@ static int __init md_init(void)
>> err_mdp:
>> unregister_blkdev(MD_MAJOR, "md");
>> err_md:
>> - destroy_workqueue(md_rdev_misc_wq);
>> -err_rdev_misc_wq:
>> destroy_workqueue(md_misc_wq);
>> err_misc_wq:
>> destroy_workqueue(md_wq);
>> @@ -9937,7 +9940,6 @@ static __exit void md_exit(void)
>> }
>> spin_unlock(&all_mddevs_lock);
>>
>> - destroy_workqueue(md_rdev_misc_wq);
>> destroy_workqueue(md_misc_wq);
>> destroy_workqueue(md_wq);
>> }
>> diff --git a/drivers/md/md.h b/drivers/md/md.h
>> index 1eec65cf783c..4d191db831da 100644
>> --- a/drivers/md/md.h
>> +++ b/drivers/md/md.h
>> @@ -531,6 +531,14 @@ struct mddev {
>> unsigned int good_device_nr; /* good device num within cluster raid */
>> unsigned int noio_flag; /* for memalloc scope API */
>>
>> + /*
>> + * Temporarily store rdev that will be finally removed when
>> + * reconfig_mutex is unlocked.
>> + */
>> + struct list_head deleting;
>> + /* Protect the deleting list */
>> + struct mutex delete_mutex;
>> +
>> bool has_superblocks:1;
>> bool fail_last_dev:1;
>> bool serialize_policy:1;
>> --
>> 2.39.2
>>
> .
>
Hi,
在 2023/05/22 22:03, Paul Menzel 写道:
> Dear Yu,
>
>
> Thank you for your patch. Just a few nits, that can be ignored.
Thanks for these sugestions, I'll update them in v3.
Kuai
>
> Am 22.05.23 um 15:32 schrieb Yu Kuai:
>> From: Yu Kuai <[email protected]>
>>
>> commit 5792a2856a63 ("[PATCH] md: avoid a deadlock when removing a device
>
> I’d start with capital letter: Commit ….
>
>> from an md array via sysfs") delay the deleting of rdev, however, this
>
> 1. delay*s*
> 2. s/deleting/deletion/
>
>> introduce a window that rdev can be added again while the deleting is
>
> 1. introduce*s*
> 2. s/deleting/deletion/
>
>> not done yet, and sysfs will complain about duplicate filename.
>>
>> Follow up patches try to fix this problem by flush workqueue, however,
>
> flush*ing* the work queue
>
>> flush_rdev_wq() is just dead code, the progress in
>> md_kick_rdev_from_array():
>
> … is:
>
>> 1) list_del_rcu(&rdev->same_set);
>> 2) synchronize_rcu();
>> 3) queue_work(md_rdev_misc_wq, &rdev->del_work);
>>
>> So in flush_rdev_wq(), if rdev is found in the list, work_pending() can
>> never pass, in the meantime, if work is queued, then rdev can never be
>> found in the list.
>>
>> flush_rdev_wq() can be replaced by flush_workqueue() directly, however,
>> this approach is not good:
>> - the workqueue is global, this synchronization for all raid disks is
>> not necessary.
>
> The work queue …
>
>> - flush_workqueue can't be called under 'reconfig_mutex', there is still
>> a small window between flush_workqueue() and mddev_lock() that other
>> context can queue new work, hence the problem is not solved
>> completely.
>
> context*s*
>
>> sysfs already have apis to support delete itself through writer, and
>
> 1. s/have/has/
> 2. deleting
>
>> these apis, specifically sysfs_break/unbreak_active_protection(), is used
>
> s/is/are/
>
>> to support deleting rdev synchronously. Therefore, the above commit
>> can be
>> reverted, and sysfs duplicate filename can be avoided.
>>
>> A new mdadm regression test is proposed as well.
>
> It’s not included, right? Then I’d remove the sentence, or write: … is
> going to be proposed …
>
>
> Kind regards,
>
> Paul
>
>
>> Link:
>> https://lore.kernel.org/linux-raid/[email protected]/
>>
>> Fixes: 5792a2856a63 ("[PATCH] md: avoid a deadlock when removing a
>> device from an md array via sysfs")
>> Signed-off-by: Yu Kuai <[email protected]>
>> ---
>> Changes in v2:
>> - rebase from the latest md-next branch
>>
>> drivers/md/md.c | 84 +++++++++++++++++++++++++------------------------
>> drivers/md/md.h | 8 +++++
>> 2 files changed, 51 insertions(+), 41 deletions(-)
>>
>> diff --git a/drivers/md/md.c b/drivers/md/md.c
>> index 7455bc9d8498..cafb457d614c 100644
>> --- a/drivers/md/md.c
>> +++ b/drivers/md/md.c
>> @@ -92,6 +92,7 @@ static struct workqueue_struct *md_rdev_misc_wq;
>> static int remove_and_add_spares(struct mddev *mddev,
>> struct md_rdev *this);
>> static void mddev_detach(struct mddev *mddev);
>> +static void export_rdev(struct md_rdev *rdev);
>> /*
>> * Default number of read corrections we'll attempt on an rdev
>> @@ -643,9 +644,11 @@ void mddev_init(struct mddev *mddev)
>> {
>> mutex_init(&mddev->open_mutex);
>> mutex_init(&mddev->reconfig_mutex);
>> + mutex_init(&mddev->delete_mutex);
>> mutex_init(&mddev->bitmap_info.mutex);
>> INIT_LIST_HEAD(&mddev->disks);
>> INIT_LIST_HEAD(&mddev->all_mddevs);
>> + INIT_LIST_HEAD(&mddev->deleting);
>> timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
>> atomic_set(&mddev->active, 1);
>> atomic_set(&mddev->openers, 0);
>> @@ -747,6 +750,23 @@ static void mddev_free(struct mddev *mddev)
>> static const struct attribute_group md_redundancy_group;
>> +static void md_free_rdev(struct mddev *mddev)
>> +{
>> + struct md_rdev *rdev;
>> + struct md_rdev *tmp;
>> +
>> + if (list_empty_careful(&mddev->deleting))
>> + return;
>> +
>> + mutex_lock(&mddev->delete_mutex);
>> + list_for_each_entry_safe(rdev, tmp, &mddev->deleting, same_set) {
>> + list_del_init(&rdev->same_set);
>> + kobject_del(&rdev->kobj);
>> + export_rdev(rdev);
>> + }
>> + mutex_unlock(&mddev->delete_mutex);
>> +}
>> +
>> void mddev_unlock(struct mddev *mddev)
>> {
>> if (mddev->to_remove) {
>> @@ -788,6 +808,8 @@ void mddev_unlock(struct mddev *mddev)
>> } else
>> mutex_unlock(&mddev->reconfig_mutex);
>> + md_free_rdev(mddev);
>> +
>> /* As we've dropped the mutex we need a spinlock to
>> * make sure the thread doesn't disappear
>> */
>> @@ -2428,13 +2450,6 @@ static int bind_rdev_to_array(struct md_rdev
>> *rdev, struct mddev *mddev)
>> return err;
>> }
>> -static void rdev_delayed_delete(struct work_struct *ws)
>> -{
>> - struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
>> - kobject_del(&rdev->kobj);
>> - kobject_put(&rdev->kobj);
>> -}
>> -
>> void md_autodetect_dev(dev_t dev);
>> static void export_rdev(struct md_rdev *rdev)
>> @@ -2452,6 +2467,8 @@ static void export_rdev(struct md_rdev *rdev)
>> static void md_kick_rdev_from_array(struct md_rdev *rdev)
>> {
>> + struct mddev *mddev = rdev->mddev;
>> +
>> bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
>> list_del_rcu(&rdev->same_set);
>> pr_debug("md: unbind<%pg>\n", rdev->bdev);
>> @@ -2465,15 +2482,17 @@ static void md_kick_rdev_from_array(struct
>> md_rdev *rdev)
>> rdev->sysfs_unack_badblocks = NULL;
>> rdev->sysfs_badblocks = NULL;
>> rdev->badblocks.count = 0;
>> - /* We need to delay this, otherwise we can deadlock when
>> - * writing to 'remove' to "dev/state". We also need
>> - * to delay it due to rcu usage.
>> - */
>> +
>> synchronize_rcu();
>> - INIT_WORK(&rdev->del_work, rdev_delayed_delete);
>> - kobject_get(&rdev->kobj);
>> - queue_work(md_rdev_misc_wq, &rdev->del_work);
>> - export_rdev(rdev);
>> +
>> + /*
>> + * kobject_del() will wait for all in progress writers to be
>> done, where
>> + * reconfig_mutex is held, hence it can't be called under
>> + * reconfig_mutex and it's delayed to mddev_unlock().
>> + */
>> + mutex_lock(&mddev->delete_mutex);
>> + list_add(&rdev->same_set, &mddev->deleting);
>> + mutex_unlock(&mddev->delete_mutex);
>> }
>> static void export_array(struct mddev *mddev)
>> @@ -3541,6 +3560,7 @@ rdev_attr_store(struct kobject *kobj, struct
>> attribute *attr,
>> {
>> struct rdev_sysfs_entry *entry = container_of(attr, struct
>> rdev_sysfs_entry, attr);
>> struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
>> + struct kernfs_node *kn = NULL;
>> ssize_t rv;
>> struct mddev *mddev = rdev->mddev;
>> @@ -3548,6 +3568,10 @@ rdev_attr_store(struct kobject *kobj, struct
>> attribute *attr,
>> return -EIO;
>> if (!capable(CAP_SYS_ADMIN))
>> return -EACCES;
>> +
>> + if (entry->store == state_store && cmd_match(page, "remove"))
>> + kn = sysfs_break_active_protection(kobj, attr);
>> +
>> rv = mddev ? mddev_lock(mddev) : -ENODEV;
>> if (!rv) {
>> if (rdev->mddev == NULL)
>> @@ -3556,6 +3580,10 @@ rdev_attr_store(struct kobject *kobj, struct
>> attribute *attr,
>> rv = entry->store(rdev, page, length);
>> mddev_unlock(mddev);
>> }
>> +
>> + if (kn)
>> + sysfs_unbreak_active_protection(kn);
>> +
>> return rv;
>> }
>> @@ -4479,20 +4507,6 @@ null_show(struct mddev *mddev, char *page)
>> return -EINVAL;
>> }
>> -/* need to ensure rdev_delayed_delete() has completed */
>> -static void flush_rdev_wq(struct mddev *mddev)
>> -{
>> - struct md_rdev *rdev;
>> -
>> - rcu_read_lock();
>> - rdev_for_each_rcu(rdev, mddev)
>> - if (work_pending(&rdev->del_work)) {
>> - flush_workqueue(md_rdev_misc_wq);
>> - break;
>> - }
>> - rcu_read_unlock();
>> -}
>> -
>> static ssize_t
>> new_dev_store(struct mddev *mddev, const char *buf, size_t len)
>> {
>> @@ -4520,7 +4534,6 @@ new_dev_store(struct mddev *mddev, const char
>> *buf, size_t len)
>> minor != MINOR(dev))
>> return -EOVERFLOW;
>> - flush_rdev_wq(mddev);
>> err = mddev_lock(mddev);
>> if (err)
>> return err;
>> @@ -5590,7 +5603,6 @@ struct mddev *md_alloc(dev_t dev, char *name)
>> * removed (mddev_delayed_delete).
>> */
>> flush_workqueue(md_misc_wq);
>> - flush_workqueue(md_rdev_misc_wq);
>> mutex_lock(&disks_mutex);
>> mddev = mddev_alloc(dev);
>> @@ -7553,9 +7565,6 @@ static int md_ioctl(struct block_device *bdev,
>> fmode_t mode,
>> }
>> - if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK)
>> - flush_rdev_wq(mddev);
>> -
>> if (cmd == HOT_REMOVE_DISK)
>> /* need to ensure recovery thread has run */
>> wait_event_interruptible_timeout(mddev->sb_wait,
>> @@ -9618,10 +9627,6 @@ static int __init md_init(void)
>> if (!md_misc_wq)
>> goto err_misc_wq;
>> - md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0);
>> - if (!md_rdev_misc_wq)
>> - goto err_rdev_misc_wq;
>> -
>> ret = __register_blkdev(MD_MAJOR, "md", md_probe);
>> if (ret < 0)
>> goto err_md;
>> @@ -9640,8 +9645,6 @@ static int __init md_init(void)
>> err_mdp:
>> unregister_blkdev(MD_MAJOR, "md");
>> err_md:
>> - destroy_workqueue(md_rdev_misc_wq);
>> -err_rdev_misc_wq:
>> destroy_workqueue(md_misc_wq);
>> err_misc_wq:
>> destroy_workqueue(md_wq);
>> @@ -9937,7 +9940,6 @@ static __exit void md_exit(void)
>> }
>> spin_unlock(&all_mddevs_lock);
>> - destroy_workqueue(md_rdev_misc_wq);
>> destroy_workqueue(md_misc_wq);
>> destroy_workqueue(md_wq);
>> }
>> diff --git a/drivers/md/md.h b/drivers/md/md.h
>> index 1eec65cf783c..4d191db831da 100644
>> --- a/drivers/md/md.h
>> +++ b/drivers/md/md.h
>> @@ -531,6 +531,14 @@ struct mddev {
>> unsigned int good_device_nr; /* good device num
>> within cluster raid */
>> unsigned int noio_flag; /* for memalloc scope API */
>> + /*
>> + * Temporarily store rdev that will be finally removed when
>> + * reconfig_mutex is unlocked.
>> + */
>> + struct list_head deleting;
>> + /* Protect the deleting list */
>> + struct mutex delete_mutex;
>> +
>> bool has_superblocks:1;
>> bool fail_last_dev:1;
>> bool serialize_policy:1;
>
> .
>
Hi Yu,
kernel test robot noticed the following build warnings:
[auto build test WARNING on song-md/md-next]
[also build test WARNING on linus/master v6.4-rc3 next-20230522]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Yu-Kuai/md-fix-duplicate-filename-for-rdev/20230522-221940
base: git://git.kernel.org/pub/scm/linux/kernel/git/song/md.git md-next
patch link: https://lore.kernel.org/r/20230522133225.2983667-1-yukuai1%40huaweicloud.com
patch subject: [PATCH v2] md: fix duplicate filename for rdev
config: i386-defconfig (https://download.01.org/0day-ci/archive/20230523/[email protected]/config)
compiler: gcc-11 (Debian 11.3.0-12) 11.3.0
reproduce (this is a W=1 build):
# https://github.com/intel-lab-lkp/linux/commit/b6addfb36d27d4c98b397edec92ed6feeaf7afab
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Yu-Kuai/md-fix-duplicate-filename-for-rdev/20230522-221940
git checkout b6addfb36d27d4c98b397edec92ed6feeaf7afab
# save the config file
mkdir build_dir && cp config build_dir/.config
make W=1 O=build_dir ARCH=i386 olddefconfig
make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash drivers/md/
If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/
All warnings (new ones prefixed by >>):
>> drivers/md/md.c:90:33: warning: 'md_rdev_misc_wq' defined but not used [-Wunused-variable]
90 | static struct workqueue_struct *md_rdev_misc_wq;
| ^~~~~~~~~~~~~~~
vim +/md_rdev_misc_wq +90 drivers/md/md.c
edb39c9deda87d Goldwyn Rodrigues 2014-03-29 86
90b08710e41a07 Bernd Schubert 2008-05-23 87 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
e804ac780e2f01 Tejun Heo 2010-10-15 88 static struct workqueue_struct *md_wq;
e804ac780e2f01 Tejun Heo 2010-10-15 89 static struct workqueue_struct *md_misc_wq;
cc1ffe61c026e2 Guoqing Jiang 2020-04-04 @90 static struct workqueue_struct *md_rdev_misc_wq;
90b08710e41a07 Bernd Schubert 2008-05-23 91
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki