2023-11-25 08:18:07

by Yu Kuai

[permalink] [raw]
Subject: [PATCH -next v3 0/5] md: remove rcu protection to access rdev from conf

From: Yu Kuai <[email protected]>

Changes in v3:
- remove patch 1 from v2, and since all the print_conf() is called
while 'reconfig_mutex' is held, it's safe to remove
rcu_read_lock/unlock() directly.
- remove the definition of flag RemoveSynchronized;

Changes in v2:
- add cover leter in details.

The lifetime of rdev:

1. md_import_device() generate a rdev based on underlying disk;

mddev_lock()
rdev = kzalloc();
rdev->bdev = blkdev_get_by_dev();
mddev_unlock()

2. bind_rdev_to_array() add this rdev to mddev->disks;

mddev_lock()
kobject_add(&rdev->kobj, &mddev->kobj, ...);
list_add_rcu(&rdev->same_set, &mddev->disks);
mddev_unlock()

3. remove_and_add_spares() add this rdev to conf;

mddev_lock()
rdev_addable();
pers->hot_add_disk();
rcu_assign_pointer(conf->rdev, rdev);
mddev_unlock()

4. Use this array with rdev;

5. remove_and_add_spares() remove rdev from conf;

// triggered by sysfs/ioctl
mddev_lock()
rdev_removeable();
pers->hot_remove_disk();
rcu_assign_pointer(conf->rdev, NULL);
synchronize_rcu();
mddev_unlock()

// triggered by daemon
mddev_lock()
rdev_removeable();
synchronize_rcu(); -> this can't protect accessing rdev from conf
pers->hot_remove_disk();
rcu_assign_pointer(conf->rdev, NULL);
mddev_unlock()

6. md_kick_rdev_from_array() remove rdev from mddev->disks;

mddev_lock()
list_del_rcu(&rdev->same_set);
synchronize_rcu();
list_add(&rdev->same_set, &mddev->deleting)
mddev_unlock()
export_rdev

There are two separate rcu protection for rdev, and this pathset remove
the protection of conf(step 3 and 5), because it's safe to access rdev
from conf in following cases:

- If 'reconfig_mutex' is held, because rdev can't be added or rmoved to
conf;
- If there is normal IO inflight, because mddev_suspend() will wait for
IO to be done and prevent rdev to be added or removed to conf;
- If sync thread is running, because remove_and_add_spares() can only be
called from daemon thread when sync thread is done, and
'MD_RECOVERY_RUNNING' is also checked for ioctl/sysfs;
- if any spinlock or rcu_read_lock() is held, because synchronize_rcu()
from step 6 prevent rdev to be freed until spinlock is released or
rcu_read_unlock();

Yu Kuai (5):
md: remove flag RemoveSynchronized
md/raid10: remove rcu protection to access rdev from conf
md/raid1: remove rcu protection to access rdev from conf
md/raid5: remove rcu protection to access rdev from conf
md/md-multipath: remove rcu protection to access rdev from conf

drivers/md/md-multipath.c | 32 +++---
drivers/md/md.c | 37 ++-----
drivers/md/md.h | 5 -
drivers/md/raid1.c | 71 ++++--------
drivers/md/raid10.c | 222 ++++++++++----------------------------
drivers/md/raid5-cache.c | 11 +-
drivers/md/raid5-ppl.c | 16 +--
drivers/md/raid5.c | 191 +++++++++++---------------------
drivers/md/raid5.h | 4 +-
9 files changed, 168 insertions(+), 421 deletions(-)

--
2.39.2


2023-11-25 08:18:08

by Yu Kuai

[permalink] [raw]
Subject: [PATCH -next v3 5/5] md/md-multipath: remove rcu protection to access rdev from conf

From: Yu Kuai <[email protected]>

Because it's safe to accees rdev from conf:
- If any spinlock is held, because synchronize_rcu() from
md_kick_rdev_from_array() will prevent 'rdev' to be freed until
spinlock is released;
- If there is normal IO inflight, because mddev_suspend() will prevent
rdev to be added or removed from array;

And these will cover all the scenarios in md-multipath.

Signed-off-by: Yu Kuai <[email protected]>
---
drivers/md/md-multipath.c | 23 ++++++++++++-----------
1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
index aa77133f3188..19c8625ea642 100644
--- a/drivers/md/md-multipath.c
+++ b/drivers/md/md-multipath.c
@@ -32,17 +32,15 @@ static int multipath_map (struct mpconf *conf)
* now we use the first available disk.
*/

- rcu_read_lock();
for (i = 0; i < disks; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
+ struct md_rdev *rdev = conf->multipaths[i].rdev;
+
if (rdev && test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags)) {
atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
return i;
}
}
- rcu_read_unlock();

pr_crit_ratelimited("multipath_map(): no more operational IO paths?\n");
return (-1);
@@ -137,14 +135,16 @@ static void multipath_status(struct seq_file *seq, struct mddev *mddev)
struct mpconf *conf = mddev->private;
int i;

+ lockdep_assert_held(&mddev->lock);
+
seq_printf (seq, " [%d/%d] [", conf->raid_disks,
conf->raid_disks - mddev->degraded);
- rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
- seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
+ struct md_rdev *rdev = READ_ONCE(conf->multipaths[i].rdev);
+
+ seq_printf(seq, "%s",
+ rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
}
- rcu_read_unlock();
seq_putc(seq, ']');
}

@@ -182,7 +182,7 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
conf->raid_disks - mddev->degraded);
}

-static void print_multipath_conf (struct mpconf *conf)
+static void print_multipath_conf(struct mpconf *conf)
{
int i;
struct multipath_info *tmp;
@@ -195,6 +195,7 @@ static void print_multipath_conf (struct mpconf *conf)
pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
conf->raid_disks);

+ lockdep_assert_held(&conf->mddev->reconfig_mutex);
for (i = 0; i < conf->raid_disks; i++) {
tmp = conf->multipaths + i;
if (tmp->rdev)
@@ -231,7 +232,7 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
rdev->raid_disk = path;
set_bit(In_sync, &rdev->flags);
spin_unlock_irq(&conf->device_lock);
- rcu_assign_pointer(p->rdev, rdev);
+ WRITE_ONCE(p->rdev, rdev);
err = 0;
break;
}
@@ -257,7 +258,7 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
err = -EBUSY;
goto abort;
}
- p->rdev = NULL;
+ WRITE_ONCE(p->rdev, NULL);
err = md_integrity_register(mddev);
}
abort:
--
2.39.2

2023-11-25 08:18:59

by Yu Kuai

[permalink] [raw]
Subject: [PATCH -next v3 3/5] md/raid1: remove rcu protection to access rdev from conf

From: Yu Kuai <[email protected]>

Because it's safe to accees rdev from conf:
- If any spinlock is held, because synchronize_rcu() from
md_kick_rdev_from_array() will prevent 'rdev' to be freed until
spinlock is released;
- If 'reconfig_lock' is held, because rdev can't be added or removed from
array;
- If there is normal IO inflight, because mddev_suspend() will prevent
rdev to be added or removed from array;
- If there is sync IO inflight, because 'MD_RECOVERY_RUNNING' is
checked in remove_and_add_spares().

And these will cover all the scenarios in raid1.

Signed-off-by: Yu Kuai <[email protected]>
---
drivers/md/raid1.c | 62 +++++++++++++++++-----------------------------
1 file changed, 23 insertions(+), 39 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a678e0e6e102..9348f1709512 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -609,7 +609,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
int choose_first;
int choose_next_idle;

- rcu_read_lock();
/*
* Check if we can balance. We can balance on the whole
* device if no resync is going on, or below the resync window.
@@ -642,7 +641,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
unsigned int pending;
bool nonrot;

- rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ rdev = conf->mirrors[disk].rdev;
if (r1_bio->bios[disk] == IO_BLOCKED
|| rdev == NULL
|| test_bit(Faulty, &rdev->flags))
@@ -773,7 +772,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
}

if (best_disk >= 0) {
- rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
+ rdev = conf->mirrors[best_disk].rdev;
if (!rdev)
goto retry;
atomic_inc(&rdev->nr_pending);
@@ -784,7 +783,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect

conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
}
- rcu_read_unlock();
*max_sectors = sectors;

return best_disk;
@@ -1235,14 +1233,12 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,

if (r1bio_existed) {
/* Need to get the block device name carefully */
- struct md_rdev *rdev;
- rcu_read_lock();
- rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev);
+ struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;
+
if (rdev)
snprintf(b, sizeof(b), "%pg", rdev->bdev);
else
strcpy(b, "???");
- rcu_read_unlock();
}

/*
@@ -1396,10 +1392,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,

disks = conf->raid_disks * 2;
blocked_rdev = NULL;
- rcu_read_lock();
max_sectors = r1_bio->sectors;
for (i = 0; i < disks; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ struct md_rdev *rdev = conf->mirrors[i].rdev;

/*
* The write-behind io is only attempted on drives marked as
@@ -1465,7 +1460,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
}
r1_bio->bios[i] = bio;
}
- rcu_read_unlock();

if (unlikely(blocked_rdev)) {
/* Wait for this device to become unblocked */
@@ -1617,15 +1611,16 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev)
struct r1conf *conf = mddev->private;
int i;

+ lockdep_assert_held(&mddev->lock);
+
seq_printf(seq, " [%d/%d] [", conf->raid_disks,
conf->raid_disks - mddev->degraded);
- rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ struct md_rdev *rdev = READ_ONCE(conf->mirrors[i].rdev);
+
seq_printf(seq, "%s",
rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
}
- rcu_read_unlock();
seq_printf(seq, "]");
}

@@ -1691,16 +1686,15 @@ static void print_conf(struct r1conf *conf)
pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
conf->raid_disks);

- rcu_read_lock();
+ lockdep_assert_held(&conf->mddev->reconfig_mutex);
for (i = 0; i < conf->raid_disks; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ struct md_rdev *rdev = conf->mirrors[i].rdev;
if (rdev)
pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n",
i, !test_bit(In_sync, &rdev->flags),
!test_bit(Faulty, &rdev->flags),
rdev->bdev);
}
- rcu_read_unlock();
}

static void close_sync(struct r1conf *conf)
@@ -1810,7 +1804,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
*/
if (rdev->saved_raid_disk < 0)
conf->fullsync = 1;
- rcu_assign_pointer(p->rdev, rdev);
+ WRITE_ONCE(p->rdev, rdev);
break;
}
if (test_bit(WantReplacement, &p->rdev->flags) &&
@@ -1826,7 +1820,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
rdev->raid_disk = repl_slot;
err = 0;
conf->fullsync = 1;
- rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
+ WRITE_ONCE(p[conf->raid_disks].rdev, rdev);
}

print_conf(conf);
@@ -1862,7 +1856,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
err = -EBUSY;
goto abort;
}
- p->rdev = NULL;
+ WRITE_ONCE(p->rdev, NULL);
if (conf->mirrors[conf->raid_disks + number].rdev) {
/* We just removed a device that is being replaced.
* Move down the replacement. We drain all IO before
@@ -1883,7 +1877,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
goto abort;
}
clear_bit(Replacement, &repl->flags);
- p->rdev = repl;
+ WRITE_ONCE(p->rdev, repl);
conf->mirrors[conf->raid_disks + number].rdev = NULL;
unfreeze_array(conf);
}
@@ -2281,8 +2275,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
sector_t first_bad;
int bad_sectors;

- rcu_read_lock();
- rdev = rcu_dereference(conf->mirrors[d].rdev);
+ rdev = conf->mirrors[d].rdev;
if (rdev &&
(test_bit(In_sync, &rdev->flags) ||
(!test_bit(Faulty, &rdev->flags) &&
@@ -2290,15 +2283,14 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
is_badblock(rdev, sect, s,
&first_bad, &bad_sectors) == 0) {
atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
if (sync_page_io(rdev, sect, s<<9,
conf->tmppage, REQ_OP_READ, false))
success = 1;
rdev_dec_pending(rdev, mddev);
if (success)
break;
- } else
- rcu_read_unlock();
+ }
+
d++;
if (d == conf->raid_disks * 2)
d = 0;
@@ -2317,29 +2309,24 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
if (d==0)
d = conf->raid_disks * 2;
d--;
- rcu_read_lock();
- rdev = rcu_dereference(conf->mirrors[d].rdev);
+ rdev = conf->mirrors[d].rdev;
if (rdev &&
!test_bit(Faulty, &rdev->flags)) {
atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
r1_sync_page_io(rdev, sect, s,
conf->tmppage, WRITE);
rdev_dec_pending(rdev, mddev);
- } else
- rcu_read_unlock();
+ }
}
d = start;
while (d != read_disk) {
if (d==0)
d = conf->raid_disks * 2;
d--;
- rcu_read_lock();
- rdev = rcu_dereference(conf->mirrors[d].rdev);
+ rdev = conf->mirrors[d].rdev;
if (rdev &&
!test_bit(Faulty, &rdev->flags)) {
atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
if (r1_sync_page_io(rdev, sect, s,
conf->tmppage, READ)) {
atomic_add(s, &rdev->corrected_errors);
@@ -2350,8 +2337,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
rdev->bdev);
}
rdev_dec_pending(rdev, mddev);
- } else
- rcu_read_unlock();
+ }
}
sectors -= s;
sect += s;
@@ -2732,7 +2718,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,

r1_bio = raid1_alloc_init_r1buf(conf);

- rcu_read_lock();
/*
* If we get a correctably read error during resync or recovery,
* we might want to read from a different device. So we
@@ -2753,7 +2738,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
struct md_rdev *rdev;
bio = r1_bio->bios[i];

- rdev = rcu_dereference(conf->mirrors[i].rdev);
+ rdev = conf->mirrors[i].rdev;
if (rdev == NULL ||
test_bit(Faulty, &rdev->flags)) {
if (i < conf->raid_disks)
@@ -2811,7 +2796,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
bio->bi_opf |= MD_FAILFAST;
}
}
- rcu_read_unlock();
if (disk < 0)
disk = wonly;
r1_bio->read_disk = disk;
--
2.39.2

2023-11-28 03:51:09

by Song Liu

[permalink] [raw]
Subject: Re: [PATCH -next v3 0/5] md: remove rcu protection to access rdev from conf

On Sat, Nov 25, 2023 at 12:16 AM Yu Kuai <[email protected]> wrote:
>
> From: Yu Kuai <[email protected]>
>
> Changes in v3:
> - remove patch 1 from v2, and since all the print_conf() is called
> while 'reconfig_mutex' is held, it's safe to remove
> rcu_read_lock/unlock() directly.
> - remove the definition of flag RemoveSynchronized;

Applied v3 to md-next. Thanks!

Song