2023-10-16 01:29:26

by Yu Kuai

[permalink] [raw]
Subject: [PATCH -next 0/6] md: remove rcu protection to access rdev from conf

From: Yu Kuai <[email protected]>

Yu Kuai (6):
md: remove useless debug code to print configuration
md: remove flag RemoveSynchronized
md/raid1: remove rcu protection to access rdev from conf
md/raid10: remove rcu protection to access rdev from conf
md/raid5: remove rcu protection to access rdev from conf
md/md-multipath: remove rcu protection to access rdev from conf

drivers/md/md-multipath.c | 29 ++---
drivers/md/md.c | 37 +-----
drivers/md/raid1.c | 94 ++++-----------
drivers/md/raid10.c | 248 +++++++++-----------------------------
drivers/md/raid5-cache.c | 11 +-
drivers/md/raid5-ppl.c | 16 +--
drivers/md/raid5.c | 225 ++++++++++------------------------
drivers/md/raid5.h | 4 +-
8 files changed, 163 insertions(+), 501 deletions(-)

--
2.39.2


2023-10-16 01:29:37

by Yu Kuai

[permalink] [raw]
Subject: [PATCH -next 3/6] md/raid1: remove rcu protection to access rdev from conf

From: Yu Kuai <[email protected]>

It's safe to access rdev from conf:
- If any spinlock is held, because synchronize_rcu() from
md_kick_rdev_from_array() will prevent 'rdev' from being freed until
the spinlock is released;
- If 'reconfig_lock' is held, because rdev can't be added to or removed
from the array;
- If there is normal IO inflight, because mddev_suspend() will prevent
rdev from being added to or removed from the array;
- If there is sync IO inflight, because 'MD_RECOVERY_RUNNING' is
checked in remove_and_add_spares().

And these will cover all the scenarios in raid1.

Signed-off-by: Yu Kuai <[email protected]>
---
drivers/md/raid1.c | 57 +++++++++++++++++-----------------------------
1 file changed, 21 insertions(+), 36 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4348d670439d..5c647036663d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -609,7 +609,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
int choose_first;
int choose_next_idle;

- rcu_read_lock();
/*
* Check if we can balance. We can balance on the whole
* device if no resync is going on, or below the resync window.
@@ -642,7 +641,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
unsigned int pending;
bool nonrot;

- rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ rdev = conf->mirrors[disk].rdev;
if (r1_bio->bios[disk] == IO_BLOCKED
|| rdev == NULL
|| test_bit(Faulty, &rdev->flags))
@@ -773,7 +772,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
}

if (best_disk >= 0) {
- rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
+ rdev = conf->mirrors[best_disk].rdev;
if (!rdev)
goto retry;
atomic_inc(&rdev->nr_pending);
@@ -784,7 +783,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect

conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
}
- rcu_read_unlock();
*max_sectors = sectors;

return best_disk;
@@ -1235,14 +1233,12 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,

if (r1bio_existed) {
/* Need to get the block device name carefully */
- struct md_rdev *rdev;
- rcu_read_lock();
- rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev);
+ struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;
+
if (rdev)
snprintf(b, sizeof(b), "%pg", rdev->bdev);
else
strcpy(b, "???");
- rcu_read_unlock();
}

/*
@@ -1396,10 +1392,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,

disks = conf->raid_disks * 2;
blocked_rdev = NULL;
- rcu_read_lock();
max_sectors = r1_bio->sectors;
for (i = 0; i < disks; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ struct md_rdev *rdev = conf->mirrors[i].rdev;

/*
* The write-behind io is only attempted on drives marked as
@@ -1465,7 +1460,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
}
r1_bio->bios[i] = bio;
}
- rcu_read_unlock();

if (unlikely(blocked_rdev)) {
/* Wait for this device to become unblocked */
@@ -1617,15 +1611,16 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev)
struct r1conf *conf = mddev->private;
int i;

+ lockdep_assert_held(&mddev->lock);
+
seq_printf(seq, " [%d/%d] [", conf->raid_disks,
conf->raid_disks - mddev->degraded);
- rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ struct md_rdev *rdev = READ_ONCE(conf->mirrors[i].rdev);
+
seq_printf(seq, "%s",
rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
}
- rcu_read_unlock();
seq_printf(seq, "]");
}

@@ -1785,7 +1780,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
*/
if (rdev->saved_raid_disk < 0)
conf->fullsync = 1;
- rcu_assign_pointer(p->rdev, rdev);
+ WRITE_ONCE(p->rdev, rdev);
break;
}
if (test_bit(WantReplacement, &p->rdev->flags) &&
@@ -1801,7 +1796,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
rdev->raid_disk = repl_slot;
err = 0;
conf->fullsync = 1;
- rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
+ WRITE_ONCE(p[conf->raid_disks].rdev, rdev);
}

return err;
@@ -1835,7 +1830,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
err = -EBUSY;
goto abort;
}
- p->rdev = NULL;
+ WRITE_ONCE(p->rdev, NULL);
if (conf->mirrors[conf->raid_disks + number].rdev) {
/* We just removed a device that is being replaced.
* Move down the replacement. We drain all IO before
@@ -1856,7 +1851,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
goto abort;
}
clear_bit(Replacement, &repl->flags);
- p->rdev = repl;
+ WRITE_ONCE(p->rdev, repl);
conf->mirrors[conf->raid_disks + number].rdev = NULL;
unfreeze_array(conf);
}
@@ -2253,8 +2248,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
sector_t first_bad;
int bad_sectors;

- rcu_read_lock();
- rdev = rcu_dereference(conf->mirrors[d].rdev);
+ rdev = conf->mirrors[d].rdev;
if (rdev &&
(test_bit(In_sync, &rdev->flags) ||
(!test_bit(Faulty, &rdev->flags) &&
@@ -2262,15 +2256,14 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
is_badblock(rdev, sect, s,
&first_bad, &bad_sectors) == 0) {
atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
if (sync_page_io(rdev, sect, s<<9,
conf->tmppage, REQ_OP_READ, false))
success = 1;
rdev_dec_pending(rdev, mddev);
if (success)
break;
- } else
- rcu_read_unlock();
+ }
+
d++;
if (d == conf->raid_disks * 2)
d = 0;
@@ -2289,29 +2282,24 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
if (d==0)
d = conf->raid_disks * 2;
d--;
- rcu_read_lock();
- rdev = rcu_dereference(conf->mirrors[d].rdev);
+ rdev = conf->mirrors[d].rdev;
if (rdev &&
!test_bit(Faulty, &rdev->flags)) {
atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
r1_sync_page_io(rdev, sect, s,
conf->tmppage, WRITE);
rdev_dec_pending(rdev, mddev);
- } else
- rcu_read_unlock();
+ }
}
d = start;
while (d != read_disk) {
if (d==0)
d = conf->raid_disks * 2;
d--;
- rcu_read_lock();
- rdev = rcu_dereference(conf->mirrors[d].rdev);
+ rdev = conf->mirrors[d].rdev;
if (rdev &&
!test_bit(Faulty, &rdev->flags)) {
atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
if (r1_sync_page_io(rdev, sect, s,
conf->tmppage, READ)) {
atomic_add(s, &rdev->corrected_errors);
@@ -2322,8 +2310,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
rdev->bdev);
}
rdev_dec_pending(rdev, mddev);
- } else
- rcu_read_unlock();
+ }
}
sectors -= s;
sect += s;
@@ -2704,7 +2691,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,

r1_bio = raid1_alloc_init_r1buf(conf);

- rcu_read_lock();
/*
* If we get a correctably read error during resync or recovery,
* we might want to read from a different device. So we
@@ -2725,7 +2711,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
struct md_rdev *rdev;
bio = r1_bio->bios[i];

- rdev = rcu_dereference(conf->mirrors[i].rdev);
+ rdev = conf->mirrors[i].rdev;
if (rdev == NULL ||
test_bit(Faulty, &rdev->flags)) {
if (i < conf->raid_disks)
@@ -2783,7 +2769,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
bio->bi_opf |= MD_FAILFAST;
}
}
- rcu_read_unlock();
if (disk < 0)
disk = wonly;
r1_bio->read_disk = disk;
--
2.39.2

2023-10-16 01:29:54

by Yu Kuai

[permalink] [raw]
Subject: [PATCH -next 4/6] md/raid10: remove rcu protection to access rdev from conf

From: Yu Kuai <[email protected]>

It's safe to access rdev from conf:
- If any spinlock is held, because synchronize_rcu() from
md_kick_rdev_from_array() will prevent 'rdev' from being freed until
the spinlock is released;
- If 'reconfig_lock' is held, because rdev can't be added to or removed
from the array;
- If there is normal IO inflight, because mddev_suspend() will prevent
rdev from being added to or removed from the array;
- If there is sync IO inflight, because 'MD_RECOVERY_RUNNING' is
checked in remove_and_add_spares().

And these will cover all the scenarios in raid10.

This patch also cleans up the code that handles the case where the
replacement replaces rdev while IO is still inflight.

Signed-off-by: Yu Kuai <[email protected]>
---
drivers/md/raid10.c | 210 ++++++++++++--------------------------------
1 file changed, 57 insertions(+), 153 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 33ab00323cae..806a7fe2f74a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -743,7 +743,6 @@ static struct md_rdev *read_balance(struct r10conf *conf,
struct geom *geo = &conf->geo;

raid10_find_phys(conf, r10_bio);
- rcu_read_lock();
best_dist_slot = -1;
min_pending = UINT_MAX;
best_dist_rdev = NULL;
@@ -775,18 +774,11 @@ static struct md_rdev *read_balance(struct r10conf *conf,
if (r10_bio->devs[slot].bio == IO_BLOCKED)
continue;
disk = r10_bio->devs[slot].devnum;
- rdev = rcu_dereference(conf->mirrors[disk].replacement);
+ rdev = conf->mirrors[disk].replacement;
if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
r10_bio->devs[slot].addr + sectors >
- rdev->recovery_offset) {
- /*
- * Read replacement first to prevent reading both rdev
- * and replacement as NULL during replacement replace
- * rdev.
- */
- smp_mb();
- rdev = rcu_dereference(conf->mirrors[disk].rdev);
- }
+ rdev->recovery_offset)
+ rdev = conf->mirrors[disk].rdev;
if (rdev == NULL ||
test_bit(Faulty, &rdev->flags))
continue;
@@ -876,7 +868,6 @@ static struct md_rdev *read_balance(struct r10conf *conf,
r10_bio->read_slot = slot;
} else
rdev = NULL;
- rcu_read_unlock();
*max_sectors = best_good_sectors;

return rdev;
@@ -1198,9 +1189,8 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
*/
gfp = GFP_NOIO | __GFP_HIGH;

- rcu_read_lock();
disk = r10_bio->devs[slot].devnum;
- err_rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ err_rdev = conf->mirrors[disk].rdev;
if (err_rdev)
snprintf(b, sizeof(b), "%pg", err_rdev->bdev);
else {
@@ -1208,7 +1198,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
/* This never gets dereferenced */
err_rdev = r10_bio->devs[slot].rdev;
}
- rcu_read_unlock();
}

if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors))
@@ -1279,15 +1268,8 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
int devnum = r10_bio->devs[n_copy].devnum;
struct bio *mbio;

- if (replacement) {
- rdev = conf->mirrors[devnum].replacement;
- if (rdev == NULL) {
- /* Replacement just got moved to main 'rdev' */
- smp_mb();
- rdev = conf->mirrors[devnum].rdev;
- }
- } else
- rdev = conf->mirrors[devnum].rdev;
+ rdev = replacement ? conf->mirrors[devnum].replacement :
+ conf->mirrors[devnum].rdev;

mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, &mddev->bio_set);
if (replacement)
@@ -1321,25 +1303,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
}
}

-static struct md_rdev *dereference_rdev_and_rrdev(struct raid10_info *mirror,
- struct md_rdev **prrdev)
-{
- struct md_rdev *rdev, *rrdev;
-
- rrdev = rcu_dereference(mirror->replacement);
- /*
- * Read replacement first to prevent reading both rdev and
- * replacement as NULL during replacement replace rdev.
- */
- smp_mb();
- rdev = rcu_dereference(mirror->rdev);
- if (rdev == rrdev)
- rrdev = NULL;
-
- *prrdev = rrdev;
- return rdev;
-}
-
static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
{
int i;
@@ -1348,11 +1311,11 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)

retry_wait:
blocked_rdev = NULL;
- rcu_read_lock();
for (i = 0; i < conf->copies; i++) {
struct md_rdev *rdev, *rrdev;

- rdev = dereference_rdev_and_rrdev(&conf->mirrors[i], &rrdev);
+ rdev = conf->mirrors[i].rdev;
+ rrdev = conf->mirrors[i].replacement;
if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
atomic_inc(&rdev->nr_pending);
blocked_rdev = rdev;
@@ -1391,7 +1354,6 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
}
}
}
- rcu_read_unlock();

if (unlikely(blocked_rdev)) {
/* Have to wait for this device to get unblocked, then retry */
@@ -1474,14 +1436,14 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,

wait_blocked_dev(mddev, r10_bio);

- rcu_read_lock();
max_sectors = r10_bio->sectors;

for (i = 0; i < conf->copies; i++) {
int d = r10_bio->devs[i].devnum;
struct md_rdev *rdev, *rrdev;

- rdev = dereference_rdev_and_rrdev(&conf->mirrors[d], &rrdev);
+ rdev = conf->mirrors[d].rdev;
+ rrdev = conf->mirrors[d].replacement;
if (rdev && (test_bit(Faulty, &rdev->flags)))
rdev = NULL;
if (rrdev && (test_bit(Faulty, &rrdev->flags)))
@@ -1535,7 +1497,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
atomic_inc(&rrdev->nr_pending);
}
}
- rcu_read_unlock();

if (max_sectors < r10_bio->sectors)
r10_bio->sectors = max_sectors;
@@ -1625,17 +1586,8 @@ static void raid10_end_discard_request(struct bio *bio)
set_bit(R10BIO_Uptodate, &r10_bio->state);

dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
- if (repl)
- rdev = conf->mirrors[dev].replacement;
- if (!rdev) {
- /*
- * raid10_remove_disk uses smp_mb to make sure rdev is set to
- * replacement before setting replacement to NULL. It can read
- * rdev first without barrier protect even replacement is NULL
- */
- smp_rmb();
- rdev = conf->mirrors[dev].rdev;
- }
+ rdev = repl ? conf->mirrors[dev].replacement :
+ conf->mirrors[dev].rdev;

raid_end_discard_bio(r10_bio);
rdev_dec_pending(rdev, conf->mddev);
@@ -1785,11 +1737,11 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
* inc refcount on their rdev. Record them by setting
* bios[x] to bio
*/
- rcu_read_lock();
for (disk = 0; disk < geo->raid_disks; disk++) {
struct md_rdev *rdev, *rrdev;

- rdev = dereference_rdev_and_rrdev(&conf->mirrors[disk], &rrdev);
+ rdev = conf->mirrors[disk].rdev;
+ rrdev = conf->mirrors[disk].replacement;
r10_bio->devs[disk].bio = NULL;
r10_bio->devs[disk].repl_bio = NULL;

@@ -1809,7 +1761,6 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
atomic_inc(&rrdev->nr_pending);
}
}
- rcu_read_unlock();

atomic_set(&r10_bio->remaining, 1);
for (disk = 0; disk < geo->raid_disks; disk++) {
@@ -1939,6 +1890,8 @@ static void raid10_status(struct seq_file *seq, struct mddev *mddev)
struct r10conf *conf = mddev->private;
int i;

+ lockdep_assert_held(&mddev->lock);
+
if (conf->geo.near_copies < conf->geo.raid_disks)
seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
if (conf->geo.near_copies > 1)
@@ -1953,12 +1906,11 @@ static void raid10_status(struct seq_file *seq, struct mddev *mddev)
}
seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
conf->geo.raid_disks - mddev->degraded);
- rcu_read_lock();
for (i = 0; i < conf->geo.raid_disks; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ struct md_rdev *rdev = READ_ONCE(conf->mirrors[i].rdev);
+
seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
}
- rcu_read_unlock();
seq_printf(seq, "]");
}

@@ -1980,7 +1932,6 @@ static int _enough(struct r10conf *conf, int previous, int ignore)
ncopies = conf->geo.near_copies;
}

- rcu_read_lock();
do {
int n = conf->copies;
int cnt = 0;
@@ -1988,7 +1939,7 @@ static int _enough(struct r10conf *conf, int previous, int ignore)
while (n--) {
struct md_rdev *rdev;
if (this != ignore &&
- (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
+ (rdev = conf->mirrors[this].rdev) &&
test_bit(In_sync, &rdev->flags))
cnt++;
this = (this+1) % disks;
@@ -1999,7 +1950,6 @@ static int _enough(struct r10conf *conf, int previous, int ignore)
} while (first != 0);
has_enough = 1;
out:
- rcu_read_unlock();
return has_enough;
}

@@ -2164,7 +2114,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
err = 0;
if (rdev->saved_raid_disk != mirror)
conf->fullsync = 1;
- rcu_assign_pointer(p->rdev, rdev);
+ WRITE_ONCE(p->rdev, rdev);
break;
}

@@ -2178,7 +2128,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
conf->fullsync = 1;
- rcu_assign_pointer(p->replacement, rdev);
+ WRITE_ONCE(p->replacement, rdev);
}

return err;
@@ -2218,15 +2168,12 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
err = -EBUSY;
goto abort;
}
- *rdevp = NULL;
+ WRITE_ONCE(*rdevp, NULL);
if (p->replacement) {
/* We must have just cleared 'rdev' */
- p->rdev = p->replacement;
+ WRITE_ONCE(p->rdev, p->replacement);
clear_bit(Replacement, &p->replacement->flags);
- smp_mb(); /* Make sure other CPUs may see both as identical
- * but will never see neither -- if they are careful.
- */
- p->replacement = NULL;
+ WRITE_ONCE(p->replacement, NULL);
}

clear_bit(WantReplacement, &rdev->flags);
@@ -2725,20 +2672,18 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
if (s > (PAGE_SIZE>>9))
s = PAGE_SIZE >> 9;

- rcu_read_lock();
do {
sector_t first_bad;
int bad_sectors;

d = r10_bio->devs[sl].devnum;
- rdev = rcu_dereference(conf->mirrors[d].rdev);
+ rdev = conf->mirrors[d].rdev;
if (rdev &&
test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) &&
is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
&first_bad, &bad_sectors) == 0) {
atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
success = sync_page_io(rdev,
r10_bio->devs[sl].addr +
sect,
@@ -2746,7 +2691,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
conf->tmppage,
REQ_OP_READ, false);
rdev_dec_pending(rdev, mddev);
- rcu_read_lock();
if (success)
break;
}
@@ -2754,7 +2698,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
if (sl == conf->copies)
sl = 0;
} while (sl != slot);
- rcu_read_unlock();

if (!success) {
/* Cannot read from anywhere, just mark the block
@@ -2778,20 +2721,18 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10

start = sl;
/* write it back and re-read */
- rcu_read_lock();
while (sl != slot) {
if (sl==0)
sl = conf->copies;
sl--;
d = r10_bio->devs[sl].devnum;
- rdev = rcu_dereference(conf->mirrors[d].rdev);
+ rdev = conf->mirrors[d].rdev;
if (!rdev ||
test_bit(Faulty, &rdev->flags) ||
!test_bit(In_sync, &rdev->flags))
continue;

atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
if (r10_sync_page_io(rdev,
r10_bio->devs[sl].addr +
sect,
@@ -2810,7 +2751,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
rdev->bdev);
}
rdev_dec_pending(rdev, mddev);
- rcu_read_lock();
}
sl = start;
while (sl != slot) {
@@ -2818,14 +2758,13 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
sl = conf->copies;
sl--;
d = r10_bio->devs[sl].devnum;
- rdev = rcu_dereference(conf->mirrors[d].rdev);
+ rdev = conf->mirrors[d].rdev;
if (!rdev ||
test_bit(Faulty, &rdev->flags) ||
!test_bit(In_sync, &rdev->flags))
continue;

atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
switch (r10_sync_page_io(rdev,
r10_bio->devs[sl].addr +
sect,
@@ -2853,9 +2792,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
}

rdev_dec_pending(rdev, mddev);
- rcu_read_lock();
}
- rcu_read_unlock();

sectors -= s;
sect += s;
@@ -3329,14 +3266,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
/* Completed a full sync so the replacements
* are now fully recovered.
*/
- rcu_read_lock();
for (i = 0; i < conf->geo.raid_disks; i++) {
struct md_rdev *rdev =
- rcu_dereference(conf->mirrors[i].replacement);
+ conf->mirrors[i].replacement;
+
if (rdev)
rdev->recovery_offset = MaxSector;
}
- rcu_read_unlock();
}
conf->fullsync = 0;
}
@@ -3417,9 +3353,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
struct raid10_info *mirror = &conf->mirrors[i];
struct md_rdev *mrdev, *mreplace;

- rcu_read_lock();
- mrdev = rcu_dereference(mirror->rdev);
- mreplace = rcu_dereference(mirror->replacement);
+ mrdev = mirror->rdev;
+ mreplace = mirror->replacement;

if (mrdev && (test_bit(Faulty, &mrdev->flags) ||
test_bit(In_sync, &mrdev->flags)))
@@ -3427,22 +3362,18 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (mreplace && test_bit(Faulty, &mreplace->flags))
mreplace = NULL;

- if (!mrdev && !mreplace) {
- rcu_read_unlock();
+ if (!mrdev && !mreplace)
continue;
- }

still_degraded = 0;
/* want to reconstruct this device */
rb2 = r10_bio;
sect = raid10_find_virt(conf, sector_nr, i);
- if (sect >= mddev->resync_max_sectors) {
+ if (sect >= mddev->resync_max_sectors)
/* last stripe is not complete - don't
* try to recover this sector.
*/
- rcu_read_unlock();
continue;
- }
/* Unless we are doing a full sync, or a replacement
* we only need to recover the block if it is set in
* the bitmap
@@ -3458,14 +3389,12 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
* that there will never be anything to do here
*/
chunks_skipped = -1;
- rcu_read_unlock();
continue;
}
if (mrdev)
atomic_inc(&mrdev->nr_pending);
if (mreplace)
atomic_inc(&mreplace->nr_pending);
- rcu_read_unlock();

r10_bio = raid10_alloc_init_r10buf(conf);
r10_bio->state = 0;
@@ -3484,10 +3413,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
/* Need to check if the array will still be
* degraded
*/
- rcu_read_lock();
for (j = 0; j < conf->geo.raid_disks; j++) {
- struct md_rdev *rdev = rcu_dereference(
- conf->mirrors[j].rdev);
+ struct md_rdev *rdev = conf->mirrors[j].rdev;
+
if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
still_degraded = 1;
break;
@@ -3502,8 +3430,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
int k;
int d = r10_bio->devs[j].devnum;
sector_t from_addr, to_addr;
- struct md_rdev *rdev =
- rcu_dereference(conf->mirrors[d].rdev);
+ struct md_rdev *rdev = conf->mirrors[d].rdev;
sector_t sector, first_bad;
int bad_sectors;
if (!rdev ||
@@ -3582,7 +3509,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
atomic_inc(&r10_bio->remaining);
break;
}
- rcu_read_unlock();
if (j == conf->copies) {
/* Cannot recover, so abort the recovery or
* record a bad block */
@@ -3709,12 +3635,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,

bio = r10_bio->devs[i].bio;
bio->bi_status = BLK_STS_IOERR;
- rcu_read_lock();
- rdev = rcu_dereference(conf->mirrors[d].rdev);
- if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
- rcu_read_unlock();
+ rdev = conf->mirrors[d].rdev;
+ if (rdev == NULL || test_bit(Faulty, &rdev->flags))
continue;
- }
+
sector = r10_bio->devs[i].addr;
if (is_badblock(rdev, sector, max_sync,
&first_bad, &bad_sectors)) {
@@ -3724,7 +3648,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
bad_sectors -= (sector - first_bad);
if (max_sync > bad_sectors)
max_sync = bad_sectors;
- rcu_read_unlock();
continue;
}
}
@@ -3740,11 +3663,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
bio_set_dev(bio, rdev->bdev);
count++;

- rdev = rcu_dereference(conf->mirrors[d].replacement);
- if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
- rcu_read_unlock();
+ rdev = conf->mirrors[d].replacement;
+ if (rdev == NULL || test_bit(Faulty, &rdev->flags))
continue;
- }
+
atomic_inc(&rdev->nr_pending);

/* Need to set up for writing to the replacement */
@@ -3761,7 +3683,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
bio->bi_iter.bi_sector = sector + rdev->data_offset;
bio_set_dev(bio, rdev->bdev);
count++;
- rcu_read_unlock();
}

if (count < 2) {
@@ -4471,11 +4392,11 @@ static int calc_degraded(struct r10conf *conf)
int degraded, degraded2;
int i;

- rcu_read_lock();
degraded = 0;
/* 'prev' section first */
for (i = 0; i < conf->prev.raid_disks; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ struct md_rdev *rdev = conf->mirrors[i].rdev;
+
if (!rdev || test_bit(Faulty, &rdev->flags))
degraded++;
else if (!test_bit(In_sync, &rdev->flags))
@@ -4485,13 +4406,12 @@ static int calc_degraded(struct r10conf *conf)
*/
degraded++;
}
- rcu_read_unlock();
if (conf->geo.raid_disks == conf->prev.raid_disks)
return degraded;
- rcu_read_lock();
degraded2 = 0;
for (i = 0; i < conf->geo.raid_disks; i++) {
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ struct md_rdev *rdev = conf->mirrors[i].rdev;
+
if (!rdev || test_bit(Faulty, &rdev->flags))
degraded2++;
else if (!test_bit(In_sync, &rdev->flags)) {
@@ -4504,7 +4424,6 @@ static int calc_degraded(struct r10conf *conf)
degraded2++;
}
}
- rcu_read_unlock();
if (degraded2 > degraded)
return degraded2;
return degraded;
@@ -4936,16 +4855,15 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
blist = read_bio;
read_bio->bi_next = NULL;

- rcu_read_lock();
for (s = 0; s < conf->copies*2; s++) {
struct bio *b;
int d = r10_bio->devs[s/2].devnum;
struct md_rdev *rdev2;
if (s&1) {
- rdev2 = rcu_dereference(conf->mirrors[d].replacement);
+ rdev2 = conf->mirrors[d].replacement;
b = r10_bio->devs[s/2].repl_bio;
} else {
- rdev2 = rcu_dereference(conf->mirrors[d].rdev);
+ rdev2 = conf->mirrors[d].rdev;
b = r10_bio->devs[s/2].bio;
}
if (!rdev2 || test_bit(Faulty, &rdev2->flags))
@@ -4979,7 +4897,6 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
sector_nr += len >> 9;
nr_sectors += len >> 9;
}
- rcu_read_unlock();
r10_bio->sectors = nr_sectors;

/* Now submit the read */
@@ -5032,20 +4949,17 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
struct bio *b;
int d = r10_bio->devs[s/2].devnum;
struct md_rdev *rdev;
- rcu_read_lock();
if (s&1) {
- rdev = rcu_dereference(conf->mirrors[d].replacement);
+ rdev = conf->mirrors[d].replacement;
b = r10_bio->devs[s/2].repl_bio;
} else {
- rdev = rcu_dereference(conf->mirrors[d].rdev);
+ rdev = conf->mirrors[d].rdev;
b = r10_bio->devs[s/2].bio;
}
- if (!rdev || test_bit(Faulty, &rdev->flags)) {
- rcu_read_unlock();
+ if (!rdev || test_bit(Faulty, &rdev->flags))
continue;
- }
+
atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
md_sync_acct_bio(b, r10_bio->sectors);
atomic_inc(&r10_bio->remaining);
b->bi_next = NULL;
@@ -5116,10 +5030,9 @@ static int handle_reshape_read_error(struct mddev *mddev,
if (s > (PAGE_SIZE >> 9))
s = PAGE_SIZE >> 9;

- rcu_read_lock();
while (!success) {
int d = r10b->devs[slot].devnum;
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
+ struct md_rdev *rdev = conf->mirrors[d].rdev;
sector_t addr;
if (rdev == NULL ||
test_bit(Faulty, &rdev->flags) ||
@@ -5128,14 +5041,12 @@ static int handle_reshape_read_error(struct mddev *mddev,

addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
success = sync_page_io(rdev,
addr,
s << 9,
pages[idx],
REQ_OP_READ, false);
rdev_dec_pending(rdev, mddev);
- rcu_read_lock();
if (success)
break;
failed:
@@ -5145,7 +5056,6 @@ static int handle_reshape_read_error(struct mddev *mddev,
if (slot == first_slot)
break;
}
- rcu_read_unlock();
if (!success) {
/* couldn't read this block, must give up */
set_bit(MD_RECOVERY_INTR,
@@ -5171,12 +5081,8 @@ static void end_reshape_write(struct bio *bio)
struct md_rdev *rdev = NULL;

d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
- if (repl)
- rdev = conf->mirrors[d].replacement;
- if (!rdev) {
- smp_mb();
- rdev = conf->mirrors[d].rdev;
- }
+ rdev = repl ? conf->mirrors[d].replacement :
+ conf->mirrors[d].rdev;

if (bio->bi_status) {
/* FIXME should record badblock */
@@ -5211,18 +5117,16 @@ static void raid10_finish_reshape(struct mddev *mddev)
mddev->resync_max_sectors = mddev->array_sectors;
} else {
int d;
- rcu_read_lock();
for (d = conf->geo.raid_disks ;
d < conf->geo.raid_disks - mddev->delta_disks;
d++) {
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
+ struct md_rdev *rdev = conf->mirrors[d].rdev;
if (rdev)
clear_bit(In_sync, &rdev->flags);
- rdev = rcu_dereference(conf->mirrors[d].replacement);
+ rdev = conf->mirrors[d].replacement;
if (rdev)
clear_bit(In_sync, &rdev->flags);
}
- rcu_read_unlock();
}
mddev->layout = mddev->new_layout;
mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
--
2.39.2

2023-10-18 17:57:19

by Song Liu

[permalink] [raw]
Subject: Re: [PATCH -next 3/6] md/raid1: remove rcu protection to access rdev from conf

On Sun, Oct 15, 2023 at 6:28 PM Yu Kuai <[email protected]> wrote:
>
> From: Yu Kuai <[email protected]>
>
> It's safe to access rdev from conf:
> - If any spinlock is held, because synchronize_rcu() from
> md_kick_rdev_from_array() will prevent 'rdev' from being freed until
> spinlock is released;
> - If 'reconfig_lock' is held, because rdev can't be added or removed from
> array;

Maybe add lockdep asserts for the above cases?

Thanks,
Song

> - If there is normal IO inflight, because mddev_suspend() will prevent
> rdev to be added or removed from array;
> - If there is sync IO inflight, because 'MD_RECOVERY_RUNNING' is
> checked in remove_and_add_spares().
>
> And these will cover all the scenarios in raid1.
>
> Signed-off-by: Yu Kuai <[email protected]>
> ---
> drivers/md/raid1.c | 57 +++++++++++++++++-----------------------------
> 1 file changed, 21 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index 4348d670439d..5c647036663d 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -609,7 +609,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
> int choose_first;
> int choose_next_idle;
>
> - rcu_read_lock();
> /*
> * Check if we can balance. We can balance on the whole
> * device if no resync is going on, or below the resync window.
> @@ -642,7 +641,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
> unsigned int pending;
> bool nonrot;
>
> - rdev = rcu_dereference(conf->mirrors[disk].rdev);
> + rdev = conf->mirrors[disk].rdev;
> if (r1_bio->bios[disk] == IO_BLOCKED
> || rdev == NULL
> || test_bit(Faulty, &rdev->flags))
> @@ -773,7 +772,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
> }
>
> if (best_disk >= 0) {
> - rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
> + rdev = conf->mirrors[best_disk].rdev;
> if (!rdev)
> goto retry;
> atomic_inc(&rdev->nr_pending);
> @@ -784,7 +783,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
>
> conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
> }
> - rcu_read_unlock();
> *max_sectors = sectors;
>
> return best_disk;
> @@ -1235,14 +1233,12 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
>
> if (r1bio_existed) {
> /* Need to get the block device name carefully */
> - struct md_rdev *rdev;
> - rcu_read_lock();
> - rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev);
> + struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;
> +
> if (rdev)
> snprintf(b, sizeof(b), "%pg", rdev->bdev);
> else
> strcpy(b, "???");
> - rcu_read_unlock();
> }
>
> /*
> @@ -1396,10 +1392,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>
> disks = conf->raid_disks * 2;
> blocked_rdev = NULL;
> - rcu_read_lock();
> max_sectors = r1_bio->sectors;
> for (i = 0; i < disks; i++) {
> - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
> + struct md_rdev *rdev = conf->mirrors[i].rdev;
>
> /*
> * The write-behind io is only attempted on drives marked as
> @@ -1465,7 +1460,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
> }
> r1_bio->bios[i] = bio;
> }
> - rcu_read_unlock();
>
> if (unlikely(blocked_rdev)) {
> /* Wait for this device to become unblocked */
> @@ -1617,15 +1611,16 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev)
> struct r1conf *conf = mddev->private;
> int i;
>
> + lockdep_assert_held(&mddev->lock);
> +
> seq_printf(seq, " [%d/%d] [", conf->raid_disks,
> conf->raid_disks - mddev->degraded);
> - rcu_read_lock();
> for (i = 0; i < conf->raid_disks; i++) {
> - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
> + struct md_rdev *rdev = READ_ONCE(conf->mirrors[i].rdev);
> +
> seq_printf(seq, "%s",
> rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
> }
> - rcu_read_unlock();
> seq_printf(seq, "]");
> }
>
> @@ -1785,7 +1780,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
> */
> if (rdev->saved_raid_disk < 0)
> conf->fullsync = 1;
> - rcu_assign_pointer(p->rdev, rdev);
> + WRITE_ONCE(p->rdev, rdev);
> break;
> }
> if (test_bit(WantReplacement, &p->rdev->flags) &&
> @@ -1801,7 +1796,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
> rdev->raid_disk = repl_slot;
> err = 0;
> conf->fullsync = 1;
> - rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
> + WRITE_ONCE(p[conf->raid_disks].rdev, rdev);
> }
>
> return err;
> @@ -1835,7 +1830,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
> err = -EBUSY;
> goto abort;
> }
> - p->rdev = NULL;
> + WRITE_ONCE(p->rdev, NULL);
> if (conf->mirrors[conf->raid_disks + number].rdev) {
> /* We just removed a device that is being replaced.
> * Move down the replacement. We drain all IO before
> @@ -1856,7 +1851,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
> goto abort;
> }
> clear_bit(Replacement, &repl->flags);
> - p->rdev = repl;
> + WRITE_ONCE(p->rdev, repl);
> conf->mirrors[conf->raid_disks + number].rdev = NULL;
> unfreeze_array(conf);
> }
> @@ -2253,8 +2248,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
> sector_t first_bad;
> int bad_sectors;
>
> - rcu_read_lock();
> - rdev = rcu_dereference(conf->mirrors[d].rdev);
> + rdev = conf->mirrors[d].rdev;
> if (rdev &&
> (test_bit(In_sync, &rdev->flags) ||
> (!test_bit(Faulty, &rdev->flags) &&
> @@ -2262,15 +2256,14 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
> is_badblock(rdev, sect, s,
> &first_bad, &bad_sectors) == 0) {
> atomic_inc(&rdev->nr_pending);
> - rcu_read_unlock();
> if (sync_page_io(rdev, sect, s<<9,
> conf->tmppage, REQ_OP_READ, false))
> success = 1;
> rdev_dec_pending(rdev, mddev);
> if (success)
> break;
> - } else
> - rcu_read_unlock();
> + }
> +
> d++;
> if (d == conf->raid_disks * 2)
> d = 0;
> @@ -2289,29 +2282,24 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
> if (d==0)
> d = conf->raid_disks * 2;
> d--;
> - rcu_read_lock();
> - rdev = rcu_dereference(conf->mirrors[d].rdev);
> + rdev = conf->mirrors[d].rdev;
> if (rdev &&
> !test_bit(Faulty, &rdev->flags)) {
> atomic_inc(&rdev->nr_pending);
> - rcu_read_unlock();
> r1_sync_page_io(rdev, sect, s,
> conf->tmppage, WRITE);
> rdev_dec_pending(rdev, mddev);
> - } else
> - rcu_read_unlock();
> + }
> }
> d = start;
> while (d != read_disk) {
> if (d==0)
> d = conf->raid_disks * 2;
> d--;
> - rcu_read_lock();
> - rdev = rcu_dereference(conf->mirrors[d].rdev);
> + rdev = conf->mirrors[d].rdev;
> if (rdev &&
> !test_bit(Faulty, &rdev->flags)) {
> atomic_inc(&rdev->nr_pending);
> - rcu_read_unlock();
> if (r1_sync_page_io(rdev, sect, s,
> conf->tmppage, READ)) {
> atomic_add(s, &rdev->corrected_errors);
> @@ -2322,8 +2310,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
> rdev->bdev);
> }
> rdev_dec_pending(rdev, mddev);
> - } else
> - rcu_read_unlock();
> + }
> }
> sectors -= s;
> sect += s;
> @@ -2704,7 +2691,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
>
> r1_bio = raid1_alloc_init_r1buf(conf);
>
> - rcu_read_lock();
> /*
> * If we get a correctably read error during resync or recovery,
> * we might want to read from a different device. So we
> @@ -2725,7 +2711,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
> struct md_rdev *rdev;
> bio = r1_bio->bios[i];
>
> - rdev = rcu_dereference(conf->mirrors[i].rdev);
> + rdev = conf->mirrors[i].rdev;
> if (rdev == NULL ||
> test_bit(Faulty, &rdev->flags)) {
> if (i < conf->raid_disks)
> @@ -2783,7 +2769,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
> bio->bi_opf |= MD_FAILFAST;
> }
> }
> - rcu_read_unlock();
> if (disk < 0)
> disk = wonly;
> r1_bio->read_disk = disk;
> --
> 2.39.2
>

2023-10-18 17:59:25

by Song Liu

[permalink] [raw]
Subject: Re: [PATCH -next 0/6] md: remove rcu protection to access rdev from conf

On Sun, Oct 15, 2023 at 6:28 PM Yu Kuai <[email protected]> wrote:
>
> From: Yu Kuai <[email protected]>
>
> Yu Kuai (6):
> md: remove useless debug code to print configuration
> md: remove flag RemoveSynchronized
> md/raid1: remove rcu protection to access rdev from conf
> md/raid10: remove rcu protection to access rdev from conf
> md/raid5: remove rcu protection to access rdev from conf
> md/md-multipath: remove rcu protection to access rdev from conf

While a cover letter is highly recommended for a patchset. An empty
cover letter like this doesn't really add value. If the description of each
patch is clear enough, it is OK to not have a cover-letter.

Thanks,
Song

>
> drivers/md/md-multipath.c | 29 ++---
> drivers/md/md.c | 37 +-----
> drivers/md/raid1.c | 94 ++++-----------
> drivers/md/raid10.c | 248 +++++++++-----------------------------
> drivers/md/raid5-cache.c | 11 +-
> drivers/md/raid5-ppl.c | 16 +--
> drivers/md/raid5.c | 225 ++++++++++------------------------
> drivers/md/raid5.h | 4 +-
> 8 files changed, 163 insertions(+), 501 deletions(-)
>
> --
> 2.39.2
>

2023-10-19 01:04:56

by Yu Kuai

[permalink] [raw]
Subject: Re: [PATCH -next 0/6] md: remove rcu protection to access rdev from conf

Hi,

在 2023/10/19 1:58, Song Liu 写道:
> On Sun, Oct 15, 2023 at 6:28 PM Yu Kuai <[email protected]> wrote:
>>
>> From: Yu Kuai <[email protected]>
>>
>> Yu Kuai (6):
>> md: remove useless debug code to print configuration
>> md: remove flag RemoveSynchronized
>> md/raid1: remove rcu protection to access rdev from conf
>> md/raid10: remove rcu protection to access rdev from conf
>> md/raid5: remove rcu protection to access rdev from conf
>> md/md-multipath: remove rcu protection to access rdev from conf
>
> While a cover letter is highly recommended for a patchset. An empty
> cover letter like this doesn't really add value. If the description of each
> patch is clear enough, it is OK to not have a cover-letter.

Yes, I was hoping the commit message in each patch is clear enough; I'll try
to explain everything in more detail.

Thanks,
Kuai

>
> Thanks,
> Song
>
>>
>> drivers/md/md-multipath.c | 29 ++---
>> drivers/md/md.c | 37 +-----
>> drivers/md/raid1.c | 94 ++++-----------
>> drivers/md/raid10.c | 248 +++++++++-----------------------------
>> drivers/md/raid5-cache.c | 11 +-
>> drivers/md/raid5-ppl.c | 16 +--
>> drivers/md/raid5.c | 225 ++++++++++------------------------
>> drivers/md/raid5.h | 4 +-
>> 8 files changed, 163 insertions(+), 501 deletions(-)
>>
>> --
>> 2.39.2
>>
> .
>

2023-10-19 01:59:51

by Yu Kuai

[permalink] [raw]
Subject: Re: [PATCH -next 0/6] md: remove rcu protection to access rdev from conf

Hi,

在 2023/10/19 9:04, Yu Kuai 写道:
> Hi,
>
> 在 2023/10/19 1:58, Song Liu 写道:
>> On Sun, Oct 15, 2023 at 6:28 PM Yu Kuai <[email protected]> wrote:
>>>
>>> From: Yu Kuai <[email protected]>
>>>
>>> Yu Kuai (6):
>>>    md: remove useless debug code to print configuration
>>>    md: remove flag RemoveSynchronized
>>>    md/raid1: remove rcu protection to access rdev from conf
>>>    md/raid10: remove rcu protection to access rdev from conf
>>>    md/raid5: remove rcu protection to access rdev from conf
>>>    md/md-multipath: remove rcu protection to access rdev from conf
>>
>> While a cover letter is highly recommended for a patchset. An empty
>> cover letter like this doesn't really add value. If the description of
>> each
>> patch is clear enough, it is OK to not have a cover-letter.
>
> Yes, I was hoping the commit message in each patch is clear enough; I'll try
> to explain everything in more detail.

I'll add following cover letter in the next version:

The lifetime of rdev:

1. md_import_device() generate a rdev based on underlying disk;

mddev_lock()
rdev = kzalloc();
rdev->bdev = blkdev_get_by_dev();
mddev_unlock()

2. bind_rdev_to_array() add this rdev to mddev->disks;

mddev_lock()
kobject_add(&rdev->kobj, &mddev->kobj, ...);
list_add_rcu(&rdev->same_set, &mddev->disks);
mddev_unlock()

3. remove_and_add_spares() add this rdev to conf;

mddev_lock()
rdev_addable();
pers->hot_add_disk();
rcu_assign_pointer(conf->rdev, rdev);
mddev_unlock()

4. Use this array with rdev;

5. remove_and_add_spares() remove rdev from conf;

// triggered by sysfs/ioctl
mddev_lock()
rdev_removeable();
pers->hot_remove_disk();
rcu_assign_pointer(conf->rdev, NULL);
synchronize_rcu();
mddev_unlock()

// triggered by daemon
mddev_lock()
rdev_removeable();
synchronize_rcu(); -> this can't protect accessing rdev from conf
pers->hot_remove_disk();
rcu_assign_pointer(conf->rdev, NULL);
mddev_unlock()

6. md_kick_rdev_from_array() remove rdev from mddev->disks;

mddev_lock()
list_del_rcu(&rdev->same_set);
synchronize_rcu();
list_add(&rdev->same_set, &mddev->deleting)
mddev_unlock()
export_rdev

There are two separate rcu protections for rdev, and this patchset removes
the protection of conf (steps 3 and 5), because it's safe to access rdev
from conf in the following cases:

- If 'reconfig_mutex' is held, because rdev can't be added to or removed from
conf;
- If there is normal IO inflight, because mddev_suspend() will prevent
rdev to be added or removed to conf;
- If sync thread is running, because remove_and_add_spares() can only
be called from daemon thread when sync thread is done, and
'MD_RECOVERY_RUNNING' is also checked for ioctl/sysfs;
- if rcu_read_lock() or any spinlock is held, because synchronize_rcu()
from step 6 prevent rdev to be freed until rcu_read_unlock() or
spinlock is released;

Thanks,
Kuai

>
> Thanks,
> Kuai
>
>>
>> Thanks,
>> Song
>>
>>>
>>>   drivers/md/md-multipath.c |  29 ++---
>>>   drivers/md/md.c           |  37 +-----
>>>   drivers/md/raid1.c        |  94 ++++-----------
>>>   drivers/md/raid10.c       | 248 +++++++++-----------------------------
>>>   drivers/md/raid5-cache.c  |  11 +-
>>>   drivers/md/raid5-ppl.c    |  16 +--
>>>   drivers/md/raid5.c        | 225 ++++++++++------------------------
>>>   drivers/md/raid5.h        |   4 +-
>>>   8 files changed, 163 insertions(+), 501 deletions(-)
>>>
>>> --
>>> 2.39.2
>>>
>> .
>>
>
> .
>