From: Yu Kuai <[email protected]>
Changes in v3:
- add patch 2, and fix that setup_conf() is missing in patch3;
- add some review tag from Xiao Ni(other than patch 2,3);
Changes in v2:
- add new conter in conf for patch 2;
- fix the case choose next idle while there is no other idle disk in
patch 3;
- add some review tag from Xiao Ni for patch 1, 4-8
The original idea is that Paul want to optimize raid1 read
performance([1]), however, we think that the original code for
read_balance() is quite complex, and we don't want to add more
complexity. Hence we decide to refactor read_balance() first, to make
code cleaner and easier for follow up.
Before this patchset, read_balance() has many local variables and many
branches, it want to consider all the scenarios in one iteration. The
idea of this patch is to divide them into 4 different steps:
1) If resync is in progress, find the first usable disk, patch 5;
Otherwise:
2) Loop through all disks and skipping slow disks and disks with bad
blocks, choose the best disk, patch 10. If no disk is found:
3) Look for disks with bad blocks and choose the one with most number of
sectors, patch 8. If no disk is found:
4) Choose first found slow disk with no bad blocks, or slow disk with
most number of sectors, patch 7.
Note that step 3) and step 4) are super code path, and performance
should not be considered.
And after this patchset, we'll continue to optimize read_balance for
step 2), specifically how to choose the best rdev to read.
[1] https://lore.kernel.org/all/[email protected]/
Yu Kuai (11):
md: add a new helper rdev_has_badblock()
md/raid1: factor out helpers to add rdev to conf
md/raid1: record nonrot rdevs while adding/removing rdevs to conf
md/raid1: fix choose next idle in read_balance()
md/raid1-10: add a helper raid1_check_read_range()
md/raid1-10: factor out a new helper raid1_should_read_first()
md/raid1: factor out read_first_rdev() from read_balance()
md/raid1: factor out choose_slow_rdev() from read_balance()
md/raid1: factor out choose_bb_rdev() from read_balance()
md/raid1: factor out the code to manage sequential IO
md/raid1: factor out helpers to choose the best rdev from
read_balance()
drivers/md/md.h | 11 +
drivers/md/raid1-10.c | 69 ++++++
drivers/md/raid1.c | 539 +++++++++++++++++++++++++-----------------
drivers/md/raid1.h | 1 +
drivers/md/raid10.c | 58 ++---
drivers/md/raid5.c | 35 +--
6 files changed, 437 insertions(+), 276 deletions(-)
--
2.39.2
From: Yu Kuai <[email protected]>
The way that best rdev is chosen:
1) If the read is sequential from one rdev:
- if rdev is rotational, use this rdev;
- if rdev is non-rotational, use this rdev until total read length
exceed disk opt io size;
2) If the read is not sequential:
- if there is idle disk, use it, otherwise:
- if the array has non-rotational disk, choose the rdev with minimal
inflight IO;
- if all the underlaying disks are rotational disk, choose the rdev
with closest IO;
There are no functional changes, just to make code cleaner and prepare
for following refactor.
Co-developed-by: Paul Luse <[email protected]>
Signed-off-by: Paul Luse <[email protected]>
Signed-off-by: Yu Kuai <[email protected]>
Reviewed-by: Xiao Ni <[email protected]>
---
drivers/md/raid1.c | 175 +++++++++++++++++++++++++--------------------
1 file changed, 98 insertions(+), 77 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 526f0d977040..c00f2aefbc56 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -730,74 +730,71 @@ static bool should_choose_next(struct r1conf *conf, int disk)
mirror->next_seq_sect - opt_iosize >= mirror->seq_start;
}
-/*
- * This routine returns the disk from which the requested read should
- * be done. There is a per-array 'next expected sequential IO' sector
- * number - if this matches on the next IO then we use the last disk.
- * There is also a per-disk 'last know head position' sector that is
- * maintained from IRQ contexts, both the normal and the resync IO
- * completion handlers update this position correctly. If there is no
- * perfect sequential match then we pick the disk whose head is closest.
- *
- * If there are 2 mirrors in the same 2 devices, performance degrades
- * because position is mirror, not device based.
- *
- * The rdev for the device selected will have nr_pending incremented.
- */
-static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors)
+static bool rdev_readable(struct md_rdev *rdev, struct r1bio *r1_bio)
{
- const sector_t this_sector = r1_bio->sector;
- int sectors;
- int best_good_sectors;
- int best_disk, best_dist_disk, best_pending_disk, sequential_disk;
- int disk;
- sector_t best_dist;
- unsigned int min_pending;
- struct md_rdev *rdev;
+ if (!rdev || test_bit(Faulty, &rdev->flags))
+ return false;
- retry:
- sectors = r1_bio->sectors;
- best_disk = -1;
- best_dist_disk = -1;
- sequential_disk = -1;
- best_dist = MaxSector;
- best_pending_disk = -1;
- min_pending = UINT_MAX;
- best_good_sectors = 0;
- clear_bit(R1BIO_FailFast, &r1_bio->state);
+ /* still in recovery */
+ if (!test_bit(In_sync, &rdev->flags) &&
+ rdev->recovery_offset < r1_bio->sector + r1_bio->sectors)
+ return false;
- if (raid1_should_read_first(conf->mddev, this_sector, sectors))
- return choose_first_rdev(conf, r1_bio, max_sectors);
+ /* don't read from slow disk unless have to */
+ if (test_bit(WriteMostly, &rdev->flags))
+ return false;
+
+ /* don't split IO for bad blocks unless have to */
+ if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors))
+ return false;
+
+ return true;
+}
+
+struct read_balance_ctl {
+ sector_t closest_dist;
+ int closest_dist_disk;
+ int min_pending;
+ int min_pending_disk;
+ int sequential_disk;
+ int readable_disks;
+};
+
+static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio)
+{
+ int disk;
+ struct read_balance_ctl ctl = {
+ .closest_dist_disk = -1,
+ .closest_dist = MaxSector,
+ .min_pending_disk = -1,
+ .min_pending = UINT_MAX,
+ .sequential_disk = -1,
+ };
for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
+ struct md_rdev *rdev;
sector_t dist;
unsigned int pending;
- rdev = conf->mirrors[disk].rdev;
- if (r1_bio->bios[disk] == IO_BLOCKED
- || rdev == NULL
- || test_bit(Faulty, &rdev->flags))
- continue;
- if (!test_bit(In_sync, &rdev->flags) &&
- rdev->recovery_offset < this_sector + sectors)
- continue;
- if (test_bit(WriteMostly, &rdev->flags))
+ if (r1_bio->bios[disk] == IO_BLOCKED)
continue;
- if (rdev_has_badblock(rdev, this_sector, sectors))
+
+ rdev = conf->mirrors[disk].rdev;
+ if (!rdev_readable(rdev, r1_bio))
continue;
- if (best_disk >= 0)
- /* At least two disks to choose from so failfast is OK */
+ /* At least two disks to choose from so failfast is OK */
+ if (ctl.readable_disks++ == 1)
set_bit(R1BIO_FailFast, &r1_bio->state);
pending = atomic_read(&rdev->nr_pending);
- dist = abs(this_sector - conf->mirrors[disk].head_position);
+ dist = abs(r1_bio->sector - conf->mirrors[disk].head_position);
+
/* Don't change to another disk for sequential reads */
if (is_sequential(conf, disk, r1_bio)) {
- if (!should_choose_next(conf, disk)) {
- best_disk = disk;
- break;
- }
+ if (!should_choose_next(conf, disk))
+ return disk;
+
/*
* Add 'pending' to avoid choosing this disk if
* there is other idle disk.
@@ -807,17 +804,17 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
* If there is no other idle disk, this disk
* will be chosen.
*/
- sequential_disk = disk;
+ ctl.sequential_disk = disk;
}
- if (min_pending > pending) {
- min_pending = pending;
- best_pending_disk = disk;
+ if (ctl.min_pending > pending) {
+ ctl.min_pending = pending;
+ ctl.min_pending_disk = disk;
}
- if (dist < best_dist) {
- best_dist = dist;
- best_dist_disk = disk;
+ if (ctl.closest_dist > dist) {
+ ctl.closest_dist = dist;
+ ctl.closest_dist_disk = disk;
}
}
@@ -825,8 +822,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
* sequential IO size exceeds optimal iosize, however, there is no other
* idle disk, so choose the sequential disk.
*/
- if (best_disk == -1 && min_pending != 0)
- best_disk = sequential_disk;
+ if (ctl.sequential_disk != -1 && ctl.min_pending != 0)
+ return ctl.sequential_disk;
/*
* If all disks are rotational, choose the closest disk. If any disk is
@@ -834,25 +831,49 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
* disk is rotational, which might/might not be optimal for raids with
* mixed ratation/non-rotational disks depending on workload.
*/
- if (best_disk == -1) {
- if (READ_ONCE(conf->nonrot_disks) || min_pending == 0)
- best_disk = best_pending_disk;
- else
- best_disk = best_dist_disk;
- }
+ if (ctl.min_pending_disk != -1 &&
+ (READ_ONCE(conf->nonrot_disks) || ctl.min_pending == 0))
+ return ctl.min_pending_disk;
+ else
+ return ctl.closest_dist_disk;
+}
- if (best_disk >= 0) {
- rdev = conf->mirrors[best_disk].rdev;
- if (!rdev)
- goto retry;
+/*
+ * This routine returns the disk from which the requested read should be done.
+ *
+ * 1) If resync is in progress, find the first usable disk and use it even if it
+ * has some bad blocks.
+ *
+ * 2) Now that there is no resync, loop through all disks and skipping slow
+ * disks and disks with bad blocks for now. Only pay attention to key disk
+ * choice.
+ *
+ * 3) If we've made it this far, now look for disks with bad blocks and choose
+ * the one with most number of sectors.
+ *
+ * 4) If we are all the way at the end, we have no choice but to use a disk even
+ * if it is write mostly.
+ *
+ * The rdev for the device selected will have nr_pending incremented.
+ */
+static int read_balance(struct r1conf *conf, struct r1bio *r1_bio,
+ int *max_sectors)
+{
+ int disk;
- sectors = best_good_sectors;
- update_read_sectors(conf, disk, this_sector, sectors);
- }
- *max_sectors = sectors;
+ clear_bit(R1BIO_FailFast, &r1_bio->state);
+
+ if (raid1_should_read_first(conf->mddev, r1_bio->sector,
+ r1_bio->sectors))
+ return choose_first_rdev(conf, r1_bio, max_sectors);
- if (best_disk >= 0)
- return best_disk;
+ disk = choose_best_rdev(conf, r1_bio);
+ if (disk >= 0) {
+ *max_sectors = r1_bio->sectors;
+ update_read_sectors(conf, disk, r1_bio->sector,
+ r1_bio->sectors);
+ return disk;
+ }
/*
* If we are here it means we didn't find a perfectly good disk so
--
2.39.2
From: Yu Kuai <[email protected]>
There are no functional changes, just make code cleaner and prepare to
record disk non-rotational information while adding and removing rdev to
conf
Signed-off-by: Yu Kuai <[email protected]>
---
drivers/md/raid1.c | 74 ++++++++++++++++++++++++++++------------------
1 file changed, 46 insertions(+), 28 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a145fe48b9ce..1940ff398c23 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1757,6 +1757,40 @@ static int raid1_spare_active(struct mddev *mddev)
return count;
}
+static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk)
+{
+ struct raid1_info *info = conf->mirrors + disk;
+
+ if (info->rdev)
+ return false;
+
+ rdev->raid_disk = disk;
+ info->head_position = 0;
+ info->seq_start = MaxSector;
+ WRITE_ONCE(info->rdev, rdev);
+
+ return true;
+}
+
+static bool raid1_remove_conf(struct r1conf *conf, int disk)
+{
+ struct raid1_info *info = conf->mirrors + disk;
+ struct md_rdev *rdev = info->rdev;
+
+ if (!rdev || test_bit(In_sync, &rdev->flags) ||
+ atomic_read(&rdev->nr_pending))
+ return false;
+
+ /* Only remove non-faulty devices if recovery is not possible. */
+ if (!test_bit(Faulty, &rdev->flags) &&
+ rdev->mddev->recovery_disabled != conf->recovery_disabled &&
+ rdev->mddev->degraded < conf->raid_disks)
+ return false;
+
+ WRITE_ONCE(info->rdev, NULL);
+ return true;
+}
+
static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct r1conf *conf = mddev->private;
@@ -1792,15 +1826,13 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
- p->head_position = 0;
- rdev->raid_disk = mirror;
+ raid1_add_conf(conf, rdev, mirror);
err = 0;
/* As all devices are equivalent, we don't need a full recovery
* if this was recently any drive of the array
*/
if (rdev->saved_raid_disk < 0)
conf->fullsync = 1;
- WRITE_ONCE(p->rdev, rdev);
break;
}
if (test_bit(WantReplacement, &p->rdev->flags) &&
@@ -1810,13 +1842,11 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (err && repl_slot >= 0) {
/* Add this device as a replacement */
- p = conf->mirrors + repl_slot;
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
- rdev->raid_disk = repl_slot;
+ raid1_add_conf(conf, rdev, repl_slot);
err = 0;
conf->fullsync = 1;
- WRITE_ONCE(p[conf->raid_disks].rdev, rdev);
}
print_conf(conf);
@@ -1833,27 +1863,20 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
if (unlikely(number >= conf->raid_disks))
goto abort;
- if (rdev != p->rdev)
- p = conf->mirrors + conf->raid_disks + number;
+ if (rdev != p->rdev) {
+ number += conf->raid_disks;
+ p = conf->mirrors + number;
+ }
print_conf(conf);
if (rdev == p->rdev) {
- if (test_bit(In_sync, &rdev->flags) ||
- atomic_read(&rdev->nr_pending)) {
- err = -EBUSY;
- goto abort;
- }
- /* Only remove non-faulty devices if recovery
- * is not possible.
- */
- if (!test_bit(Faulty, &rdev->flags) &&
- mddev->recovery_disabled != conf->recovery_disabled &&
- mddev->degraded < conf->raid_disks) {
+ if (!raid1_remove_conf(conf, number)) {
err = -EBUSY;
goto abort;
}
- WRITE_ONCE(p->rdev, NULL);
- if (conf->mirrors[conf->raid_disks + number].rdev) {
+
+ if (number < conf->raid_disks &&
+ conf->mirrors[conf->raid_disks + number].rdev) {
/* We just removed a device that is being replaced.
* Move down the replacement. We drain all IO before
* doing this to avoid confusion.
@@ -3000,15 +3023,10 @@ static struct r1conf *setup_conf(struct mddev *mddev)
|| disk_idx < 0)
continue;
if (test_bit(Replacement, &rdev->flags))
- disk = conf->mirrors + mddev->raid_disks + disk_idx;
- else
- disk = conf->mirrors + disk_idx;
+ disk_idx += mddev->raid_disks;
- if (disk->rdev)
+ if (!raid1_add_conf(conf, rdev, disk_idx))
goto abort;
- disk->rdev = rdev;
- disk->head_position = 0;
- disk->seq_start = MaxSector;
}
conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
--
2.39.2
From: Yu Kuai <[email protected]>
The current api is_badblock() must pass in 'first_bad' and
'bad_sectors', however, many caller just want to know if there are
badblocks or not, and these caller must define two local variable that
will never be used.
Add a new helper rdev_has_badblock() that will only return if there are
badblocks or not, remove unnecessary local variables and replace
is_badblock() with the new helper in many places.
There are no functional changes, and the new helper will also be used
later to refactor read_balance().
Co-developed-by: Paul Luse <[email protected]>
Signed-off-by: Paul Luse <[email protected]>
Signed-off-by: Yu Kuai <[email protected]>
Reviewed-by: Xiao Ni <[email protected]>
---
drivers/md/md.h | 10 ++++++++++
drivers/md/raid1.c | 26 +++++++-------------------
drivers/md/raid10.c | 45 ++++++++++++++-------------------------------
drivers/md/raid5.c | 35 +++++++++++++----------------------
4 files changed, 44 insertions(+), 72 deletions(-)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 8d881cc59799..a49ab04ab707 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -222,6 +222,16 @@ static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
}
return 0;
}
+
+static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s,
+ int sectors)
+{
+ sector_t first_bad;
+ int bad_sectors;
+
+ return is_badblock(rdev, s, sectors, &first_bad, &bad_sectors);
+}
+
extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new);
extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 286f8b16c7bd..a145fe48b9ce 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -498,9 +498,6 @@ static void raid1_end_write_request(struct bio *bio)
* to user-side. So if something waits for IO, then it
* will wait for the 'master' bio.
*/
- sector_t first_bad;
- int bad_sectors;
-
r1_bio->bios[mirror] = NULL;
to_put = bio;
/*
@@ -516,8 +513,8 @@ static void raid1_end_write_request(struct bio *bio)
set_bit(R1BIO_Uptodate, &r1_bio->state);
/* Maybe we can clear some bad blocks. */
- if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
- &first_bad, &bad_sectors) && !discard_error) {
+ if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) &&
+ !discard_error) {
r1_bio->bios[mirror] = IO_MADE_GOOD;
set_bit(R1BIO_MadeGood, &r1_bio->state);
}
@@ -1944,8 +1941,6 @@ static void end_sync_write(struct bio *bio)
struct r1bio *r1_bio = get_resync_r1bio(bio);
struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private;
- sector_t first_bad;
- int bad_sectors;
struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
if (!uptodate) {
@@ -1955,14 +1950,11 @@ static void end_sync_write(struct bio *bio)
set_bit(MD_RECOVERY_NEEDED, &
mddev->recovery);
set_bit(R1BIO_WriteError, &r1_bio->state);
- } else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
- &first_bad, &bad_sectors) &&
- !is_badblock(conf->mirrors[r1_bio->read_disk].rdev,
- r1_bio->sector,
- r1_bio->sectors,
- &first_bad, &bad_sectors)
- )
+ } else if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) &&
+ !rdev_has_badblock(conf->mirrors[r1_bio->read_disk].rdev,
+ r1_bio->sector, r1_bio->sectors)) {
set_bit(R1BIO_MadeGood, &r1_bio->state);
+ }
put_sync_write_buf(r1_bio, uptodate);
}
@@ -2279,16 +2271,12 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio)
s = PAGE_SIZE >> 9;
do {
- sector_t first_bad;
- int bad_sectors;
-
rdev = conf->mirrors[d].rdev;
if (rdev &&
(test_bit(In_sync, &rdev->flags) ||
(!test_bit(Faulty, &rdev->flags) &&
rdev->recovery_offset >= sect + s)) &&
- is_badblock(rdev, sect, s,
- &first_bad, &bad_sectors) == 0) {
+ rdev_has_badblock(rdev, sect, s) == 0) {
atomic_inc(&rdev->nr_pending);
if (sync_page_io(rdev, sect, s<<9,
conf->tmppage, REQ_OP_READ, false))
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7412066ea22c..d5a7a621f0f0 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -518,11 +518,7 @@ static void raid10_end_write_request(struct bio *bio)
* The 'master' represents the composite IO operation to
* user-side. So if something waits for IO, then it will
* wait for the 'master' bio.
- */
- sector_t first_bad;
- int bad_sectors;
-
- /*
+ *
* Do not set R10BIO_Uptodate if the current device is
* rebuilding or Faulty. This is because we cannot use
* such device for properly reading the data back (we could
@@ -535,10 +531,9 @@ static void raid10_end_write_request(struct bio *bio)
set_bit(R10BIO_Uptodate, &r10_bio->state);
/* Maybe we can clear some bad blocks. */
- if (is_badblock(rdev,
- r10_bio->devs[slot].addr,
- r10_bio->sectors,
- &first_bad, &bad_sectors) && !discard_error) {
+ if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
+ r10_bio->sectors) &&
+ !discard_error) {
bio_put(bio);
if (repl)
r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
@@ -1330,10 +1325,7 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
}
if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
- sector_t first_bad;
sector_t dev_sector = r10_bio->devs[i].addr;
- int bad_sectors;
- int is_bad;
/*
* Discard request doesn't care the write result
@@ -1342,9 +1334,8 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
if (!r10_bio->sectors)
continue;
- is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors,
- &first_bad, &bad_sectors);
- if (is_bad < 0) {
+ if (rdev_has_badblock(rdev, dev_sector,
+ r10_bio->sectors) < 0) {
/*
* Mustn't write here until the bad block
* is acknowledged
@@ -2290,8 +2281,6 @@ static void end_sync_write(struct bio *bio)
struct mddev *mddev = r10_bio->mddev;
struct r10conf *conf = mddev->private;
int d;
- sector_t first_bad;
- int bad_sectors;
int slot;
int repl;
struct md_rdev *rdev = NULL;
@@ -2312,11 +2301,10 @@ static void end_sync_write(struct bio *bio)
&rdev->mddev->recovery);
set_bit(R10BIO_WriteError, &r10_bio->state);
}
- } else if (is_badblock(rdev,
- r10_bio->devs[slot].addr,
- r10_bio->sectors,
- &first_bad, &bad_sectors))
+ } else if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
+ r10_bio->sectors)) {
set_bit(R10BIO_MadeGood, &r10_bio->state);
+ }
rdev_dec_pending(rdev, mddev);
@@ -2597,11 +2585,8 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
int sectors, struct page *page, enum req_op op)
{
- sector_t first_bad;
- int bad_sectors;
-
- if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
- && (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags)))
+ if (rdev_has_badblock(rdev, sector, sectors) &&
+ (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags)))
return -1;
if (sync_page_io(rdev, sector, sectors << 9, page, op, false))
/* success */
@@ -2658,16 +2643,14 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
s = PAGE_SIZE >> 9;
do {
- sector_t first_bad;
- int bad_sectors;
-
d = r10_bio->devs[sl].devnum;
rdev = conf->mirrors[d].rdev;
if (rdev &&
test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) &&
- is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
- &first_bad, &bad_sectors) == 0) {
+ rdev_has_badblock(rdev,
+ r10_bio->devs[sl].addr + sect,
+ s) == 0) {
atomic_inc(&rdev->nr_pending);
success = sync_page_io(rdev,
r10_bio->devs[sl].addr +
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 14f2cf75abbd..9241e95ef55c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1210,10 +1210,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
*/
while (op_is_write(op) && rdev &&
test_bit(WriteErrorSeen, &rdev->flags)) {
- sector_t first_bad;
- int bad_sectors;
- int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
- &first_bad, &bad_sectors);
+ int bad = rdev_has_badblock(rdev, sh->sector,
+ RAID5_STRIPE_SECTORS(conf));
if (!bad)
break;
@@ -2855,8 +2853,6 @@ static void raid5_end_write_request(struct bio *bi)
struct r5conf *conf = sh->raid_conf;
int disks = sh->disks, i;
struct md_rdev *rdev;
- sector_t first_bad;
- int bad_sectors;
int replacement = 0;
for (i = 0 ; i < disks; i++) {
@@ -2888,9 +2884,8 @@ static void raid5_end_write_request(struct bio *bi)
if (replacement) {
if (bi->bi_status)
md_error(conf->mddev, rdev);
- else if (is_badblock(rdev, sh->sector,
- RAID5_STRIPE_SECTORS(conf),
- &first_bad, &bad_sectors))
+ else if (rdev_has_badblock(rdev, sh->sector,
+ RAID5_STRIPE_SECTORS(conf)))
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
} else {
if (bi->bi_status) {
@@ -2900,9 +2895,8 @@ static void raid5_end_write_request(struct bio *bi)
if (!test_and_set_bit(WantReplacement, &rdev->flags))
set_bit(MD_RECOVERY_NEEDED,
&rdev->mddev->recovery);
- } else if (is_badblock(rdev, sh->sector,
- RAID5_STRIPE_SECTORS(conf),
- &first_bad, &bad_sectors)) {
+ } else if (rdev_has_badblock(rdev, sh->sector,
+ RAID5_STRIPE_SECTORS(conf))) {
set_bit(R5_MadeGood, &sh->dev[i].flags);
if (test_bit(R5_ReadError, &sh->dev[i].flags))
/* That was a successful write so make
@@ -4674,8 +4668,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
/* Now to look around and see what can be done */
for (i=disks; i--; ) {
struct md_rdev *rdev;
- sector_t first_bad;
- int bad_sectors;
int is_bad = 0;
dev = &sh->dev[i];
@@ -4719,8 +4711,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
rdev = conf->disks[i].replacement;
if (rdev && !test_bit(Faulty, &rdev->flags) &&
rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
- !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
- &first_bad, &bad_sectors))
+ !rdev_has_badblock(rdev, sh->sector,
+ RAID5_STRIPE_SECTORS(conf)))
set_bit(R5_ReadRepl, &dev->flags);
else {
if (rdev && !test_bit(Faulty, &rdev->flags))
@@ -4733,8 +4725,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (rdev && test_bit(Faulty, &rdev->flags))
rdev = NULL;
if (rdev) {
- is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
- &first_bad, &bad_sectors);
+ is_bad = rdev_has_badblock(rdev, sh->sector,
+ RAID5_STRIPE_SECTORS(conf));
if (s->blocked_rdev == NULL
&& (test_bit(Blocked, &rdev->flags)
|| is_bad < 0)) {
@@ -5463,8 +5455,8 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
struct r5conf *conf = mddev->private;
struct bio *align_bio;
struct md_rdev *rdev;
- sector_t sector, end_sector, first_bad;
- int bad_sectors, dd_idx;
+ sector_t sector, end_sector;
+ int dd_idx;
bool did_inc;
if (!in_chunk_boundary(mddev, raid_bio)) {
@@ -5493,8 +5485,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
atomic_inc(&rdev->nr_pending);
- if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad,
- &bad_sectors)) {
+ if (rdev_has_badblock(rdev, sector, bio_sectors(raid_bio))) {
rdev_dec_pending(rdev, mddev);
return 0;
}
--
2.39.2
From: Yu Kuai <[email protected]>
Commit 12cee5a8a29e ("md/raid1: prevent merging too large request") add
the case choose next idle in read_balance():
read_balance:
for_each_rdev
if(next_seq_sect == this_sector || dist == 0)
-> sequential reads
best_disk = disk;
if (...)
choose_next_idle = 1
continue;
for_each_rdev
-> iterate next rdev
if (pending == 0)
best_disk = disk;
-> choose the next idle disk
break;
if (choose_next_idle)
-> keep using this rdev if there are no other idle disk
contine
However, commit 2e52d449bcec ("md/raid1: add failfast handling for reads.")
remove the code:
- /* If device is idle, use it */
- if (pending == 0) {
- best_disk = disk;
- break;
- }
Hence choose next idle will never work now, fix this problem by
following:
1) don't set best_disk in this case, read_balance() will choose the best
disk after iterating all the disks;
2) add 'pending' so that other idle disk will be chosen;
3) add a new local variable 'sequential_disk' to record the disk, and if
there is no other idle disk, 'sequential_disk' will be chosen;
Fixes: 2e52d449bcec ("md/raid1: add failfast handling for reads.")
Co-developed-by: Paul Luse <[email protected]>
Signed-off-by: Paul Luse <[email protected]>
Signed-off-by: Yu Kuai <[email protected]>
Reviewed-by: Xiao Ni <[email protected]>
---
drivers/md/raid1.c | 32 ++++++++++++++++++++++----------
1 file changed, 22 insertions(+), 10 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 032a6a6c3730..97db9add27df 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -598,13 +598,12 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
const sector_t this_sector = r1_bio->sector;
int sectors;
int best_good_sectors;
- int best_disk, best_dist_disk, best_pending_disk;
+ int best_disk, best_dist_disk, best_pending_disk, sequential_disk;
int disk;
sector_t best_dist;
unsigned int min_pending;
struct md_rdev *rdev;
int choose_first;
- int choose_next_idle;
/*
* Check if we can balance. We can balance on the whole
@@ -615,11 +614,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
sectors = r1_bio->sectors;
best_disk = -1;
best_dist_disk = -1;
+ sequential_disk = -1;
best_dist = MaxSector;
best_pending_disk = -1;
min_pending = UINT_MAX;
best_good_sectors = 0;
- choose_next_idle = 0;
clear_bit(R1BIO_FailFast, &r1_bio->state);
if ((conf->mddev->recovery_cp < this_sector + sectors) ||
@@ -712,7 +711,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
struct raid1_info *mirror = &conf->mirrors[disk];
- best_disk = disk;
/*
* If buffered sequential IO size exceeds optimal
* iosize, check if there is idle disk. If yes, choose
@@ -731,15 +729,22 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
mirror->next_seq_sect > opt_iosize &&
mirror->next_seq_sect - opt_iosize >=
mirror->seq_start) {
- choose_next_idle = 1;
- continue;
+ /*
+ * Add 'pending' to avoid choosing this disk if
+ * there is other idle disk.
+ */
+ pending++;
+ /*
+ * If there is no other idle disk, this disk
+ * will be chosen.
+ */
+ sequential_disk = disk;
+ } else {
+ best_disk = disk;
+ break;
}
- break;
}
- if (choose_next_idle)
- continue;
-
if (min_pending > pending) {
min_pending = pending;
best_pending_disk = disk;
@@ -751,6 +756,13 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
}
}
+ /*
+ * sequential IO size exceeds optimal iosize, however, there is no other
+ * idle disk, so choose the sequential disk.
+ */
+ if (best_disk == -1 && min_pending != 0)
+ best_disk = sequential_disk;
+
/*
* If all disks are rotational, choose the closest disk. If any disk is
* non-rotational, choose the disk with less pending request even the
--
2.39.2
On Wed, Feb 28, 2024 at 7:49 PM Yu Kuai <[email protected]> wrote:
>
> From: Yu Kuai <[email protected]>
>
> Changes in v3:
> - add patch 2, and fix that setup_conf() is missing in patch3;
> - add some review tag from Xiao Ni(other than patch 2,3);
> Changes in v2:
> - add new conter in conf for patch 2;
> - fix the case choose next idle while there is no other idle disk in
> patch 3;
> - add some review tag from Xiao Ni for patch 1, 4-8
>
> The original idea is that Paul want to optimize raid1 read
> performance([1]), however, we think that the original code for
> read_balance() is quite complex, and we don't want to add more
> complexity. Hence we decide to refactor read_balance() first, to make
> code cleaner and easier for follow up.
>
> Before this patchset, read_balance() has many local variables and many
> branches, it want to consider all the scenarios in one iteration. The
> idea of this patch is to divide them into 4 different steps:
>
> 1) If resync is in progress, find the first usable disk, patch 5;
> Otherwise:
> 2) Loop through all disks and skipping slow disks and disks with bad
> blocks, choose the best disk, patch 10. If no disk is found:
> 3) Look for disks with bad blocks and choose the one with most number of
> sectors, patch 8. If no disk is found:
> 4) Choose first found slow disk with no bad blocks, or slow disk with
> most number of sectors, patch 7.
>
> Note that step 3) and step 4) are super code path, and performance
> should not be considered.
>
> And after this patchset, we'll continue to optimize read_balance for
> step 2), specifically how to choose the best rdev to read.
>
> [1] https://lore.kernel.org/all/[email protected]/
>
> Yu Kuai (11):
> md: add a new helper rdev_has_badblock()
> md/raid1: factor out helpers to add rdev to conf
> md/raid1: record nonrot rdevs while adding/removing rdevs to conf
> md/raid1: fix choose next idle in read_balance()
> md/raid1-10: add a helper raid1_check_read_range()
> md/raid1-10: factor out a new helper raid1_should_read_first()
> md/raid1: factor out read_first_rdev() from read_balance()
> md/raid1: factor out choose_slow_rdev() from read_balance()
> md/raid1: factor out choose_bb_rdev() from read_balance()
> md/raid1: factor out the code to manage sequential IO
> md/raid1: factor out helpers to choose the best rdev from
> read_balance()
>
> drivers/md/md.h | 11 +
> drivers/md/raid1-10.c | 69 ++++++
> drivers/md/raid1.c | 539 +++++++++++++++++++++++++-----------------
> drivers/md/raid1.h | 1 +
> drivers/md/raid10.c | 58 ++---
> drivers/md/raid5.c | 35 +--
> 6 files changed, 437 insertions(+), 276 deletions(-)
>
> --
> 2.39.2
>
Hi Kuai
This patch set looks good to me. By the way, have you run mdadm
regression tests?
Reviewed-by: Xiao Ni <[email protected]>
On Wed, Feb 28, 2024 at 3:49 AM Yu Kuai <[email protected]> wrote:
>
> From: Yu Kuai <[email protected]>
>
> Changes in v3:
> - add patch 2, and fix that setup_conf() is missing in patch3;
> - add some review tag from Xiao Ni(other than patch 2,3);
It appears that v3 causes mdadm test "01replace" to run forever.
I haven't figured out why, but v2 doesn't seem to have this issue.
Thanks,
Song
> Changes in v2:
> - add new conter in conf for patch 2;
> - fix the case choose next idle while there is no other idle disk in
> patch 3;
> - add some review tag from Xiao Ni for patch 1, 4-8
>
> The original idea is that Paul want to optimize raid1 read
> performance([1]), however, we think that the original code for
> read_balance() is quite complex, and we don't want to add more
> complexity. Hence we decide to refactor read_balance() first, to make
> code cleaner and easier for follow up.
Hi,
在 2024/02/29 5:23, Song Liu 写道:
> On Wed, Feb 28, 2024 at 3:49 AM Yu Kuai <[email protected]> wrote:
>>
>> From: Yu Kuai <[email protected]>
>>
>> Changes in v3:
>> - add patch 2, and fix that setup_conf() is missing in patch3;
>> - add some review tag from Xiao Ni(other than patch 2,3);
>
> It appears that v3 causes mdadm test "01replace" to run forever.
> I haven't figured out why, but v2 doesn't seem to have this issue.
I'm running tests while sending this version, and I found that too this
morning. :(
I'll check out what's wrong and update soon.
Thanks,
Kuai
>
> Thanks,
> Song
>
>> Changes in v2:
>> - add new conter in conf for patch 2;
>> - fix the case choose next idle while there is no other idle disk in
>> patch 3;
>> - add some review tag from Xiao Ni for patch 1, 4-8
>>
>> The original idea is that Paul want to optimize raid1 read
>> performance([1]), however, we think that the original code for
>> read_balance() is quite complex, and we don't want to add more
>> complexity. Hence we decide to refactor read_balance() first, to make
>> code cleaner and easier for follow up.
>
> .
>
Hi,
?? 2024/02/28 19:43, Yu Kuai д??:
> From: Yu Kuai <[email protected]>
>
> There are no functional changes, just make code cleaner and prepare to
> record disk non-rotational information while adding and removing rdev to
> conf
>
> Signed-off-by: Yu Kuai <[email protected]>
> ---
> drivers/md/raid1.c | 74 ++++++++++++++++++++++++++++------------------
> 1 file changed, 46 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index a145fe48b9ce..1940ff398c23 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -1757,6 +1757,40 @@ static int raid1_spare_active(struct mddev *mddev)
> return count;
> }
>
> +static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk)
> +{
> + struct raid1_info *info = conf->mirrors + disk;
> +
> + if (info->rdev)
> + return false;
> +
> + rdev->raid_disk = disk;
> + info->head_position = 0;
> + info->seq_start = MaxSector;
> + WRITE_ONCE(info->rdev, rdev);
> +
> + return true;
> +}
I misread the code, for replacement, rdev->raid_disk is the same as the
original rdev, while this patch set it to "raid_disk + conf->raid_disks"
and following should be merged with this patch:
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 1940ff398c23..ff1e1aeaf336 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1757,10 +1757,14 @@ static int raid1_spare_active(struct mddev *mddev)
return count;
}
-static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev,
int disk)
+static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev,
int disk,
+ bool replacement)
{
struct raid1_info *info = conf->mirrors + disk;
+ if (replacement)
+ info += conf->raid_disks;
+
if (info->rdev)
return false;
@@ -1826,7 +1830,7 @@ static int raid1_add_disk(struct mddev *mddev,
struct md_rdev *rdev)
disk_stack_limits(mddev->gendisk,
rdev->bdev,
rdev->data_offset << 9);
- raid1_add_conf(conf, rdev, mirror);
+ raid1_add_conf(conf, rdev, mirror, false);
err = 0;
/* As all devices are equivalent, we don't need
a full recovery
* if this was recently any drive of the array
@@ -1844,7 +1848,7 @@ static int raid1_add_disk(struct mddev *mddev,
struct md_rdev *rdev)
/* Add this device as a replacement */
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
- raid1_add_conf(conf, rdev, repl_slot);
+ raid1_add_conf(conf, rdev, repl_slot, true);
err = 0;
conf->fullsync = 1;
}
@@ -3017,18 +3021,16 @@ static struct r1conf *setup_conf(struct mddev
*mddev)
err = -EINVAL;
spin_lock_init(&conf->device_lock);
+ conf->raid_disks = mddev->raid_disks;
rdev_for_each(rdev, mddev) {
int disk_idx = rdev->raid_disk;
- if (disk_idx >= mddev->raid_disks
+ if (disk_idx >= conf->raid_disks
|| disk_idx < 0)
continue;
- if (test_bit(Replacement, &rdev->flags))
- disk_idx += mddev->raid_disks;
-
- if (!raid1_add_conf(conf, rdev, disk_idx))
+ if (!raid1_add_conf(conf, rdev, disk_idx,
+ test_bit(Replacement, &rdev->flags)))
goto abort;
}
- conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
INIT_LIST_HEAD(&conf->retry_list);
INIT_LIST_HEAD(&conf->bio_end_io_list);
I'll finish mdadm tests suite before v4.
Thanks,
Kuai
> +
> +static bool raid1_remove_conf(struct r1conf *conf, int disk)
> +{
> + struct raid1_info *info = conf->mirrors + disk;
> + struct md_rdev *rdev = info->rdev;
> +
> + if (!rdev || test_bit(In_sync, &rdev->flags) ||
> + atomic_read(&rdev->nr_pending))
> + return false;
> +
> + /* Only remove non-faulty devices if recovery is not possible. */
> + if (!test_bit(Faulty, &rdev->flags) &&
> + rdev->mddev->recovery_disabled != conf->recovery_disabled &&
> + rdev->mddev->degraded < conf->raid_disks)
> + return false;
> +
> + WRITE_ONCE(info->rdev, NULL);
> + return true;
> +}
> +
> static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
> {
> struct r1conf *conf = mddev->private;
> @@ -1792,15 +1826,13 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
> disk_stack_limits(mddev->gendisk, rdev->bdev,
> rdev->data_offset << 9);
>
> - p->head_position = 0;
> - rdev->raid_disk = mirror;
> + raid1_add_conf(conf, rdev, mirror);
> err = 0;
> /* As all devices are equivalent, we don't need a full recovery
> * if this was recently any drive of the array
> */
> if (rdev->saved_raid_disk < 0)
> conf->fullsync = 1;
> - WRITE_ONCE(p->rdev, rdev);
> break;
> }
> if (test_bit(WantReplacement, &p->rdev->flags) &&
> @@ -1810,13 +1842,11 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
>
> if (err && repl_slot >= 0) {
> /* Add this device as a replacement */
> - p = conf->mirrors + repl_slot;
> clear_bit(In_sync, &rdev->flags);
> set_bit(Replacement, &rdev->flags);
> - rdev->raid_disk = repl_slot;
> + raid1_add_conf(conf, rdev, repl_slot);
> err = 0;
> conf->fullsync = 1;
> - WRITE_ONCE(p[conf->raid_disks].rdev, rdev);
> }
>
> print_conf(conf);
> @@ -1833,27 +1863,20 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
> if (unlikely(number >= conf->raid_disks))
> goto abort;
>
> - if (rdev != p->rdev)
> - p = conf->mirrors + conf->raid_disks + number;
> + if (rdev != p->rdev) {
> + number += conf->raid_disks;
> + p = conf->mirrors + number;
> + }
>
> print_conf(conf);
> if (rdev == p->rdev) {
> - if (test_bit(In_sync, &rdev->flags) ||
> - atomic_read(&rdev->nr_pending)) {
> - err = -EBUSY;
> - goto abort;
> - }
> - /* Only remove non-faulty devices if recovery
> - * is not possible.
> - */
> - if (!test_bit(Faulty, &rdev->flags) &&
> - mddev->recovery_disabled != conf->recovery_disabled &&
> - mddev->degraded < conf->raid_disks) {
> + if (!raid1_remove_conf(conf, number)) {
> err = -EBUSY;
> goto abort;
> }
> - WRITE_ONCE(p->rdev, NULL);
> - if (conf->mirrors[conf->raid_disks + number].rdev) {
> +
> + if (number < conf->raid_disks &&
> + conf->mirrors[conf->raid_disks + number].rdev) {
> /* We just removed a device that is being replaced.
> * Move down the replacement. We drain all IO before
> * doing this to avoid confusion.
> @@ -3000,15 +3023,10 @@ static struct r1conf *setup_conf(struct mddev *mddev)
> || disk_idx < 0)
> continue;
> if (test_bit(Replacement, &rdev->flags))
> - disk = conf->mirrors + mddev->raid_disks + disk_idx;
> - else
> - disk = conf->mirrors + disk_idx;
> + disk_idx += mddev->raid_disks;
>
> - if (disk->rdev)
> + if (!raid1_add_conf(conf, rdev, disk_idx))
> goto abort;
> - disk->rdev = rdev;
> - disk->head_position = 0;
> - disk->seq_start = MaxSector;
> }
> conf->raid_disks = mddev->raid_disks;
> conf->mddev = mddev;
>