From: Yu Kuai <[email protected]>
For raid1, each read will iterate all the rdevs from conf and check if
any rdev is non-rotational, then choose rdev with minimal IO inflight
if so, or rdev with closest distance otherwise.
Disk nonrot info can be changed through sysfs entry:
/sys/block/[disk_name]/queue/rotational
However, consider that this should only be used for testing, and user
really shouldn't do this in real life. Record the number of non-rotational
disks in conf, to avoid checking each rdev in IO fast path and simplify
read_balance() a little bit.
Co-developed-by: Paul Luse <[email protected]>
Signed-off-by: Paul Luse <[email protected]>
Signed-off-by: Yu Kuai <[email protected]>
---
drivers/md/md.h | 1 +
drivers/md/raid1.c | 17 ++++++++++-------
drivers/md/raid1.h | 1 +
3 files changed, 12 insertions(+), 7 deletions(-)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index a49ab04ab707..b2076a165c10 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -207,6 +207,7 @@ enum flag_bits {
* check if there is collision between raid1
* serial bios.
*/
+ Nonrot, /* non-rotational device (SSD) */
};
static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 6ec9998f6257..de6ea87d4d24 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -599,7 +599,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
int sectors;
int best_good_sectors;
int best_disk, best_dist_disk, best_pending_disk;
- int has_nonrot_disk;
int disk;
sector_t best_dist;
unsigned int min_pending;
@@ -620,7 +619,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
best_pending_disk = -1;
min_pending = UINT_MAX;
best_good_sectors = 0;
- has_nonrot_disk = 0;
choose_next_idle = 0;
clear_bit(R1BIO_FailFast, &r1_bio->state);
@@ -637,7 +635,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
sector_t first_bad;
int bad_sectors;
unsigned int pending;
- bool nonrot;
rdev = conf->mirrors[disk].rdev;
if (r1_bio->bios[disk] == IO_BLOCKED
@@ -703,8 +700,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
/* At least two disks to choose from so failfast is OK */
set_bit(R1BIO_FailFast, &r1_bio->state);
- nonrot = bdev_nonrot(rdev->bdev);
- has_nonrot_disk |= nonrot;
pending = atomic_read(&rdev->nr_pending);
dist = abs(this_sector - conf->mirrors[disk].head_position);
if (choose_first) {
@@ -731,7 +726,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
* small, but not a big deal since when the second disk
* starts IO, the first disk is likely still busy.
*/
- if (nonrot && opt_iosize > 0 &&
+ if (test_bit(Nonrot, &rdev->flags) && opt_iosize > 0 &&
mirror->seq_start != MaxSector &&
mirror->next_seq_sect > opt_iosize &&
mirror->next_seq_sect - opt_iosize >=
@@ -763,7 +758,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
* mixed ratation/non-rotational disks depending on workload.
*/
if (best_disk == -1) {
- if (has_nonrot_disk || min_pending == 0)
+ if (READ_ONCE(conf->nonrot_disks) || min_pending == 0)
best_disk = best_pending_disk;
else
best_disk = best_dist_disk;
@@ -1768,6 +1763,11 @@ static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk,
if (info->rdev)
return false;
+ if (bdev_nonrot(rdev->bdev)) {
+ set_bit(Nonrot, &rdev->flags);
+ WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1);
+ }
+
rdev->raid_disk = disk;
info->head_position = 0;
info->seq_start = MaxSector;
@@ -1791,6 +1791,9 @@ static bool raid1_remove_conf(struct r1conf *conf, int disk)
rdev->mddev->degraded < conf->raid_disks)
return false;
+ if (test_and_clear_bit(Nonrot, &rdev->flags))
+ WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks - 1);
+
WRITE_ONCE(info->rdev, NULL);
return true;
}
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 14d4211a123a..5300cbaa58a4 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -71,6 +71,7 @@ struct r1conf {
* allow for replacements.
*/
int raid_disks;
+ int nonrot_disks;
spinlock_t device_lock;
--
2.39.2
Dear Yu,
Thank you for your patch.
Am 29.02.24 um 10:57 schrieb Yu Kuai:
> From: Yu Kuai <[email protected]>
>
> For raid1, each read will iterate all the rdevs from conf and check if
> any rdev is non-rotational, then choose rdev with minimal IO inflight
> if so, or rdev with closest distance otherwise.
>
> Disk nonrot info can be changed through sysfs entry:
>
> /sys/block/[disk_name]/queue/rotational
>
> However, consider that this should only be used for testing, and user
> really shouldn't do this in real life. Record the number of non-rotational
> disks in conf, to avoid checking each rdev in IO fast path and simplify
The comma is not needed.
> read_balance() a little bit.
Just to make sure, I understood correctly. Changing
`/sys/block/[disk_name]/queue/rotational` will now not be considered
anymore, right?
For the summary, maybe you could also say “cache”. Maybe:
Cache attribute rotational while adding/removing rdevs to conf
> Co-developed-by: Paul Luse <[email protected]>
> Signed-off-by: Paul Luse <[email protected]>
> Signed-off-by: Yu Kuai <[email protected]>
> ---
> drivers/md/md.h | 1 +
> drivers/md/raid1.c | 17 ++++++++++-------
> drivers/md/raid1.h | 1 +
> 3 files changed, 12 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index a49ab04ab707..b2076a165c10 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -207,6 +207,7 @@ enum flag_bits {
> * check if there is collision between raid1
> * serial bios.
> */
> + Nonrot, /* non-rotational device (SSD) */
> };
>
> static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index 6ec9998f6257..de6ea87d4d24 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -599,7 +599,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
> int sectors;
> int best_good_sectors;
> int best_disk, best_dist_disk, best_pending_disk;
> - int has_nonrot_disk;
> int disk;
> sector_t best_dist;
> unsigned int min_pending;
> @@ -620,7 +619,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
> best_pending_disk = -1;
> min_pending = UINT_MAX;
> best_good_sectors = 0;
> - has_nonrot_disk = 0;
> choose_next_idle = 0;
> clear_bit(R1BIO_FailFast, &r1_bio->state);
>
> @@ -637,7 +635,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
> sector_t first_bad;
> int bad_sectors;
> unsigned int pending;
> - bool nonrot;
>
> rdev = conf->mirrors[disk].rdev;
> if (r1_bio->bios[disk] == IO_BLOCKED
> @@ -703,8 +700,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
> /* At least two disks to choose from so failfast is OK */
> set_bit(R1BIO_FailFast, &r1_bio->state);
>
> - nonrot = bdev_nonrot(rdev->bdev);
> - has_nonrot_disk |= nonrot;
> pending = atomic_read(&rdev->nr_pending);
> dist = abs(this_sector - conf->mirrors[disk].head_position);
> if (choose_first) {
> @@ -731,7 +726,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
> * small, but not a big deal since when the second disk
> * starts IO, the first disk is likely still busy.
> */
> - if (nonrot && opt_iosize > 0 &&
> + if (test_bit(Nonrot, &rdev->flags) && opt_iosize > 0 &&
> mirror->seq_start != MaxSector &&
> mirror->next_seq_sect > opt_iosize &&
> mirror->next_seq_sect - opt_iosize >=
> @@ -763,7 +758,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
> * mixed ratation/non-rotational disks depending on workload.
> */
> if (best_disk == -1) {
> - if (has_nonrot_disk || min_pending == 0)
> + if (READ_ONCE(conf->nonrot_disks) || min_pending == 0)
> best_disk = best_pending_disk;
> else
> best_disk = best_dist_disk;
> @@ -1768,6 +1763,11 @@ static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk,
> if (info->rdev)
> return false;
>
> + if (bdev_nonrot(rdev->bdev)) {
> + set_bit(Nonrot, &rdev->flags);
> + WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1);
> + }
> +
> rdev->raid_disk = disk;
> info->head_position = 0;
> info->seq_start = MaxSector;
> @@ -1791,6 +1791,9 @@ static bool raid1_remove_conf(struct r1conf *conf, int disk)
> rdev->mddev->degraded < conf->raid_disks)
> return false;
>
> + if (test_and_clear_bit(Nonrot, &rdev->flags))
> + WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks - 1);
> +
> WRITE_ONCE(info->rdev, NULL);
> return true;
> }
> diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
> index 14d4211a123a..5300cbaa58a4 100644
> --- a/drivers/md/raid1.h
> +++ b/drivers/md/raid1.h
> @@ -71,6 +71,7 @@ struct r1conf {
> * allow for replacements.
> */
> int raid_disks;
> + int nonrot_disks;
>
> spinlock_t device_lock;
>
As you meant “fastpath” in the commit message, if I remember correctly,
this does not improve the performance in benchmarks, right?
Kind regards,
Paul
Hi,
在 2024/03/01 0:37, Paul Menzel 写道:
> Dear Yu,
>
>
> Thank you for your patch.
>
>
> Am 29.02.24 um 10:57 schrieb Yu Kuai:
>> From: Yu Kuai <[email protected]>
>>
>> For raid1, each read will iterate all the rdevs from conf and check if
>> any rdev is non-rotational, then choose rdev with minimal IO inflight
>> if so, or rdev with closest distance otherwise.
>>
>> Disk nonrot info can be changed through sysfs entry:
>>
>> /sys/block/[disk_name]/queue/rotational
>>
>> However, consider that this should only be used for testing, and user
>> really shouldn't do this in real life. Record the number of
>> non-rotational
>> disks in conf, to avoid checking each rdev in IO fast path and simplify
>
> The comma is not needed.
>
>> read_balance() a little bit.
>
> Just to make sure, I understood correctly. Changing
> `/sys/block/[disk_name]/queue/rotational` will now not be considered
> anymore, right?
Yes, and I think this will case performance to be worse in real life.
>
> For the summary, maybe you could also say “cache”. Maybe:
>
> Cache attribute rotational while adding/removing rdevs to conf
>
>> Co-developed-by: Paul Luse <[email protected]>
>> Signed-off-by: Paul Luse <[email protected]>
>> Signed-off-by: Yu Kuai <[email protected]>
>> ---
>> drivers/md/md.h | 1 +
>> drivers/md/raid1.c | 17 ++++++++++-------
>> drivers/md/raid1.h | 1 +
>> 3 files changed, 12 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/md/md.h b/drivers/md/md.h
>> index a49ab04ab707..b2076a165c10 100644
>> --- a/drivers/md/md.h
>> +++ b/drivers/md/md.h
>> @@ -207,6 +207,7 @@ enum flag_bits {
>> * check if there is collision between raid1
>> * serial bios.
>> */
>> + Nonrot, /* non-rotational device (SSD) */
>> };
>> static inline int is_badblock(struct md_rdev *rdev, sector_t s, int
>> sectors,
>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>> index 6ec9998f6257..de6ea87d4d24 100644
>> --- a/drivers/md/raid1.c
>> +++ b/drivers/md/raid1.c
>> @@ -599,7 +599,6 @@ static int read_balance(struct r1conf *conf,
>> struct r1bio *r1_bio, int *max_sect
>> int sectors;
>> int best_good_sectors;
>> int best_disk, best_dist_disk, best_pending_disk;
>> - int has_nonrot_disk;
>> int disk;
>> sector_t best_dist;
>> unsigned int min_pending;
>> @@ -620,7 +619,6 @@ static int read_balance(struct r1conf *conf,
>> struct r1bio *r1_bio, int *max_sect
>> best_pending_disk = -1;
>> min_pending = UINT_MAX;
>> best_good_sectors = 0;
>> - has_nonrot_disk = 0;
>> choose_next_idle = 0;
>> clear_bit(R1BIO_FailFast, &r1_bio->state);
>> @@ -637,7 +635,6 @@ static int read_balance(struct r1conf *conf,
>> struct r1bio *r1_bio, int *max_sect
>> sector_t first_bad;
>> int bad_sectors;
>> unsigned int pending;
>> - bool nonrot;
>> rdev = conf->mirrors[disk].rdev;
>> if (r1_bio->bios[disk] == IO_BLOCKED
>> @@ -703,8 +700,6 @@ static int read_balance(struct r1conf *conf,
>> struct r1bio *r1_bio, int *max_sect
>> /* At least two disks to choose from so failfast is OK */
>> set_bit(R1BIO_FailFast, &r1_bio->state);
>> - nonrot = bdev_nonrot(rdev->bdev);
>> - has_nonrot_disk |= nonrot;
>> pending = atomic_read(&rdev->nr_pending);
>> dist = abs(this_sector - conf->mirrors[disk].head_position);
>> if (choose_first) {
>> @@ -731,7 +726,7 @@ static int read_balance(struct r1conf *conf,
>> struct r1bio *r1_bio, int *max_sect
>> * small, but not a big deal since when the second disk
>> * starts IO, the first disk is likely still busy.
>> */
>> - if (nonrot && opt_iosize > 0 &&
>> + if (test_bit(Nonrot, &rdev->flags) && opt_iosize > 0 &&
>> mirror->seq_start != MaxSector &&
>> mirror->next_seq_sect > opt_iosize &&
>> mirror->next_seq_sect - opt_iosize >=
>> @@ -763,7 +758,7 @@ static int read_balance(struct r1conf *conf,
>> struct r1bio *r1_bio, int *max_sect
>> * mixed ratation/non-rotational disks depending on workload.
>> */
>> if (best_disk == -1) {
>> - if (has_nonrot_disk || min_pending == 0)
>> + if (READ_ONCE(conf->nonrot_disks) || min_pending == 0)
>> best_disk = best_pending_disk;
>> else
>> best_disk = best_dist_disk;
>> @@ -1768,6 +1763,11 @@ static bool raid1_add_conf(struct r1conf *conf,
>> struct md_rdev *rdev, int disk,
>> if (info->rdev)
>> return false;
>> + if (bdev_nonrot(rdev->bdev)) {
>> + set_bit(Nonrot, &rdev->flags);
>> + WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1);
>> + }
>> +
>> rdev->raid_disk = disk;
>> info->head_position = 0;
>> info->seq_start = MaxSector;
>> @@ -1791,6 +1791,9 @@ static bool raid1_remove_conf(struct r1conf
>> *conf, int disk)
>> rdev->mddev->degraded < conf->raid_disks)
>> return false;
>> + if (test_and_clear_bit(Nonrot, &rdev->flags))
>> + WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks - 1);
>> +
>> WRITE_ONCE(info->rdev, NULL);
>> return true;
>> }
>> diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
>> index 14d4211a123a..5300cbaa58a4 100644
>> --- a/drivers/md/raid1.h
>> +++ b/drivers/md/raid1.h
>> @@ -71,6 +71,7 @@ struct r1conf {
>> * allow for replacements.
>> */
>> int raid_disks;
>> + int nonrot_disks;
>> spinlock_t device_lock;
>
> As you meant “fastpath” in the commit message, if I remember correctly,
> this does not improve the performance in benchmarks, right?
Yest, this just safe some memory load command, this is to little to
affect performance benchmarks. Main ideal here is make read_balance()
cleaner.
Thanks,
Kuai
>
>
> Kind regards,
>
> Paul
> .
>