From: Yu Kuai <[email protected]>
This patchset do following things:
- make io accounting for all raid levels consistent, patch 1, 3-5;
- enable io accounting for all raid levels, patch 6-8;
- hold 'active_io' before io start to dispatch, and release 'active_io'
when io is done, make mddev_suspend() will wait for io to be done, patch 2
Yu Kuai (8):
md: move initialization and destruction of 'io_acct_set' to md.c
md: also clone new io if io accounting is disabled
raid5: fix missing io accounting in raid5_align_endio()
md/raid1: switch to use md_account_bio() for io accounting
md/raid10: switch to use md_account_bio() for io accounting
md/md-multipath: enable io accounting
md/md-linear: enable io accounting
md/md-faulty: enable io accounting
drivers/md/md-faulty.c | 2 +
drivers/md/md-linear.c | 1 +
drivers/md/md-multipath.c | 1 +
drivers/md/md.c | 78 +++++++++++++++++++--------------------
drivers/md/md.h | 6 +--
drivers/md/raid0.c | 16 +-------
drivers/md/raid1.c | 14 +++----
drivers/md/raid1.h | 1 -
drivers/md/raid10.c | 20 +++++-----
drivers/md/raid10.h | 1 -
drivers/md/raid5.c | 70 ++++++++++-------------------------
11 files changed, 79 insertions(+), 131 deletions(-)
--
2.39.2
From: Yu Kuai <[email protected]>
use md_account_bio() to enable io accounting, also make sure
mddev_suspend() will wait for all io to be done.
Signed-off-by: Yu Kuai <[email protected]>
---
drivers/md/md-multipath.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
index 92c45be203d7..d22276870283 100644
--- a/drivers/md/md-multipath.c
+++ b/drivers/md/md-multipath.c
@@ -107,6 +107,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
&& md_flush_request(mddev, bio))
return true;
+ md_account_bio(mddev, &bio);
mp_bh = mempool_alloc(&conf->pool, GFP_NOIO);
mp_bh->master_bio = bio;
--
2.39.2
From: Yu Kuai <[email protected]>
Currently, 'active_io' is grabed before make_reqeust() is called, and
it's dropped immediately make_reqeust() returns. Hence 'active_io'
actually means io is dispatching, not io is inflight.
For raid0 and raid456 that io accounting is enabled, 'active_io' will
also be grabed when bio is cloned for io accounting, and this 'active_io'
is dropped until io is done.
Always clone new bio so that 'active_io' will mean that io is inflight,
raid1 and raid10 will switch to use this method in later patches. Once
these are implemented, it can be cleaned up that 'active_io' is grabed
twice for one io.
Now that bio will be cloned even if io accounting is disabled, also
rename related structure from '*_acct_*' to '*_clone_*'.
Signed-off-by: Yu Kuai <[email protected]>
---
drivers/md/md.c | 61 ++++++++++++++++++++++++----------------------
drivers/md/md.h | 4 +--
drivers/md/raid5.c | 18 +++++++-------
3 files changed, 43 insertions(+), 40 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 42347289195a..5ad8e7f3aebd 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2314,7 +2314,7 @@ int md_integrity_register(struct mddev *mddev)
pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) ||
(mddev->level != 1 && mddev->level != 10 &&
- bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE))) {
+ bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) {
/*
* No need to handle the failure of bioset_integrity_create,
* because the function is called by md_run() -> pers->run(),
@@ -5886,9 +5886,9 @@ int md_run(struct mddev *mddev)
goto exit_bio_set;
}
- if (!bioset_initialized(&mddev->io_acct_set)) {
- err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
- offsetof(struct md_io_acct, bio_clone), 0);
+ if (!bioset_initialized(&mddev->io_clone_set)) {
+ err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
+ offsetof(struct md_io_clone, bio_clone), 0);
if (err)
goto exit_sync_set;
}
@@ -6070,7 +6070,7 @@ int md_run(struct mddev *mddev)
module_put(pers->owner);
md_bitmap_destroy(mddev);
abort:
- bioset_exit(&mddev->io_acct_set);
+ bioset_exit(&mddev->io_clone_set);
exit_sync_set:
bioset_exit(&mddev->sync_set);
exit_bio_set:
@@ -6295,7 +6295,7 @@ static void __md_stop(struct mddev *mddev)
percpu_ref_exit(&mddev->active_io);
bioset_exit(&mddev->bio_set);
bioset_exit(&mddev->sync_set);
- bioset_exit(&mddev->io_acct_set);
+ bioset_exit(&mddev->io_clone_set);
}
void md_stop(struct mddev *mddev)
@@ -8661,45 +8661,48 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
}
EXPORT_SYMBOL_GPL(md_submit_discard_bio);
-static void md_end_io_acct(struct bio *bio)
+static void md_end_clone_io(struct bio *bio)
{
- struct md_io_acct *md_io_acct = bio->bi_private;
- struct bio *orig_bio = md_io_acct->orig_bio;
- struct mddev *mddev = md_io_acct->mddev;
+ struct md_io_clone *md_io_clone = bio->bi_private;
+ struct bio *orig_bio = md_io_clone->orig_bio;
+ struct mddev *mddev = md_io_clone->mddev;
orig_bio->bi_status = bio->bi_status;
- bio_end_io_acct(orig_bio, md_io_acct->start_time);
+ if (md_io_clone->start_time)
+ bio_end_io_acct(orig_bio, md_io_clone->start_time);
+
bio_put(bio);
bio_endio(orig_bio);
-
percpu_ref_put(&mddev->active_io);
}
+static void md_clone_bio(struct mddev *mddev, struct bio **bio)
+{
+ struct block_device *bdev = (*bio)->bi_bdev;
+ struct md_io_clone *md_io_clone;
+ struct bio *clone =
+ bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set);
+
+ md_io_clone = container_of(clone, struct md_io_clone, bio_clone);
+ md_io_clone->orig_bio = *bio;
+ md_io_clone->mddev = mddev;
+ if (blk_queue_io_stat(bdev->bd_disk->queue))
+ md_io_clone->start_time = bio_start_io_acct(*bio);
+
+ clone->bi_end_io = md_end_clone_io;
+ clone->bi_private = md_io_clone;
+ *bio = clone;
+}
+
/*
* Used by personalities that don't already clone the bio and thus can't
* easily add the timestamp to their extended bio structure.
*/
void md_account_bio(struct mddev *mddev, struct bio **bio)
{
- struct block_device *bdev = (*bio)->bi_bdev;
- struct md_io_acct *md_io_acct;
- struct bio *clone;
-
- if (!blk_queue_io_stat(bdev->bd_disk->queue))
- return;
-
percpu_ref_get(&mddev->active_io);
-
- clone = bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_acct_set);
- md_io_acct = container_of(clone, struct md_io_acct, bio_clone);
- md_io_acct->orig_bio = *bio;
- md_io_acct->start_time = bio_start_io_acct(*bio);
- md_io_acct->mddev = mddev;
-
- clone->bi_end_io = md_end_io_acct;
- clone->bi_private = md_io_acct;
- *bio = clone;
+ md_clone_bio(mddev, bio);
}
EXPORT_SYMBOL_GPL(md_account_bio);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 11299d94b239..892a598a5029 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -510,7 +510,7 @@ struct mddev {
struct bio_set sync_set; /* for sync operations like
* metadata and bitmap writes
*/
- struct bio_set io_acct_set; /* for raid0 and raid5 io accounting */
+ struct bio_set io_clone_set;
/* Generic flush handling.
* The last to finish preflush schedules a worker to submit
@@ -738,7 +738,7 @@ struct md_thread {
void *private;
};
-struct md_io_acct {
+struct md_io_clone {
struct mddev *mddev;
struct bio *orig_bio;
unsigned long start_time;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 29cf5455d7a5..cef0b400b2ee 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5468,13 +5468,13 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf,
*/
static void raid5_align_endio(struct bio *bi)
{
- struct md_io_acct *md_io_acct = bi->bi_private;
- struct bio *raid_bi = md_io_acct->orig_bio;
+ struct md_io_clone *md_io_clone = bi->bi_private;
+ struct bio *raid_bi = md_io_clone->orig_bio;
struct mddev *mddev;
struct r5conf *conf;
struct md_rdev *rdev;
blk_status_t error = bi->bi_status;
- unsigned long start_time = md_io_acct->start_time;
+ unsigned long start_time = md_io_clone->start_time;
bio_put(bi);
@@ -5506,7 +5506,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
struct md_rdev *rdev;
sector_t sector, end_sector, first_bad;
int bad_sectors, dd_idx;
- struct md_io_acct *md_io_acct;
+ struct md_io_clone *md_io_clone;
bool did_inc;
if (!in_chunk_boundary(mddev, raid_bio)) {
@@ -5544,15 +5544,15 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
}
align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
- &mddev->io_acct_set);
- md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone);
+ &mddev->io_clone_set);
+ md_io_clone = container_of(align_bio, struct md_io_clone, bio_clone);
raid_bio->bi_next = (void *)rdev;
if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
- md_io_acct->start_time = bio_start_io_acct(raid_bio);
- md_io_acct->orig_bio = raid_bio;
+ md_io_clone->start_time = bio_start_io_acct(raid_bio);
+ md_io_clone->orig_bio = raid_bio;
align_bio->bi_end_io = raid5_align_endio;
- align_bio->bi_private = md_io_acct;
+ align_bio->bi_private = md_io_clone;
align_bio->bi_iter.bi_sector = sector;
/* No reshape active, so we can trust rdev->data_offset */
--
2.39.2
From: Yu Kuai <[email protected]>
use md_account_bio() to enable io accounting, also make sure
mddev_suspend() will wait for all io to be done.
Signed-off-by: Yu Kuai <[email protected]>
---
drivers/md/md-linear.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 4eb72b9dd933..71ac99646827 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -238,6 +238,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
bio = split;
}
+ md_account_bio(mddev, &bio);
bio_set_dev(bio, tmp_dev->rdev->bdev);
bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
start_sector + data_offset;
--
2.39.2
From: Yu Kuai <[email protected]>
use md_account_bio() to enable io accounting, also make sure
mddev_suspend() will wait for all io to be done.
Signed-off-by: Yu Kuai <[email protected]>
---
drivers/md/md-faulty.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c
index 50ad818978a4..a039e8e20f55 100644
--- a/drivers/md/md-faulty.c
+++ b/drivers/md/md-faulty.c
@@ -204,6 +204,8 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio)
failit = 1;
}
}
+
+ md_account_bio(mddev, &bio);
if (failit) {
struct bio *b = bio_alloc_clone(conf->rdev->bdev, bio, GFP_NOIO,
&mddev->bio_set);
--
2.39.2
From: Yu Kuai <[email protected]>
'io_acct_set' is only used for raid0 and raid456, prepare to use it for
raid1 and raid10, so that io accounting from different levels can be
consistent.
By the way, follow up patches will also use this io clone mechanism to
make sure 'active_io' represents in flight io, not io that is dispatching,
so that mddev_suspend will wait for io to be done as desgined.
Signed-off-by: Yu Kuai <[email protected]>
---
drivers/md/md.c | 27 ++++++++++-----------------
drivers/md/md.h | 2 --
drivers/md/raid0.c | 16 ++--------------
drivers/md/raid5.c | 41 +++++++++++------------------------------
4 files changed, 23 insertions(+), 63 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8d62f85d2ab0..42347289195a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5886,6 +5886,13 @@ int md_run(struct mddev *mddev)
goto exit_bio_set;
}
+ if (!bioset_initialized(&mddev->io_acct_set)) {
+ err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
+ offsetof(struct md_io_acct, bio_clone), 0);
+ if (err)
+ goto exit_sync_set;
+ }
+
spin_lock(&pers_lock);
pers = find_pers(mddev->level, mddev->clevel);
if (!pers || !try_module_get(pers->owner)) {
@@ -6063,6 +6070,8 @@ int md_run(struct mddev *mddev)
module_put(pers->owner);
md_bitmap_destroy(mddev);
abort:
+ bioset_exit(&mddev->io_acct_set);
+exit_sync_set:
bioset_exit(&mddev->sync_set);
exit_bio_set:
bioset_exit(&mddev->bio_set);
@@ -6286,6 +6295,7 @@ static void __md_stop(struct mddev *mddev)
percpu_ref_exit(&mddev->active_io);
bioset_exit(&mddev->bio_set);
bioset_exit(&mddev->sync_set);
+ bioset_exit(&mddev->io_acct_set);
}
void md_stop(struct mddev *mddev)
@@ -8651,23 +8661,6 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
}
EXPORT_SYMBOL_GPL(md_submit_discard_bio);
-int acct_bioset_init(struct mddev *mddev)
-{
- int err = 0;
-
- if (!bioset_initialized(&mddev->io_acct_set))
- err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
- offsetof(struct md_io_acct, bio_clone), 0);
- return err;
-}
-EXPORT_SYMBOL_GPL(acct_bioset_init);
-
-void acct_bioset_exit(struct mddev *mddev)
-{
- bioset_exit(&mddev->io_acct_set);
-}
-EXPORT_SYMBOL_GPL(acct_bioset_exit);
-
static void md_end_io_acct(struct bio *bio)
{
struct md_io_acct *md_io_acct = bio->bi_private;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 7cab9c7c45b8..11299d94b239 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -776,8 +776,6 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
extern void md_finish_reshape(struct mddev *mddev);
void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
struct bio *bio, sector_t start, sector_t size);
-int acct_bioset_init(struct mddev *mddev);
-void acct_bioset_exit(struct mddev *mddev);
void md_account_bio(struct mddev *mddev, struct bio **bio);
extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index f8ee9a95e25d..38d9209cada1 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -365,7 +365,6 @@ static void raid0_free(struct mddev *mddev, void *priv)
struct r0conf *conf = priv;
free_conf(mddev, conf);
- acct_bioset_exit(mddev);
}
static int raid0_run(struct mddev *mddev)
@@ -380,16 +379,11 @@ static int raid0_run(struct mddev *mddev)
if (md_check_no_bitmap(mddev))
return -EINVAL;
- if (acct_bioset_init(mddev)) {
- pr_err("md/raid0:%s: alloc acct bioset failed.\n", mdname(mddev));
- return -ENOMEM;
- }
-
/* if private is not null, we are here after takeover */
if (mddev->private == NULL) {
ret = create_strip_zones(mddev, &conf);
if (ret < 0)
- goto exit_acct_set;
+ return ret;
mddev->private = conf;
}
conf = mddev->private;
@@ -420,15 +414,9 @@ static int raid0_run(struct mddev *mddev)
ret = md_integrity_register(mddev);
if (ret)
- goto free;
+ free_conf(mddev, conf);
return ret;
-
-free:
- free_conf(mddev, conf);
-exit_acct_set:
- acct_bioset_exit(mddev);
- return ret;
}
static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f8bc74e16811..29cf5455d7a5 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7787,19 +7787,12 @@ static int raid5_run(struct mddev *mddev)
struct md_rdev *rdev;
struct md_rdev *journal_dev = NULL;
sector_t reshape_offset = 0;
- int i, ret = 0;
+ int i;
long long min_offset_diff = 0;
int first = 1;
- if (acct_bioset_init(mddev)) {
- pr_err("md/raid456:%s: alloc acct bioset failed.\n", mdname(mddev));
+ if (mddev_init_writes_pending(mddev) < 0)
return -ENOMEM;
- }
-
- if (mddev_init_writes_pending(mddev) < 0) {
- ret = -ENOMEM;
- goto exit_acct_set;
- }
if (mddev->recovery_cp != MaxSector)
pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
@@ -7830,8 +7823,7 @@ static int raid5_run(struct mddev *mddev)
(mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
mdname(mddev));
- ret = -EINVAL;
- goto exit_acct_set;
+ return -EINVAL;
}
if (mddev->reshape_position != MaxSector) {
@@ -7856,15 +7848,13 @@ static int raid5_run(struct mddev *mddev)
if (journal_dev) {
pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
mdname(mddev));
- ret = -EINVAL;
- goto exit_acct_set;
+ return -EINVAL;
}
if (mddev->new_level != mddev->level) {
pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
mdname(mddev));
- ret = -EINVAL;
- goto exit_acct_set;
+ return -EINVAL;
}
old_disks = mddev->raid_disks - mddev->delta_disks;
/* reshape_position must be on a new-stripe boundary, and one
@@ -7880,8 +7870,7 @@ static int raid5_run(struct mddev *mddev)
if (sector_div(here_new, chunk_sectors * new_data_disks)) {
pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
mdname(mddev));
- ret = -EINVAL;
- goto exit_acct_set;
+ return -EINVAL;
}
reshape_offset = here_new * chunk_sectors;
/* here_new is the stripe we will write to */
@@ -7903,8 +7892,7 @@ static int raid5_run(struct mddev *mddev)
else if (mddev->ro == 0) {
pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
mdname(mddev));
- ret = -EINVAL;
- goto exit_acct_set;
+ return -EINVAL;
}
} else if (mddev->reshape_backwards
? (here_new * chunk_sectors + min_offset_diff <=
@@ -7914,8 +7902,7 @@ static int raid5_run(struct mddev *mddev)
/* Reading from the same stripe as writing to - bad */
pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
mdname(mddev));
- ret = -EINVAL;
- goto exit_acct_set;
+ return -EINVAL;
}
pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
/* OK, we should be able to continue; */
@@ -7939,10 +7926,8 @@ static int raid5_run(struct mddev *mddev)
else
conf = mddev->private;
- if (IS_ERR(conf)) {
- ret = PTR_ERR(conf);
- goto exit_acct_set;
- }
+ if (IS_ERR(conf))
+ return PTR_ERR(conf);
if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
if (!journal_dev) {
@@ -8140,10 +8125,7 @@ static int raid5_run(struct mddev *mddev)
free_conf(conf);
mddev->private = NULL;
pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
- ret = -EIO;
-exit_acct_set:
- acct_bioset_exit(mddev);
- return ret;
+ return -EIO;
}
static void raid5_free(struct mddev *mddev, void *priv)
@@ -8151,7 +8133,6 @@ static void raid5_free(struct mddev *mddev, void *priv)
struct r5conf *conf = priv;
free_conf(conf);
- acct_bioset_exit(mddev);
mddev->to_remove = &raid5_attrs_group;
}
--
2.39.2
From: Yu Kuai <[email protected]>
Make sure that 'active_io' will represent inflight io instead of io that
is dispatching, and io accounting from all levels will be consistent.
Signed-off-by: Yu Kuai <[email protected]>
---
drivers/md/raid10.c | 20 +++++++++-----------
drivers/md/raid10.h | 1 -
2 files changed, 9 insertions(+), 12 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 79067769e44b..69f6d7b1e600 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -325,8 +325,6 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
bio->bi_status = BLK_STS_IOERR;
- if (r10_bio->start_time)
- bio_end_io_acct(bio, r10_bio->start_time);
bio_endio(bio);
/*
* Wake up any possible resync thread that waits for the device
@@ -1172,7 +1170,7 @@ static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf,
}
static void raid10_read_request(struct mddev *mddev, struct bio *bio,
- struct r10bio *r10_bio)
+ struct r10bio *r10_bio, bool io_accounting)
{
struct r10conf *conf = mddev->private;
struct bio *read_bio;
@@ -1243,9 +1241,10 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
}
slot = r10_bio->read_slot;
- if (!r10_bio->start_time &&
- blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
- r10_bio->start_time = bio_start_io_acct(bio);
+ if (io_accounting) {
+ md_account_bio(mddev, &bio);
+ r10_bio->master_bio = bio;
+ }
read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set);
r10_bio->devs[slot].bio = read_bio;
@@ -1543,8 +1542,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
r10_bio->master_bio = bio;
}
- if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
- r10_bio->start_time = bio_start_io_acct(bio);
+ md_account_bio(mddev, &bio);
+ r10_bio->master_bio = bio;
atomic_set(&r10_bio->remaining, 1);
md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
@@ -1571,12 +1570,11 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
r10_bio->sector = bio->bi_iter.bi_sector;
r10_bio->state = 0;
r10_bio->read_slot = -1;
- r10_bio->start_time = 0;
memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) *
conf->geo.raid_disks);
if (bio_data_dir(bio) == READ)
- raid10_read_request(mddev, bio, r10_bio);
+ raid10_read_request(mddev, bio, r10_bio, true);
else
raid10_write_request(mddev, bio, r10_bio);
}
@@ -2985,7 +2983,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
rdev_dec_pending(rdev, mddev);
r10_bio->state = 0;
- raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
+ raid10_read_request(mddev, r10_bio->master_bio, r10_bio, false);
/*
* allow_barrier after re-submit to ensure no sync io
* can be issued while regular io pending.
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 63e48b11b552..2e75e88d0802 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -123,7 +123,6 @@ struct r10bio {
sector_t sector; /* virtual sector number */
int sectors;
unsigned long state;
- unsigned long start_time;
struct mddev *mddev;
/*
* original bio going to /dev/mdx
--
2.39.2
From: Yu Kuai <[email protected]>
Two problems can be fixed this way:
1) 'active_io' will represent inflight io instead of io that is
dispatching.
2) If io accounting is enabled or disabled while io is still inflight,
bio_start_io_acct() and bio_end_io_acct() is not balanced and io
inflight counter will be leaked.
Signed-off-by: Yu Kuai <[email protected]>
---
drivers/md/raid1.c | 14 ++++++--------
drivers/md/raid1.h | 1 -
2 files changed, 6 insertions(+), 9 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index dd25832eb045..06fa1580501f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -304,8 +304,6 @@ static void call_bio_endio(struct r1bio *r1_bio)
if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
bio->bi_status = BLK_STS_IOERR;
- if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
- bio_end_io_acct(bio, r1_bio->start_time);
bio_endio(bio);
}
@@ -1303,10 +1301,10 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
}
r1_bio->read_disk = rdisk;
-
- if (!r1bio_existed && blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
- r1_bio->start_time = bio_start_io_acct(bio);
-
+ if (!r1bio_existed) {
+ md_account_bio(mddev, &bio);
+ r1_bio->master_bio = bio;
+ }
read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp,
&mddev->bio_set);
@@ -1500,8 +1498,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
r1_bio->sectors = max_sectors;
}
- if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
- r1_bio->start_time = bio_start_io_acct(bio);
+ md_account_bio(mddev, &bio);
+ r1_bio->master_bio = bio;
atomic_set(&r1_bio->remaining, 1);
atomic_set(&r1_bio->behind_remaining, 0);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 468f189da7a0..14d4211a123a 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -157,7 +157,6 @@ struct r1bio {
sector_t sector;
int sectors;
unsigned long state;
- unsigned long start_time;
struct mddev *mddev;
/*
* original bio going to /dev/mdx
--
2.39.2
From: Yu Kuai <[email protected]>
Io will only be accounted as done from raid5_align_endio() if the io
succeed, and io inflight counter will be leaked if such io failed.
Fix this problem by switching to use md_account_bio() for io accounting.
Signed-off-by: Yu Kuai <[email protected]>
---
drivers/md/raid5.c | 29 ++++++++---------------------
1 file changed, 8 insertions(+), 21 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index cef0b400b2ee..4cdb35e54251 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5468,26 +5468,17 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf,
*/
static void raid5_align_endio(struct bio *bi)
{
- struct md_io_clone *md_io_clone = bi->bi_private;
- struct bio *raid_bi = md_io_clone->orig_bio;
- struct mddev *mddev;
- struct r5conf *conf;
- struct md_rdev *rdev;
+ struct bio *raid_bi = bi->bi_private;
+ struct md_rdev *rdev = (void *)raid_bi->bi_next;
+ struct mddev *mddev = rdev->mddev;
+ struct r5conf *conf = mddev->private;
blk_status_t error = bi->bi_status;
- unsigned long start_time = md_io_clone->start_time;
bio_put(bi);
-
- rdev = (void*)raid_bi->bi_next;
raid_bi->bi_next = NULL;
- mddev = rdev->mddev;
- conf = mddev->private;
-
rdev_dec_pending(rdev, conf->mddev);
if (!error) {
- if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue))
- bio_end_io_acct(raid_bi, start_time);
bio_endio(raid_bi);
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_quiescent);
@@ -5506,7 +5497,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
struct md_rdev *rdev;
sector_t sector, end_sector, first_bad;
int bad_sectors, dd_idx;
- struct md_io_clone *md_io_clone;
bool did_inc;
if (!in_chunk_boundary(mddev, raid_bio)) {
@@ -5543,16 +5533,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
return 0;
}
- align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
- &mddev->io_clone_set);
- md_io_clone = container_of(align_bio, struct md_io_clone, bio_clone);
+ md_account_bio(mddev, &raid_bio);
raid_bio->bi_next = (void *)rdev;
- if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
- md_io_clone->start_time = bio_start_io_acct(raid_bio);
- md_io_clone->orig_bio = raid_bio;
+ align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
+ &mddev->bio_set);
align_bio->bi_end_io = raid5_align_endio;
- align_bio->bi_private = md_io_clone;
+ align_bio->bi_private = raid_bio;
align_bio->bi_iter.bi_sector = sector;
/* No reshape active, so we can trust rdev->data_offset */
--
2.39.2
On Mon, Jun 19, 2023 at 8:50 PM Yu Kuai <[email protected]> wrote:
>
> From: Yu Kuai <[email protected]>
>
> 'io_acct_set' is only used for raid0 and raid456, prepare to use it for
> raid1 and raid10, so that io accounting from different levels can be
> consistent.
>
> By the way, follow up patches will also use this io clone mechanism to
> make sure 'active_io' represents in flight io, not io that is dispatching,
> so that mddev_suspend will wait for io to be done as desgined.
Hi Kuai
typo error: s/desgined/designed/g
Before this patch the personality uses ->quiesce method to wait until
all inflight ios come back. But I like this solution. It makes the
codes simpler. Not sure if it can cause problems because it changes
the meaning of ->active_io. I'm doing regression tests to check.
>
> Signed-off-by: Yu Kuai <[email protected]>
> ---
> drivers/md/md.c | 27 ++++++++++-----------------
> drivers/md/md.h | 2 --
> drivers/md/raid0.c | 16 ++--------------
> drivers/md/raid5.c | 41 +++++++++++------------------------------
> 4 files changed, 23 insertions(+), 63 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 8d62f85d2ab0..42347289195a 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -5886,6 +5886,13 @@ int md_run(struct mddev *mddev)
> goto exit_bio_set;
> }
>
> + if (!bioset_initialized(&mddev->io_acct_set)) {
> + err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
> + offsetof(struct md_io_acct, bio_clone), 0);
> + if (err)
> + goto exit_sync_set;
> + }
> +
> spin_lock(&pers_lock);
> pers = find_pers(mddev->level, mddev->clevel);
> if (!pers || !try_module_get(pers->owner)) {
> @@ -6063,6 +6070,8 @@ int md_run(struct mddev *mddev)
> module_put(pers->owner);
> md_bitmap_destroy(mddev);
> abort:
> + bioset_exit(&mddev->io_acct_set);
> +exit_sync_set:
> bioset_exit(&mddev->sync_set);
> exit_bio_set:
> bioset_exit(&mddev->bio_set);
> @@ -6286,6 +6295,7 @@ static void __md_stop(struct mddev *mddev)
> percpu_ref_exit(&mddev->active_io);
> bioset_exit(&mddev->bio_set);
> bioset_exit(&mddev->sync_set);
> + bioset_exit(&mddev->io_acct_set);
> }
>
> void md_stop(struct mddev *mddev)
> @@ -8651,23 +8661,6 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
> }
> EXPORT_SYMBOL_GPL(md_submit_discard_bio);
>
> -int acct_bioset_init(struct mddev *mddev)
> -{
> - int err = 0;
> -
> - if (!bioset_initialized(&mddev->io_acct_set))
> - err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
> - offsetof(struct md_io_acct, bio_clone), 0);
> - return err;
> -}
> -EXPORT_SYMBOL_GPL(acct_bioset_init);
> -
> -void acct_bioset_exit(struct mddev *mddev)
> -{
> - bioset_exit(&mddev->io_acct_set);
> -}
> -EXPORT_SYMBOL_GPL(acct_bioset_exit);
> -
> static void md_end_io_acct(struct bio *bio)
> {
> struct md_io_acct *md_io_acct = bio->bi_private;
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 7cab9c7c45b8..11299d94b239 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -776,8 +776,6 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
> extern void md_finish_reshape(struct mddev *mddev);
> void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
> struct bio *bio, sector_t start, sector_t size);
> -int acct_bioset_init(struct mddev *mddev);
> -void acct_bioset_exit(struct mddev *mddev);
> void md_account_bio(struct mddev *mddev, struct bio **bio);
>
> extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
> diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
> index f8ee9a95e25d..38d9209cada1 100644
> --- a/drivers/md/raid0.c
> +++ b/drivers/md/raid0.c
> @@ -365,7 +365,6 @@ static void raid0_free(struct mddev *mddev, void *priv)
> struct r0conf *conf = priv;
>
> free_conf(mddev, conf);
> - acct_bioset_exit(mddev);
> }
>
> static int raid0_run(struct mddev *mddev)
> @@ -380,16 +379,11 @@ static int raid0_run(struct mddev *mddev)
> if (md_check_no_bitmap(mddev))
> return -EINVAL;
>
> - if (acct_bioset_init(mddev)) {
> - pr_err("md/raid0:%s: alloc acct bioset failed.\n", mdname(mddev));
> - return -ENOMEM;
> - }
> -
> /* if private is not null, we are here after takeover */
> if (mddev->private == NULL) {
> ret = create_strip_zones(mddev, &conf);
> if (ret < 0)
> - goto exit_acct_set;
> + return ret;
> mddev->private = conf;
> }
> conf = mddev->private;
> @@ -420,15 +414,9 @@ static int raid0_run(struct mddev *mddev)
>
> ret = md_integrity_register(mddev);
> if (ret)
> - goto free;
> + free_conf(mddev, conf);
>
> return ret;
> -
> -free:
> - free_conf(mddev, conf);
> -exit_acct_set:
> - acct_bioset_exit(mddev);
> - return ret;
> }
>
> static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index f8bc74e16811..29cf5455d7a5 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -7787,19 +7787,12 @@ static int raid5_run(struct mddev *mddev)
> struct md_rdev *rdev;
> struct md_rdev *journal_dev = NULL;
> sector_t reshape_offset = 0;
> - int i, ret = 0;
> + int i;
> long long min_offset_diff = 0;
> int first = 1;
>
> - if (acct_bioset_init(mddev)) {
> - pr_err("md/raid456:%s: alloc acct bioset failed.\n", mdname(mddev));
> + if (mddev_init_writes_pending(mddev) < 0)
> return -ENOMEM;
> - }
> -
> - if (mddev_init_writes_pending(mddev) < 0) {
> - ret = -ENOMEM;
> - goto exit_acct_set;
> - }
>
> if (mddev->recovery_cp != MaxSector)
> pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
> @@ -7830,8 +7823,7 @@ static int raid5_run(struct mddev *mddev)
> (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
> pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
> mdname(mddev));
> - ret = -EINVAL;
> - goto exit_acct_set;
> + return -EINVAL;
> }
>
> if (mddev->reshape_position != MaxSector) {
> @@ -7856,15 +7848,13 @@ static int raid5_run(struct mddev *mddev)
> if (journal_dev) {
> pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
> mdname(mddev));
> - ret = -EINVAL;
> - goto exit_acct_set;
> + return -EINVAL;
> }
>
> if (mddev->new_level != mddev->level) {
> pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
> mdname(mddev));
> - ret = -EINVAL;
> - goto exit_acct_set;
> + return -EINVAL;
> }
> old_disks = mddev->raid_disks - mddev->delta_disks;
> /* reshape_position must be on a new-stripe boundary, and one
> @@ -7880,8 +7870,7 @@ static int raid5_run(struct mddev *mddev)
> if (sector_div(here_new, chunk_sectors * new_data_disks)) {
> pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
> mdname(mddev));
> - ret = -EINVAL;
> - goto exit_acct_set;
> + return -EINVAL;
> }
> reshape_offset = here_new * chunk_sectors;
> /* here_new is the stripe we will write to */
> @@ -7903,8 +7892,7 @@ static int raid5_run(struct mddev *mddev)
> else if (mddev->ro == 0) {
> pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
> mdname(mddev));
> - ret = -EINVAL;
> - goto exit_acct_set;
> + return -EINVAL;
> }
> } else if (mddev->reshape_backwards
> ? (here_new * chunk_sectors + min_offset_diff <=
> @@ -7914,8 +7902,7 @@ static int raid5_run(struct mddev *mddev)
> /* Reading from the same stripe as writing to - bad */
> pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
> mdname(mddev));
> - ret = -EINVAL;
> - goto exit_acct_set;
> + return -EINVAL;
> }
> pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
> /* OK, we should be able to continue; */
> @@ -7939,10 +7926,8 @@ static int raid5_run(struct mddev *mddev)
> else
> conf = mddev->private;
>
> - if (IS_ERR(conf)) {
> - ret = PTR_ERR(conf);
> - goto exit_acct_set;
> - }
> + if (IS_ERR(conf))
> + return PTR_ERR(conf);
>
> if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
> if (!journal_dev) {
> @@ -8140,10 +8125,7 @@ static int raid5_run(struct mddev *mddev)
> free_conf(conf);
> mddev->private = NULL;
> pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
> - ret = -EIO;
> -exit_acct_set:
> - acct_bioset_exit(mddev);
> - return ret;
> + return -EIO;
> }
>
> static void raid5_free(struct mddev *mddev, void *priv)
> @@ -8151,7 +8133,6 @@ static void raid5_free(struct mddev *mddev, void *priv)
> struct r5conf *conf = priv;
>
> free_conf(conf);
> - acct_bioset_exit(mddev);
> mddev->to_remove = &raid5_attrs_group;
> }
>
> --
> 2.39.2
>
The patch is good for me.
Reviewed-by: Xiao Ni <[email protected]>
On Mon, Jun 19, 2023 at 8:49 PM Yu Kuai <[email protected]> wrote:
>
> From: Yu Kuai <[email protected]>
>
> Currently, 'active_io' is grabed before make_reqeust() is called, and
> it's dropped immediately make_reqeust() returns. Hence 'active_io'
> actually means io is dispatching, not io is inflight.
Hi Kuai
There are three typo errors in the commit message: s/grabed/grabbed/g
>
> For raid0 and raid456 that io accounting is enabled, 'active_io' will
> also be grabed when bio is cloned for io accounting, and this 'active_io'
> is dropped until io is done.
>
> Always clone new bio so that 'active_io' will mean that io is inflight,
> raid1 and raid10 will switch to use this method in later patches. Once
> these are implemented, it can be cleaned up that 'active_io' is grabed
> twice for one io.
>
> Now that bio will be cloned even if io accounting is disabled, also
> rename related structure from '*_acct_*' to '*_clone_*'.
>
> Signed-off-by: Yu Kuai <[email protected]>
> ---
> drivers/md/md.c | 61 ++++++++++++++++++++++++----------------------
> drivers/md/md.h | 4 +--
> drivers/md/raid5.c | 18 +++++++-------
> 3 files changed, 43 insertions(+), 40 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 42347289195a..5ad8e7f3aebd 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -2314,7 +2314,7 @@ int md_integrity_register(struct mddev *mddev)
> pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
> if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) ||
> (mddev->level != 1 && mddev->level != 10 &&
> - bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE))) {
> + bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) {
> /*
> * No need to handle the failure of bioset_integrity_create,
> * because the function is called by md_run() -> pers->run(),
> @@ -5886,9 +5886,9 @@ int md_run(struct mddev *mddev)
> goto exit_bio_set;
> }
>
> - if (!bioset_initialized(&mddev->io_acct_set)) {
> - err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
> - offsetof(struct md_io_acct, bio_clone), 0);
> + if (!bioset_initialized(&mddev->io_clone_set)) {
> + err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
> + offsetof(struct md_io_clone, bio_clone), 0);
> if (err)
> goto exit_sync_set;
> }
> @@ -6070,7 +6070,7 @@ int md_run(struct mddev *mddev)
> module_put(pers->owner);
> md_bitmap_destroy(mddev);
> abort:
> - bioset_exit(&mddev->io_acct_set);
> + bioset_exit(&mddev->io_clone_set);
> exit_sync_set:
> bioset_exit(&mddev->sync_set);
> exit_bio_set:
> @@ -6295,7 +6295,7 @@ static void __md_stop(struct mddev *mddev)
> percpu_ref_exit(&mddev->active_io);
> bioset_exit(&mddev->bio_set);
> bioset_exit(&mddev->sync_set);
> - bioset_exit(&mddev->io_acct_set);
> + bioset_exit(&mddev->io_clone_set);
> }
>
> void md_stop(struct mddev *mddev)
> @@ -8661,45 +8661,48 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
> }
> EXPORT_SYMBOL_GPL(md_submit_discard_bio);
>
> -static void md_end_io_acct(struct bio *bio)
> +static void md_end_clone_io(struct bio *bio)
> {
> - struct md_io_acct *md_io_acct = bio->bi_private;
> - struct bio *orig_bio = md_io_acct->orig_bio;
> - struct mddev *mddev = md_io_acct->mddev;
> + struct md_io_clone *md_io_clone = bio->bi_private;
> + struct bio *orig_bio = md_io_clone->orig_bio;
> + struct mddev *mddev = md_io_clone->mddev;
>
> orig_bio->bi_status = bio->bi_status;
>
> - bio_end_io_acct(orig_bio, md_io_acct->start_time);
> + if (md_io_clone->start_time)
> + bio_end_io_acct(orig_bio, md_io_clone->start_time);
> +
> bio_put(bio);
> bio_endio(orig_bio);
> -
> percpu_ref_put(&mddev->active_io);
> }
>
> +static void md_clone_bio(struct mddev *mddev, struct bio **bio)
> +{
> + struct block_device *bdev = (*bio)->bi_bdev;
> + struct md_io_clone *md_io_clone;
> + struct bio *clone =
> + bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set);
> +
> + md_io_clone = container_of(clone, struct md_io_clone, bio_clone);
> + md_io_clone->orig_bio = *bio;
> + md_io_clone->mddev = mddev;
> + if (blk_queue_io_stat(bdev->bd_disk->queue))
> + md_io_clone->start_time = bio_start_io_acct(*bio);
> +
> + clone->bi_end_io = md_end_clone_io;
> + clone->bi_private = md_io_clone;
> + *bio = clone;
> +}
> +
> /*
> * Used by personalities that don't already clone the bio and thus can't
> * easily add the timestamp to their extended bio structure.
> */
> void md_account_bio(struct mddev *mddev, struct bio **bio)
> {
> - struct block_device *bdev = (*bio)->bi_bdev;
> - struct md_io_acct *md_io_acct;
> - struct bio *clone;
> -
> - if (!blk_queue_io_stat(bdev->bd_disk->queue))
> - return;
> -
> percpu_ref_get(&mddev->active_io);
> -
> - clone = bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_acct_set);
> - md_io_acct = container_of(clone, struct md_io_acct, bio_clone);
> - md_io_acct->orig_bio = *bio;
> - md_io_acct->start_time = bio_start_io_acct(*bio);
> - md_io_acct->mddev = mddev;
> -
> - clone->bi_end_io = md_end_io_acct;
> - clone->bi_private = md_io_acct;
> - *bio = clone;
> + md_clone_bio(mddev, bio);
> }
> EXPORT_SYMBOL_GPL(md_account_bio);
>
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 11299d94b239..892a598a5029 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -510,7 +510,7 @@ struct mddev {
> struct bio_set sync_set; /* for sync operations like
> * metadata and bitmap writes
> */
> - struct bio_set io_acct_set; /* for raid0 and raid5 io accounting */
> + struct bio_set io_clone_set;
>
> /* Generic flush handling.
> * The last to finish preflush schedules a worker to submit
> @@ -738,7 +738,7 @@ struct md_thread {
> void *private;
> };
>
> -struct md_io_acct {
> +struct md_io_clone {
> struct mddev *mddev;
> struct bio *orig_bio;
> unsigned long start_time;
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 29cf5455d7a5..cef0b400b2ee 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -5468,13 +5468,13 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf,
> */
> static void raid5_align_endio(struct bio *bi)
> {
> - struct md_io_acct *md_io_acct = bi->bi_private;
> - struct bio *raid_bi = md_io_acct->orig_bio;
> + struct md_io_clone *md_io_clone = bi->bi_private;
> + struct bio *raid_bi = md_io_clone->orig_bio;
> struct mddev *mddev;
> struct r5conf *conf;
> struct md_rdev *rdev;
> blk_status_t error = bi->bi_status;
> - unsigned long start_time = md_io_acct->start_time;
> + unsigned long start_time = md_io_clone->start_time;
>
> bio_put(bi);
>
> @@ -5506,7 +5506,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
> struct md_rdev *rdev;
> sector_t sector, end_sector, first_bad;
> int bad_sectors, dd_idx;
> - struct md_io_acct *md_io_acct;
> + struct md_io_clone *md_io_clone;
> bool did_inc;
>
> if (!in_chunk_boundary(mddev, raid_bio)) {
> @@ -5544,15 +5544,15 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
> }
>
> align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
> - &mddev->io_acct_set);
> - md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone);
> + &mddev->io_clone_set);
> + md_io_clone = container_of(align_bio, struct md_io_clone, bio_clone);
> raid_bio->bi_next = (void *)rdev;
> if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
> - md_io_acct->start_time = bio_start_io_acct(raid_bio);
> - md_io_acct->orig_bio = raid_bio;
> + md_io_clone->start_time = bio_start_io_acct(raid_bio);
> + md_io_clone->orig_bio = raid_bio;
>
> align_bio->bi_end_io = raid5_align_endio;
> - align_bio->bi_private = md_io_acct;
> + align_bio->bi_private = md_io_clone;
> align_bio->bi_iter.bi_sector = sector;
>
> /* No reshape active, so we can trust rdev->data_offset */
> --
> 2.39.2
>
Reviewed-by: Xiao Ni <[email protected]>
On Mon, Jun 19, 2023 at 8:50 PM Yu Kuai <[email protected]> wrote:
>
> From: Yu Kuai <[email protected]>
>
> Io will only be accounted as done from raid5_align_endio() if the io
> succeed, and io inflight counter will be leaked if such io failed.
>
> Fix this problem by switching to use md_account_bio() for io accounting.
>
> Signed-off-by: Yu Kuai <[email protected]>
> ---
> drivers/md/raid5.c | 29 ++++++++---------------------
> 1 file changed, 8 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index cef0b400b2ee..4cdb35e54251 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -5468,26 +5468,17 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf,
> */
> static void raid5_align_endio(struct bio *bi)
> {
> - struct md_io_clone *md_io_clone = bi->bi_private;
> - struct bio *raid_bi = md_io_clone->orig_bio;
> - struct mddev *mddev;
> - struct r5conf *conf;
> - struct md_rdev *rdev;
> + struct bio *raid_bi = bi->bi_private;
> + struct md_rdev *rdev = (void *)raid_bi->bi_next;
> + struct mddev *mddev = rdev->mddev;
> + struct r5conf *conf = mddev->private;
> blk_status_t error = bi->bi_status;
> - unsigned long start_time = md_io_clone->start_time;
>
> bio_put(bi);
> -
> - rdev = (void*)raid_bi->bi_next;
> raid_bi->bi_next = NULL;
> - mddev = rdev->mddev;
> - conf = mddev->private;
> -
> rdev_dec_pending(rdev, conf->mddev);
>
> if (!error) {
> - if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue))
> - bio_end_io_acct(raid_bi, start_time);
> bio_endio(raid_bi);
> if (atomic_dec_and_test(&conf->active_aligned_reads))
> wake_up(&conf->wait_for_quiescent);
> @@ -5506,7 +5497,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
> struct md_rdev *rdev;
> sector_t sector, end_sector, first_bad;
> int bad_sectors, dd_idx;
> - struct md_io_clone *md_io_clone;
> bool did_inc;
>
> if (!in_chunk_boundary(mddev, raid_bio)) {
> @@ -5543,16 +5533,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
> return 0;
> }
>
> - align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
> - &mddev->io_clone_set);
> - md_io_clone = container_of(align_bio, struct md_io_clone, bio_clone);
> + md_account_bio(mddev, &raid_bio);
> raid_bio->bi_next = (void *)rdev;
> - if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
> - md_io_clone->start_time = bio_start_io_acct(raid_bio);
> - md_io_clone->orig_bio = raid_bio;
>
> + align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
> + &mddev->bio_set);
> align_bio->bi_end_io = raid5_align_endio;
> - align_bio->bi_private = md_io_clone;
> + align_bio->bi_private = raid_bio;
> align_bio->bi_iter.bi_sector = sector;
>
> /* No reshape active, so we can trust rdev->data_offset */
> --
> 2.39.2
>
Reviewed-by: Xiao Ni <[email protected]>
On Mon, Jun 19, 2023 at 8:50 PM Yu Kuai <[email protected]> wrote:
>
> From: Yu Kuai <[email protected]>
>
> use md_account_bio() to enable io accounting, also make sure
> mddev_suspend() will wait for all io to be done.
>
> Signed-off-by: Yu Kuai <[email protected]>
> ---
> drivers/md/md-multipath.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
> index 92c45be203d7..d22276870283 100644
> --- a/drivers/md/md-multipath.c
> +++ b/drivers/md/md-multipath.c
> @@ -107,6 +107,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
> && md_flush_request(mddev, bio))
> return true;
>
> + md_account_bio(mddev, &bio);
> mp_bh = mempool_alloc(&conf->pool, GFP_NOIO);
>
> mp_bh->master_bio = bio;
> --
> 2.39.2
>
Reviewed-by: Xiao Ni <[email protected]>
On Mon, Jun 19, 2023 at 8:50 PM Yu Kuai <[email protected]> wrote:
>
> From: Yu Kuai <[email protected]>
>
> use md_account_bio() to enable io accounting, also make sure
> mddev_suspend() will wait for all io to be done.
>
> Signed-off-by: Yu Kuai <[email protected]>
> ---
> drivers/md/md-linear.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
> index 4eb72b9dd933..71ac99646827 100644
> --- a/drivers/md/md-linear.c
> +++ b/drivers/md/md-linear.c
> @@ -238,6 +238,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
> bio = split;
> }
>
> + md_account_bio(mddev, &bio);
> bio_set_dev(bio, tmp_dev->rdev->bdev);
> bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
> start_sector + data_offset;
> --
> 2.39.2
>
Reviewed-by: Xiao Ni <[email protected]>
On Mon, Jun 19, 2023 at 8:49 PM Yu Kuai <[email protected]> wrote:
>
> From: Yu Kuai <[email protected]>
>
> Two problems can be fixed this way:
>
> 1) 'active_io' will represent inflight io instead of io that is
> dispatching.
>
> 2) If io accounting is enabled or disabled while io is still inflight,
> bio_start_io_acct() and bio_end_io_acct() is not balanced and io
> inflight counter will be leaked.
>
> Signed-off-by: Yu Kuai <[email protected]>
> ---
> drivers/md/raid1.c | 14 ++++++--------
> drivers/md/raid1.h | 1 -
> 2 files changed, 6 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index dd25832eb045..06fa1580501f 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -304,8 +304,6 @@ static void call_bio_endio(struct r1bio *r1_bio)
> if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
> bio->bi_status = BLK_STS_IOERR;
>
> - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
> - bio_end_io_acct(bio, r1_bio->start_time);
> bio_endio(bio);
> }
>
> @@ -1303,10 +1301,10 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
> }
>
> r1_bio->read_disk = rdisk;
> -
> - if (!r1bio_existed && blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
> - r1_bio->start_time = bio_start_io_acct(bio);
> -
> + if (!r1bio_existed) {
> + md_account_bio(mddev, &bio);
> + r1_bio->master_bio = bio;
> + }
> read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp,
> &mddev->bio_set);
>
> @@ -1500,8 +1498,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
> r1_bio->sectors = max_sectors;
> }
>
> - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
> - r1_bio->start_time = bio_start_io_acct(bio);
> + md_account_bio(mddev, &bio);
> + r1_bio->master_bio = bio;
> atomic_set(&r1_bio->remaining, 1);
> atomic_set(&r1_bio->behind_remaining, 0);
>
> diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
> index 468f189da7a0..14d4211a123a 100644
> --- a/drivers/md/raid1.h
> +++ b/drivers/md/raid1.h
> @@ -157,7 +157,6 @@ struct r1bio {
> sector_t sector;
> int sectors;
> unsigned long state;
> - unsigned long start_time;
> struct mddev *mddev;
> /*
> * original bio going to /dev/mdx
> --
> 2.39.2
>
Hi Kuai
After this patch, raid1 will have one more memory allocation in the
I/O path. Not sure if it can affect performance. Beside this, the
patch is good for me.
Reviewed-by: Xiao Ni <[email protected]>
On Mon, Jun 19, 2023 at 8:50 PM Yu Kuai <[email protected]> wrote:
>
> From: Yu Kuai <[email protected]>
>
> use md_account_bio() to enable io accounting, also make sure
> mddev_suspend() will wait for all io to be done.
>
> Signed-off-by: Yu Kuai <[email protected]>
> ---
> drivers/md/md-faulty.c | 2 ++
> 1 file changed, 2 insertions(+)
>
> diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c
> index 50ad818978a4..a039e8e20f55 100644
> --- a/drivers/md/md-faulty.c
> +++ b/drivers/md/md-faulty.c
> @@ -204,6 +204,8 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio)
> failit = 1;
> }
> }
> +
> + md_account_bio(mddev, &bio);
> if (failit) {
> struct bio *b = bio_alloc_clone(conf->rdev->bdev, bio, GFP_NOIO,
> &mddev->bio_set);
> --
> 2.39.2
>
Reviewed-by: Xiao Ni <[email protected]>
On Mon, Jun 19, 2023 at 8:50 PM Yu Kuai <[email protected]> wrote:
>
> From: Yu Kuai <[email protected]>
>
> Make sure that 'active_io' will represent inflight io instead of io that
> is dispatching, and io accounting from all levels will be consistent.
>
> Signed-off-by: Yu Kuai <[email protected]>
> ---
> drivers/md/raid10.c | 20 +++++++++-----------
> drivers/md/raid10.h | 1 -
> 2 files changed, 9 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 79067769e44b..69f6d7b1e600 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -325,8 +325,6 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
> if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
> bio->bi_status = BLK_STS_IOERR;
>
> - if (r10_bio->start_time)
> - bio_end_io_acct(bio, r10_bio->start_time);
> bio_endio(bio);
> /*
> * Wake up any possible resync thread that waits for the device
> @@ -1172,7 +1170,7 @@ static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf,
> }
>
> static void raid10_read_request(struct mddev *mddev, struct bio *bio,
> - struct r10bio *r10_bio)
> + struct r10bio *r10_bio, bool io_accounting)
> {
> struct r10conf *conf = mddev->private;
> struct bio *read_bio;
> @@ -1243,9 +1241,10 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
> }
> slot = r10_bio->read_slot;
>
> - if (!r10_bio->start_time &&
> - blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
> - r10_bio->start_time = bio_start_io_acct(bio);
> + if (io_accounting) {
> + md_account_bio(mddev, &bio);
> + r10_bio->master_bio = bio;
> + }
> read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set);
>
> r10_bio->devs[slot].bio = read_bio;
> @@ -1543,8 +1542,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
> r10_bio->master_bio = bio;
> }
>
> - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
> - r10_bio->start_time = bio_start_io_acct(bio);
> + md_account_bio(mddev, &bio);
> + r10_bio->master_bio = bio;
> atomic_set(&r10_bio->remaining, 1);
> md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
>
> @@ -1571,12 +1570,11 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
> r10_bio->sector = bio->bi_iter.bi_sector;
> r10_bio->state = 0;
> r10_bio->read_slot = -1;
> - r10_bio->start_time = 0;
> memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) *
> conf->geo.raid_disks);
>
> if (bio_data_dir(bio) == READ)
> - raid10_read_request(mddev, bio, r10_bio);
> + raid10_read_request(mddev, bio, r10_bio, true);
> else
> raid10_write_request(mddev, bio, r10_bio);
> }
> @@ -2985,7 +2983,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
>
> rdev_dec_pending(rdev, mddev);
> r10_bio->state = 0;
> - raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
> + raid10_read_request(mddev, r10_bio->master_bio, r10_bio, false);
> /*
> * allow_barrier after re-submit to ensure no sync io
> * can be issued while regular io pending.
> diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
> index 63e48b11b552..2e75e88d0802 100644
> --- a/drivers/md/raid10.h
> +++ b/drivers/md/raid10.h
> @@ -123,7 +123,6 @@ struct r10bio {
> sector_t sector; /* virtual sector number */
> int sectors;
> unsigned long state;
> - unsigned long start_time;
> struct mddev *mddev;
> /*
> * original bio going to /dev/mdx
> --
> 2.39.2
>
Reviewed-by: Xiao Ni <[email protected]>
Dear Yu,
Thank you for your patch.
Am 19.06.23 um 22:48 schrieb Yu Kuai:
> From: Yu Kuai <[email protected]>
>
> Io will only be accounted as done from raid5_align_endio() if the io
> succeed, and io inflight counter will be leaked if such io failed.
succeed*s* or succeed*ed*?
> Fix this problem by switching to use md_account_bio() for io accounting.
How can this be tested?
> Signed-off-by: Yu Kuai <[email protected]>
> ---
> drivers/md/raid5.c | 29 ++++++++---------------------
> 1 file changed, 8 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index cef0b400b2ee..4cdb35e54251 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -5468,26 +5468,17 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf,
> */
> static void raid5_align_endio(struct bio *bi)
> {
> - struct md_io_clone *md_io_clone = bi->bi_private;
> - struct bio *raid_bi = md_io_clone->orig_bio;
> - struct mddev *mddev;
> - struct r5conf *conf;
> - struct md_rdev *rdev;
> + struct bio *raid_bi = bi->bi_private;
> + struct md_rdev *rdev = (void *)raid_bi->bi_next;
> + struct mddev *mddev = rdev->mddev;
> + struct r5conf *conf = mddev->private;
> blk_status_t error = bi->bi_status;
> - unsigned long start_time = md_io_clone->start_time;
>
> bio_put(bi);
> -
> - rdev = (void*)raid_bi->bi_next;
> raid_bi->bi_next = NULL;
> - mddev = rdev->mddev;
> - conf = mddev->private;
> -
This looks like unnecessary refactoring. No idea what the preferred
style for the subsystem is though. If it is wanted, maybe make it a
separate commit?
> rdev_dec_pending(rdev, conf->mddev);
>
> if (!error) {
> - if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue))
> - bio_end_io_acct(raid_bi, start_time);
> bio_endio(raid_bi);
> if (atomic_dec_and_test(&conf->active_aligned_reads))
> wake_up(&conf->wait_for_quiescent);
> @@ -5506,7 +5497,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
> struct md_rdev *rdev;
> sector_t sector, end_sector, first_bad;
> int bad_sectors, dd_idx;
> - struct md_io_clone *md_io_clone;
> bool did_inc;
>
> if (!in_chunk_boundary(mddev, raid_bio)) {
> @@ -5543,16 +5533,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
> return 0;
> }
>
> - align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
> - &mddev->io_clone_set);
> - md_io_clone = container_of(align_bio, struct md_io_clone, bio_clone);
> + md_account_bio(mddev, &raid_bio);
> raid_bio->bi_next = (void *)rdev;
> - if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
> - md_io_clone->start_time = bio_start_io_acct(raid_bio);
> - md_io_clone->orig_bio = raid_bio;
>
> + align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
> + &mddev->bio_set);
> align_bio->bi_end_io = raid5_align_endio;
> - align_bio->bi_private = md_io_clone;
> + align_bio->bi_private = raid_bio;
> align_bio->bi_iter.bi_sector = sector;
>
> /* No reshape active, so we can trust rdev->data_offset */
Kind regards,
Paul
Hi,
在 2023/06/20 17:07, Xiao Ni 写道:
> On Mon, Jun 19, 2023 at 8:49 PM Yu Kuai <[email protected]> wrote:
>>
>> From: Yu Kuai <[email protected]>
>>
>> Two problems can be fixed this way:
>>
>> 1) 'active_io' will represent inflight io instead of io that is
>> dispatching.
>>
>> 2) If io accounting is enabled or disabled while io is still inflight,
>> bio_start_io_acct() and bio_end_io_acct() is not balanced and io
>> inflight counter will be leaked.
>>
>> Signed-off-by: Yu Kuai <[email protected]>
>> ---
>> drivers/md/raid1.c | 14 ++++++--------
>> drivers/md/raid1.h | 1 -
>> 2 files changed, 6 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>> index dd25832eb045..06fa1580501f 100644
>> --- a/drivers/md/raid1.c
>> +++ b/drivers/md/raid1.c
>> @@ -304,8 +304,6 @@ static void call_bio_endio(struct r1bio *r1_bio)
>> if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
>> bio->bi_status = BLK_STS_IOERR;
>>
>> - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
>> - bio_end_io_acct(bio, r1_bio->start_time);
>> bio_endio(bio);
>> }
>>
>> @@ -1303,10 +1301,10 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
>> }
>>
>> r1_bio->read_disk = rdisk;
>> -
>> - if (!r1bio_existed && blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
>> - r1_bio->start_time = bio_start_io_acct(bio);
>> -
>> + if (!r1bio_existed) {
>> + md_account_bio(mddev, &bio);
>> + r1_bio->master_bio = bio;
>> + }
>> read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp,
>> &mddev->bio_set);
>>
>> @@ -1500,8 +1498,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>> r1_bio->sectors = max_sectors;
>> }
>>
>> - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
>> - r1_bio->start_time = bio_start_io_acct(bio);
>> + md_account_bio(mddev, &bio);
>> + r1_bio->master_bio = bio;
>> atomic_set(&r1_bio->remaining, 1);
>> atomic_set(&r1_bio->behind_remaining, 0);
>>
>> diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
>> index 468f189da7a0..14d4211a123a 100644
>> --- a/drivers/md/raid1.h
>> +++ b/drivers/md/raid1.h
>> @@ -157,7 +157,6 @@ struct r1bio {
>> sector_t sector;
>> int sectors;
>> unsigned long state;
>> - unsigned long start_time;
>> struct mddev *mddev;
>> /*
>> * original bio going to /dev/mdx
>> --
>> 2.39.2
>>
>
> Hi Kuai
>
> After this patch, raid1 will have one more memory allocation in the
> I/O path. Not sure if it can affect performance. Beside this, the
> patch is good for me.
Yes, I'm aware of this additional memory allocation, however, raid1(and
similar to other levels) already need to allocate r1bio and some bios(1
for read, and copies for write), so this is not a none -> new case,
it's just a allocate 2 -> allocate 3 case.
I think performance under memory pressure are both bad with or without
this patch, and one one bio clone latency without memory reclaim should
be fine.
Thanks,
Kuai
>
> Reviewed-by: Xiao Ni <[email protected]>
>
> .
>
On Mon, Jun 19, 2023 at 5:49 AM Yu Kuai <[email protected]> wrote:
>
> From: Yu Kuai <[email protected]>
>
> This patchset do following things:
> - make io accounting for all raid levels consistent, patch 1, 3-5;
> - enable io accounting for all raid levels, patch 6-8;
> - hold 'active_io' before io start to dispatch, and release 'active_io'
> when io is done, make mddev_suspend() will wait for io to be done, patch 2
This set looks good to me (pending more tests). Please respin with
feedback from folks. We need to move fast to catch the upcoming
merge window.
Thanks,
Song
Hi,
在 2023/06/20 16:35, Xiao Ni 写道:
> On Mon, Jun 19, 2023 at 8:50 PM Yu Kuai <[email protected]> wrote:
>>
>> From: Yu Kuai <[email protected]>
>>
>> 'io_acct_set' is only used for raid0 and raid456, prepare to use it for
>> raid1 and raid10, so that io accounting from different levels can be
>> consistent.
>>
>> By the way, follow up patches will also use this io clone mechanism to
>> make sure 'active_io' represents in flight io, not io that is dispatching,
>> so that mddev_suspend will wait for io to be done as desgined.
>
> Hi Kuai
>
> typo error: s/desgined/designed/g
>
> Before this patch the personality uses ->quiesce method to wait until
> all inflight ios come back. But I like this solution. It makes the
> codes simpler. Not sure if it can cause problems because it changes
> the meaning of ->active_io. I'm doing regression tests to check.
Yes, actually this is the first step that I'm tring to synchronize io
and raid configuration, and I'm planing to use 'active_io' to check if
normal io exist, following are follow up plans:
1. add a new counter(perhaps we can reuse queue->q_usage_counter), and
grab it for sync io;
2. refactor and expand 'pers->quiesce', allow normal io and sync io
to be both quiesced;
3. call 'pers->quiesce' before raid configuration;
4. add a new helper to iterate rdev, grab the new counter first;
5. a lot of cleanups;
Thanks,
Kuai
>
>>
>> Signed-off-by: Yu Kuai <[email protected]>
>> ---
>> drivers/md/md.c | 27 ++++++++++-----------------
>> drivers/md/md.h | 2 --
>> drivers/md/raid0.c | 16 ++--------------
>> drivers/md/raid5.c | 41 +++++++++++------------------------------
>> 4 files changed, 23 insertions(+), 63 deletions(-)
>>
>> diff --git a/drivers/md/md.c b/drivers/md/md.c
>> index 8d62f85d2ab0..42347289195a 100644
>> --- a/drivers/md/md.c
>> +++ b/drivers/md/md.c
>> @@ -5886,6 +5886,13 @@ int md_run(struct mddev *mddev)
>> goto exit_bio_set;
>> }
>>
>> + if (!bioset_initialized(&mddev->io_acct_set)) {
>> + err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
>> + offsetof(struct md_io_acct, bio_clone), 0);
>> + if (err)
>> + goto exit_sync_set;
>> + }
>> +
>> spin_lock(&pers_lock);
>> pers = find_pers(mddev->level, mddev->clevel);
>> if (!pers || !try_module_get(pers->owner)) {
>> @@ -6063,6 +6070,8 @@ int md_run(struct mddev *mddev)
>> module_put(pers->owner);
>> md_bitmap_destroy(mddev);
>> abort:
>> + bioset_exit(&mddev->io_acct_set);
>> +exit_sync_set:
>> bioset_exit(&mddev->sync_set);
>> exit_bio_set:
>> bioset_exit(&mddev->bio_set);
>> @@ -6286,6 +6295,7 @@ static void __md_stop(struct mddev *mddev)
>> percpu_ref_exit(&mddev->active_io);
>> bioset_exit(&mddev->bio_set);
>> bioset_exit(&mddev->sync_set);
>> + bioset_exit(&mddev->io_acct_set);
>> }
>>
>> void md_stop(struct mddev *mddev)
>> @@ -8651,23 +8661,6 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
>> }
>> EXPORT_SYMBOL_GPL(md_submit_discard_bio);
>>
>> -int acct_bioset_init(struct mddev *mddev)
>> -{
>> - int err = 0;
>> -
>> - if (!bioset_initialized(&mddev->io_acct_set))
>> - err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
>> - offsetof(struct md_io_acct, bio_clone), 0);
>> - return err;
>> -}
>> -EXPORT_SYMBOL_GPL(acct_bioset_init);
>> -
>> -void acct_bioset_exit(struct mddev *mddev)
>> -{
>> - bioset_exit(&mddev->io_acct_set);
>> -}
>> -EXPORT_SYMBOL_GPL(acct_bioset_exit);
>> -
>> static void md_end_io_acct(struct bio *bio)
>> {
>> struct md_io_acct *md_io_acct = bio->bi_private;
>> diff --git a/drivers/md/md.h b/drivers/md/md.h
>> index 7cab9c7c45b8..11299d94b239 100644
>> --- a/drivers/md/md.h
>> +++ b/drivers/md/md.h
>> @@ -776,8 +776,6 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
>> extern void md_finish_reshape(struct mddev *mddev);
>> void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
>> struct bio *bio, sector_t start, sector_t size);
>> -int acct_bioset_init(struct mddev *mddev);
>> -void acct_bioset_exit(struct mddev *mddev);
>> void md_account_bio(struct mddev *mddev, struct bio **bio);
>>
>> extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
>> diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
>> index f8ee9a95e25d..38d9209cada1 100644
>> --- a/drivers/md/raid0.c
>> +++ b/drivers/md/raid0.c
>> @@ -365,7 +365,6 @@ static void raid0_free(struct mddev *mddev, void *priv)
>> struct r0conf *conf = priv;
>>
>> free_conf(mddev, conf);
>> - acct_bioset_exit(mddev);
>> }
>>
>> static int raid0_run(struct mddev *mddev)
>> @@ -380,16 +379,11 @@ static int raid0_run(struct mddev *mddev)
>> if (md_check_no_bitmap(mddev))
>> return -EINVAL;
>>
>> - if (acct_bioset_init(mddev)) {
>> - pr_err("md/raid0:%s: alloc acct bioset failed.\n", mdname(mddev));
>> - return -ENOMEM;
>> - }
>> -
>> /* if private is not null, we are here after takeover */
>> if (mddev->private == NULL) {
>> ret = create_strip_zones(mddev, &conf);
>> if (ret < 0)
>> - goto exit_acct_set;
>> + return ret;
>> mddev->private = conf;
>> }
>> conf = mddev->private;
>> @@ -420,15 +414,9 @@ static int raid0_run(struct mddev *mddev)
>>
>> ret = md_integrity_register(mddev);
>> if (ret)
>> - goto free;
>> + free_conf(mddev, conf);
>>
>> return ret;
>> -
>> -free:
>> - free_conf(mddev, conf);
>> -exit_acct_set:
>> - acct_bioset_exit(mddev);
>> - return ret;
>> }
>>
>> static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>> index f8bc74e16811..29cf5455d7a5 100644
>> --- a/drivers/md/raid5.c
>> +++ b/drivers/md/raid5.c
>> @@ -7787,19 +7787,12 @@ static int raid5_run(struct mddev *mddev)
>> struct md_rdev *rdev;
>> struct md_rdev *journal_dev = NULL;
>> sector_t reshape_offset = 0;
>> - int i, ret = 0;
>> + int i;
>> long long min_offset_diff = 0;
>> int first = 1;
>>
>> - if (acct_bioset_init(mddev)) {
>> - pr_err("md/raid456:%s: alloc acct bioset failed.\n", mdname(mddev));
>> + if (mddev_init_writes_pending(mddev) < 0)
>> return -ENOMEM;
>> - }
>> -
>> - if (mddev_init_writes_pending(mddev) < 0) {
>> - ret = -ENOMEM;
>> - goto exit_acct_set;
>> - }
>>
>> if (mddev->recovery_cp != MaxSector)
>> pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
>> @@ -7830,8 +7823,7 @@ static int raid5_run(struct mddev *mddev)
>> (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
>> pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
>> mdname(mddev));
>> - ret = -EINVAL;
>> - goto exit_acct_set;
>> + return -EINVAL;
>> }
>>
>> if (mddev->reshape_position != MaxSector) {
>> @@ -7856,15 +7848,13 @@ static int raid5_run(struct mddev *mddev)
>> if (journal_dev) {
>> pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
>> mdname(mddev));
>> - ret = -EINVAL;
>> - goto exit_acct_set;
>> + return -EINVAL;
>> }
>>
>> if (mddev->new_level != mddev->level) {
>> pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
>> mdname(mddev));
>> - ret = -EINVAL;
>> - goto exit_acct_set;
>> + return -EINVAL;
>> }
>> old_disks = mddev->raid_disks - mddev->delta_disks;
>> /* reshape_position must be on a new-stripe boundary, and one
>> @@ -7880,8 +7870,7 @@ static int raid5_run(struct mddev *mddev)
>> if (sector_div(here_new, chunk_sectors * new_data_disks)) {
>> pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
>> mdname(mddev));
>> - ret = -EINVAL;
>> - goto exit_acct_set;
>> + return -EINVAL;
>> }
>> reshape_offset = here_new * chunk_sectors;
>> /* here_new is the stripe we will write to */
>> @@ -7903,8 +7892,7 @@ static int raid5_run(struct mddev *mddev)
>> else if (mddev->ro == 0) {
>> pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
>> mdname(mddev));
>> - ret = -EINVAL;
>> - goto exit_acct_set;
>> + return -EINVAL;
>> }
>> } else if (mddev->reshape_backwards
>> ? (here_new * chunk_sectors + min_offset_diff <=
>> @@ -7914,8 +7902,7 @@ static int raid5_run(struct mddev *mddev)
>> /* Reading from the same stripe as writing to - bad */
>> pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
>> mdname(mddev));
>> - ret = -EINVAL;
>> - goto exit_acct_set;
>> + return -EINVAL;
>> }
>> pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
>> /* OK, we should be able to continue; */
>> @@ -7939,10 +7926,8 @@ static int raid5_run(struct mddev *mddev)
>> else
>> conf = mddev->private;
>>
>> - if (IS_ERR(conf)) {
>> - ret = PTR_ERR(conf);
>> - goto exit_acct_set;
>> - }
>> + if (IS_ERR(conf))
>> + return PTR_ERR(conf);
>>
>> if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
>> if (!journal_dev) {
>> @@ -8140,10 +8125,7 @@ static int raid5_run(struct mddev *mddev)
>> free_conf(conf);
>> mddev->private = NULL;
>> pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
>> - ret = -EIO;
>> -exit_acct_set:
>> - acct_bioset_exit(mddev);
>> - return ret;
>> + return -EIO;
>> }
>>
>> static void raid5_free(struct mddev *mddev, void *priv)
>> @@ -8151,7 +8133,6 @@ static void raid5_free(struct mddev *mddev, void *priv)
>> struct r5conf *conf = priv;
>>
>> free_conf(conf);
>> - acct_bioset_exit(mddev);
>> mddev->to_remove = &raid5_attrs_group;
>> }
>>
>> --
>> 2.39.2
>>
>
> The patch is good for me.
>
> Reviewed-by: Xiao Ni <[email protected]>
>
> .
>
Hi,
在 2023/06/20 17:57, Paul Menzel 写道:
> Dear Yu,
>
>
> Thank you for your patch.
>
> Am 19.06.23 um 22:48 schrieb Yu Kuai:
>> From: Yu Kuai <[email protected]>
>>
>> Io will only be accounted as done from raid5_align_endio() if the io
>> succeed, and io inflight counter will be leaked if such io failed.
>
> succeed*s* or succeed*ed*?
I'll up date this.
>
>> Fix this problem by switching to use md_account_bio() for io accounting.
>
> How can this be tested?
>
>> Signed-off-by: Yu Kuai <[email protected]>
>> ---
>> drivers/md/raid5.c | 29 ++++++++---------------------
>> 1 file changed, 8 insertions(+), 21 deletions(-)
>>
>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>> index cef0b400b2ee..4cdb35e54251 100644
>> --- a/drivers/md/raid5.c
>> +++ b/drivers/md/raid5.c
>> @@ -5468,26 +5468,17 @@ static struct bio
>> *remove_bio_from_retry(struct r5conf *conf,
>> */
>> static void raid5_align_endio(struct bio *bi)
>> {
>> - struct md_io_clone *md_io_clone = bi->bi_private;
>> - struct bio *raid_bi = md_io_clone->orig_bio;
>> - struct mddev *mddev;
>> - struct r5conf *conf;
>> - struct md_rdev *rdev;
>> + struct bio *raid_bi = bi->bi_private;
>> + struct md_rdev *rdev = (void *)raid_bi->bi_next;
>> + struct mddev *mddev = rdev->mddev;
>> + struct r5conf *conf = mddev->private;
>> blk_status_t error = bi->bi_status;
>> - unsigned long start_time = md_io_clone->start_time;
>> bio_put(bi);
>> -
>> - rdev = (void*)raid_bi->bi_next;
>> raid_bi->bi_next = NULL;
>> - mddev = rdev->mddev;
>> - conf = mddev->private;
>> -
>
> This looks like unnecessary refactoring. No idea what the preferred
> style for the subsystem is though. If it is wanted, maybe make it a
> separate commit?
You mean that I initialize 'rdev' and 'mdev' while declaration?
I think code is cleaner this way, and this is too tiny to make a patch
for this... I will keep this for now. ????
Thanks,
Kuai
>
>> rdev_dec_pending(rdev, conf->mddev);
>> if (!error) {
>> - if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue))
>> - bio_end_io_acct(raid_bi, start_time);
>> bio_endio(raid_bi);
>> if (atomic_dec_and_test(&conf->active_aligned_reads))
>> wake_up(&conf->wait_for_quiescent);
>> @@ -5506,7 +5497,6 @@ static int raid5_read_one_chunk(struct mddev
>> *mddev, struct bio *raid_bio)
>> struct md_rdev *rdev;
>> sector_t sector, end_sector, first_bad;
>> int bad_sectors, dd_idx;
>> - struct md_io_clone *md_io_clone;
>> bool did_inc;
>> if (!in_chunk_boundary(mddev, raid_bio)) {
>> @@ -5543,16 +5533,13 @@ static int raid5_read_one_chunk(struct mddev
>> *mddev, struct bio *raid_bio)
>> return 0;
>> }
>> - align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
>> - &mddev->io_clone_set);
>> - md_io_clone = container_of(align_bio, struct md_io_clone,
>> bio_clone);
>> + md_account_bio(mddev, &raid_bio);
>> raid_bio->bi_next = (void *)rdev;
>> - if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
>> - md_io_clone->start_time = bio_start_io_acct(raid_bio);
>> - md_io_clone->orig_bio = raid_bio;
>> + align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
>> + &mddev->bio_set);
>> align_bio->bi_end_io = raid5_align_endio;
>> - align_bio->bi_private = md_io_clone;
>> + align_bio->bi_private = raid_bio;
>> align_bio->bi_iter.bi_sector = sector;
>> /* No reshape active, so we can trust rdev->data_offset */
>
>
> Kind regards,
>
> Paul
>
> .
>