2023-02-23 08:49:08

by Yu Kuai

[permalink] [raw]
Subject: [PATCH -next RFC] block: count 'ios' and 'sectors' when io is done for bio-based device

From: Yu Kuai <[email protected]>

While using iostat for raid, I observed very strange 'await'
occasionally, and turns out it's due to that 'ios' and 'sectors' is
counted in bdev_start_io_acct(), while 'nsecs' is counted in
bdev_end_io_acct(). I'm not sure why they are ccounted like that
but I think this behaviour is obviously wrong because user will get
wrong disk stats.

Fix the problem by counting 'ios' and 'sectors' when io is done, like
what rq-based device does.

Fixes: 394ffa503bc4 ("blk: introduce generic io stat accounting help function")
Signed-off-by: Yu Kuai <[email protected]>
---
block/blk-core.c | 16 ++++++----------
drivers/md/dm.c | 6 +++---
drivers/nvme/host/multipath.c | 8 ++++----
include/linux/blkdev.h | 5 ++---
4 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 82b5b2c53f1e..fe1d320f5f07 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -953,16 +953,11 @@ void update_io_ticks(struct block_device *part, unsigned long now, bool end)
}
}

-unsigned long bdev_start_io_acct(struct block_device *bdev,
- unsigned int sectors, enum req_op op,
+unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op,
unsigned long start_time)
{
- const int sgrp = op_stat_group(op);
-
part_stat_lock();
update_io_ticks(bdev, start_time, false);
- part_stat_inc(bdev, ios[sgrp]);
- part_stat_add(bdev, sectors[sgrp], sectors);
part_stat_local_inc(bdev, in_flight[op_is_write(op)]);
part_stat_unlock();

@@ -978,13 +973,12 @@ EXPORT_SYMBOL(bdev_start_io_acct);
*/
unsigned long bio_start_io_acct(struct bio *bio)
{
- return bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio),
- bio_op(bio), jiffies);
+ return bdev_start_io_acct(bio->bi_bdev, bio_op(bio), jiffies);
}
EXPORT_SYMBOL_GPL(bio_start_io_acct);

void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
- unsigned long start_time)
+ unsigned int sectors, unsigned long start_time)
{
const int sgrp = op_stat_group(op);
unsigned long now = READ_ONCE(jiffies);
@@ -992,6 +986,8 @@ void bdev_end_io_acct(struct block_device *bdev, enum req_op op,

part_stat_lock();
update_io_ticks(bdev, now, true);
+ part_stat_inc(bdev, ios[sgrp]);
+ part_stat_add(bdev, sectors[sgrp], sectors);
part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration));
part_stat_local_dec(bdev, in_flight[op_is_write(op)]);
part_stat_unlock();
@@ -1001,7 +997,7 @@ EXPORT_SYMBOL(bdev_end_io_acct);
void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
struct block_device *orig_bdev)
{
- bdev_end_io_acct(orig_bdev, bio_op(bio), start_time);
+ bdev_end_io_acct(orig_bdev, bio_op(bio), bio_sectors(bio), start_time);
}
EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped);

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index eace45a18d45..f5cc330bb549 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -512,10 +512,10 @@ static void dm_io_acct(struct dm_io *io, bool end)
sectors = io->sectors;

if (!end)
- bdev_start_io_acct(bio->bi_bdev, sectors, bio_op(bio),
- start_time);
+ bdev_start_io_acct(bio->bi_bdev, bio_op(bio), start_time);
else
- bdev_end_io_acct(bio->bi_bdev, bio_op(bio), start_time);
+ bdev_end_io_acct(bio->bi_bdev, bio_op(bio), sectors,
+ start_time);

if (static_branch_unlikely(&stats_enabled) &&
unlikely(dm_stats_used(&md->stats))) {
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index fc39d01e7b63..9171452e2f6d 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -123,9 +123,8 @@ void nvme_mpath_start_request(struct request *rq)
return;

nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
- nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0,
- blk_rq_bytes(rq) >> SECTOR_SHIFT,
- req_op(rq), jiffies);
+ nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq),
+ jiffies);
}
EXPORT_SYMBOL_GPL(nvme_mpath_start_request);

@@ -136,7 +135,8 @@ void nvme_mpath_end_request(struct request *rq)
if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
return;
bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
- nvme_req(rq)->start_time);
+ blk_rq_bytes(rq) >> SECTOR_SHIFT,
+ nvme_req(rq)->start_time);
}

void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d1aee08f8c18..941304f17492 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1446,11 +1446,10 @@ static inline void blk_wake_io_task(struct task_struct *waiter)
wake_up_process(waiter);
}

-unsigned long bdev_start_io_acct(struct block_device *bdev,
- unsigned int sectors, enum req_op op,
+unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op,
unsigned long start_time);
void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
- unsigned long start_time);
+ unsigned int sectors, unsigned long start_time);

unsigned long bio_start_io_acct(struct bio *bio);
void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
--
2.31.1



2023-02-28 01:01:43

by Yu Kuai

[permalink] [raw]
Subject: Re: [PATCH -next RFC] block: count 'ios' and 'sectors' when io is done for bio-based device

Hi,

friendly ping ...

Thanks,
Kuai

?? 2023/02/23 17:12, Yu Kuai д??:
> From: Yu Kuai <[email protected]>
>
> While using iostat for raid, I observed very strange 'await'
> occasionally, and turns out it's due to that 'ios' and 'sectors' is
> counted in bdev_start_io_acct(), while 'nsecs' is counted in
> bdev_end_io_acct(). I'm not sure why they are ccounted like that
> but I think this behaviour is obviously wrong because user will get
> wrong disk stats.
>
> Fix the problem by counting 'ios' and 'sectors' when io is done, like
> what rq-based device does.
>
> Fixes: 394ffa503bc4 ("blk: introduce generic io stat accounting help function")
> Signed-off-by: Yu Kuai <[email protected]>
> ---
> block/blk-core.c | 16 ++++++----------
> drivers/md/dm.c | 6 +++---
> drivers/nvme/host/multipath.c | 8 ++++----
> include/linux/blkdev.h | 5 ++---
> 4 files changed, 15 insertions(+), 20 deletions(-)
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 82b5b2c53f1e..fe1d320f5f07 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -953,16 +953,11 @@ void update_io_ticks(struct block_device *part, unsigned long now, bool end)
> }
> }
>
> -unsigned long bdev_start_io_acct(struct block_device *bdev,
> - unsigned int sectors, enum req_op op,
> +unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op,
> unsigned long start_time)
> {
> - const int sgrp = op_stat_group(op);
> -
> part_stat_lock();
> update_io_ticks(bdev, start_time, false);
> - part_stat_inc(bdev, ios[sgrp]);
> - part_stat_add(bdev, sectors[sgrp], sectors);
> part_stat_local_inc(bdev, in_flight[op_is_write(op)]);
> part_stat_unlock();
>
> @@ -978,13 +973,12 @@ EXPORT_SYMBOL(bdev_start_io_acct);
> */
> unsigned long bio_start_io_acct(struct bio *bio)
> {
> - return bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio),
> - bio_op(bio), jiffies);
> + return bdev_start_io_acct(bio->bi_bdev, bio_op(bio), jiffies);
> }
> EXPORT_SYMBOL_GPL(bio_start_io_acct);
>
> void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
> - unsigned long start_time)
> + unsigned int sectors, unsigned long start_time)
> {
> const int sgrp = op_stat_group(op);
> unsigned long now = READ_ONCE(jiffies);
> @@ -992,6 +986,8 @@ void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
>
> part_stat_lock();
> update_io_ticks(bdev, now, true);
> + part_stat_inc(bdev, ios[sgrp]);
> + part_stat_add(bdev, sectors[sgrp], sectors);
> part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration));
> part_stat_local_dec(bdev, in_flight[op_is_write(op)]);
> part_stat_unlock();
> @@ -1001,7 +997,7 @@ EXPORT_SYMBOL(bdev_end_io_acct);
> void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
> struct block_device *orig_bdev)
> {
> - bdev_end_io_acct(orig_bdev, bio_op(bio), start_time);
> + bdev_end_io_acct(orig_bdev, bio_op(bio), bio_sectors(bio), start_time);
> }
> EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped);
>
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index eace45a18d45..f5cc330bb549 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -512,10 +512,10 @@ static void dm_io_acct(struct dm_io *io, bool end)
> sectors = io->sectors;
>
> if (!end)
> - bdev_start_io_acct(bio->bi_bdev, sectors, bio_op(bio),
> - start_time);
> + bdev_start_io_acct(bio->bi_bdev, bio_op(bio), start_time);
> else
> - bdev_end_io_acct(bio->bi_bdev, bio_op(bio), start_time);
> + bdev_end_io_acct(bio->bi_bdev, bio_op(bio), sectors,
> + start_time);
>
> if (static_branch_unlikely(&stats_enabled) &&
> unlikely(dm_stats_used(&md->stats))) {
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index fc39d01e7b63..9171452e2f6d 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -123,9 +123,8 @@ void nvme_mpath_start_request(struct request *rq)
> return;
>
> nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
> - nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0,
> - blk_rq_bytes(rq) >> SECTOR_SHIFT,
> - req_op(rq), jiffies);
> + nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq),
> + jiffies);
> }
> EXPORT_SYMBOL_GPL(nvme_mpath_start_request);
>
> @@ -136,7 +135,8 @@ void nvme_mpath_end_request(struct request *rq)
> if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
> return;
> bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
> - nvme_req(rq)->start_time);
> + blk_rq_bytes(rq) >> SECTOR_SHIFT,
> + nvme_req(rq)->start_time);
> }
>
> void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index d1aee08f8c18..941304f17492 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -1446,11 +1446,10 @@ static inline void blk_wake_io_task(struct task_struct *waiter)
> wake_up_process(waiter);
> }
>
> -unsigned long bdev_start_io_acct(struct block_device *bdev,
> - unsigned int sectors, enum req_op op,
> +unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op,
> unsigned long start_time);
> void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
> - unsigned long start_time);
> + unsigned int sectors, unsigned long start_time);
>
> unsigned long bio_start_io_acct(struct bio *bio);
> void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
>


2023-03-07 10:45:41

by Yu Kuai

[permalink] [raw]
Subject: Re: [PATCH -next RFC] block: count 'ios' and 'sectors' when io is done for bio-based device

Hi,

在 2023/02/28 9:01, Yu Kuai 写道:
> Hi,
>
> friendly ping ...
>
> Thanks,
> Kuai
>
> 在 2023/02/23 17:12, Yu Kuai 写道:
>> From: Yu Kuai <[email protected]>
>>
>> While using iostat for raid, I observed very strange 'await'
>> occasionally, and turns out it's due to that 'ios' and 'sectors' is
>> counted in bdev_start_io_acct(), while 'nsecs' is counted in
>> bdev_end_io_acct(). I'm not sure why they are ccounted like that
>> but I think this behaviour is obviously wrong because user will get
>> wrong disk stats.
>>
>> Fix the problem by counting 'ios' and 'sectors' when io is done, like
>> what rq-based device does.

Can anyone help to review this change? Or is there any reason to count
'ios' and 'sectors' when io is started?

Thanks,
Kuai
>>
>> Fixes: 394ffa503bc4 ("blk: introduce generic io stat accounting help
>> function")
>> Signed-off-by: Yu Kuai <[email protected]>
>> ---
>>   block/blk-core.c              | 16 ++++++----------
>>   drivers/md/dm.c               |  6 +++---
>>   drivers/nvme/host/multipath.c |  8 ++++----
>>   include/linux/blkdev.h        |  5 ++---
>>   4 files changed, 15 insertions(+), 20 deletions(-)
>>
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index 82b5b2c53f1e..fe1d320f5f07 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -953,16 +953,11 @@ void update_io_ticks(struct block_device *part,
>> unsigned long now, bool end)
>>       }
>>   }
>> -unsigned long bdev_start_io_acct(struct block_device *bdev,
>> -                 unsigned int sectors, enum req_op op,
>> +unsigned long bdev_start_io_acct(struct block_device *bdev, enum
>> req_op op,
>>                    unsigned long start_time)
>>   {
>> -    const int sgrp = op_stat_group(op);
>> -
>>       part_stat_lock();
>>       update_io_ticks(bdev, start_time, false);
>> -    part_stat_inc(bdev, ios[sgrp]);
>> -    part_stat_add(bdev, sectors[sgrp], sectors);
>>       part_stat_local_inc(bdev, in_flight[op_is_write(op)]);
>>       part_stat_unlock();
>> @@ -978,13 +973,12 @@ EXPORT_SYMBOL(bdev_start_io_acct);
>>    */
>>   unsigned long bio_start_io_acct(struct bio *bio)
>>   {
>> -    return bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio),
>> -                  bio_op(bio), jiffies);
>> +    return bdev_start_io_acct(bio->bi_bdev, bio_op(bio), jiffies);
>>   }
>>   EXPORT_SYMBOL_GPL(bio_start_io_acct);
>>   void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
>> -              unsigned long start_time)
>> +              unsigned int sectors, unsigned long start_time)
>>   {
>>       const int sgrp = op_stat_group(op);
>>       unsigned long now = READ_ONCE(jiffies);
>> @@ -992,6 +986,8 @@ void bdev_end_io_acct(struct block_device *bdev,
>> enum req_op op,
>>       part_stat_lock();
>>       update_io_ticks(bdev, now, true);
>> +    part_stat_inc(bdev, ios[sgrp]);
>> +    part_stat_add(bdev, sectors[sgrp], sectors);
>>       part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration));
>>       part_stat_local_dec(bdev, in_flight[op_is_write(op)]);
>>       part_stat_unlock();
>> @@ -1001,7 +997,7 @@ EXPORT_SYMBOL(bdev_end_io_acct);
>>   void bio_end_io_acct_remapped(struct bio *bio, unsigned long
>> start_time,
>>                     struct block_device *orig_bdev)
>>   {
>> -    bdev_end_io_acct(orig_bdev, bio_op(bio), start_time);
>> +    bdev_end_io_acct(orig_bdev, bio_op(bio), bio_sectors(bio),
>> start_time);
>>   }
>>   EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped);
>> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
>> index eace45a18d45..f5cc330bb549 100644
>> --- a/drivers/md/dm.c
>> +++ b/drivers/md/dm.c
>> @@ -512,10 +512,10 @@ static void dm_io_acct(struct dm_io *io, bool end)
>>           sectors = io->sectors;
>>       if (!end)
>> -        bdev_start_io_acct(bio->bi_bdev, sectors, bio_op(bio),
>> -                   start_time);
>> +        bdev_start_io_acct(bio->bi_bdev, bio_op(bio), start_time);
>>       else
>> -        bdev_end_io_acct(bio->bi_bdev, bio_op(bio), start_time);
>> +        bdev_end_io_acct(bio->bi_bdev, bio_op(bio), sectors,
>> +                 start_time);
>>       if (static_branch_unlikely(&stats_enabled) &&
>>           unlikely(dm_stats_used(&md->stats))) {
>> diff --git a/drivers/nvme/host/multipath.c
>> b/drivers/nvme/host/multipath.c
>> index fc39d01e7b63..9171452e2f6d 100644
>> --- a/drivers/nvme/host/multipath.c
>> +++ b/drivers/nvme/host/multipath.c
>> @@ -123,9 +123,8 @@ void nvme_mpath_start_request(struct request *rq)
>>           return;
>>       nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
>> -    nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0,
>> -                    blk_rq_bytes(rq) >> SECTOR_SHIFT,
>> -                    req_op(rq), jiffies);
>> +    nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0,
>> req_op(rq),
>> +                              jiffies);
>>   }
>>   EXPORT_SYMBOL_GPL(nvme_mpath_start_request);
>> @@ -136,7 +135,8 @@ void nvme_mpath_end_request(struct request *rq)
>>       if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
>>           return;
>>       bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
>> -        nvme_req(rq)->start_time);
>> +             blk_rq_bytes(rq) >> SECTOR_SHIFT,
>> +             nvme_req(rq)->start_time);
>>   }
>>   void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>> index d1aee08f8c18..941304f17492 100644
>> --- a/include/linux/blkdev.h
>> +++ b/include/linux/blkdev.h
>> @@ -1446,11 +1446,10 @@ static inline void blk_wake_io_task(struct
>> task_struct *waiter)
>>           wake_up_process(waiter);
>>   }
>> -unsigned long bdev_start_io_acct(struct block_device *bdev,
>> -                 unsigned int sectors, enum req_op op,
>> +unsigned long bdev_start_io_acct(struct block_device *bdev, enum
>> req_op op,
>>                    unsigned long start_time);
>>   void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
>> -        unsigned long start_time);
>> +              unsigned int sectors, unsigned long start_time);
>>   unsigned long bio_start_io_acct(struct bio *bio);
>>   void bio_end_io_acct_remapped(struct bio *bio, unsigned long
>> start_time,
>>
>
> .
>


2023-03-15 15:25:45

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH -next RFC] block: count 'ios' and 'sectors' when io is done for bio-based device


On Thu, 23 Feb 2023 17:12:26 +0800, Yu Kuai wrote:
> While using iostat for raid, I observed very strange 'await'
> occasionally, and turns out it's due to that 'ios' and 'sectors' is
> counted in bdev_start_io_acct(), while 'nsecs' is counted in
> bdev_end_io_acct(). I'm not sure why they are ccounted like that
> but I think this behaviour is obviously wrong because user will get
> wrong disk stats.
>
> [...]

Applied, thanks!

[1/1] block: count 'ios' and 'sectors' when io is done for bio-based device
commit: 5f27571382ca42daa3e3d40d1b252bf18c2b61d2

Best regards,
--
Jens Axboe