From: Yu Kuai <[email protected]>
Changes in v2:
- just replace blk_mq_in_flight() with part_in_flight() for
diskstats_show() and part_stat_show() in patch 2;
Yu Kuai (2):
block: support to account io_ticks precisely
block: fix that util can be greater than 100%
block/blk-core.c | 9 +++++----
block/blk-merge.c | 2 ++
block/blk-mq.c | 4 ++++
block/blk.h | 1 +
block/genhd.c | 14 +++-----------
5 files changed, 15 insertions(+), 15 deletions(-)
--
2.39.2
From: Yu Kuai <[email protected]>
util is the percentage of time that the disk has IO inflight, and
theoretically it should not be greater than 100%. However, there is a gap
for rq-based disks: io_ticks is updated when the rq is allocated, but
until such an rq is dispatched to the driver it is not accounted as
inflight by blk_mq_start_request(), hence diskstats_show()/part_stat_show()
will not update io_ticks. For example:
1) at t0, a new IO is issued, the rq is allocated, and
blk_account_io_start() updates io_ticks;
2) something goes wrong with the driver, and the rq can't be dispatched;
3) at t0 + 10s, the driver recovers, the rq is dispatched and done, and
io_ticks is updated;
Then, if the user is running "iostat 1" to monitor "util", between t0 and
t0+9s util will be zero, and between t0+9s and t0+10s util will be 1000%.
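For reference, a minimal userspace sketch (not part of this patch) of the
arithmetic a monitoring tool like iostat performs: util is taken as
delta(io_ticks) / interval, with io_ticks read from /proc/diskstats
(field 13, "time spent doing I/Os", in ms); util_percent() is a made-up
helper for illustration.

#include <stdio.h>

/* util% = delta(io_ticks) / interval, both in milliseconds */
static double util_percent(unsigned long ticks_prev, unsigned long ticks_now,
                           unsigned long interval_ms)
{
        return 100.0 * (ticks_now - ticks_prev) / interval_ms;
}

int main(void)
{
        /* t0 .. t0+9s: io_ticks has not moved yet, so util reads 0% */
        printf("%.0f%%\n", util_percent(1000, 1000, 1000));

        /*
         * t0+9s .. t0+10s: the stuck rq completes and io_ticks jumps by
         * the whole 10000ms at once, so a 1s interval reads 1000%
         */
        printf("%.0f%%\n", util_percent(1000, 11000, 1000));
        return 0;
}

This prints 0% and then 1000%, matching the scenario above.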
Fix this problem by updating io_ticks from diskstats_show() and
part_stat_show() if there are rqs allocated.
Signed-off-by: Yu Kuai <[email protected]>
---
block/genhd.c | 12 ++----------
1 file changed, 2 insertions(+), 10 deletions(-)
diff --git a/block/genhd.c b/block/genhd.c
index 8f1163d2d171..7f39fbe60753 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -951,15 +951,10 @@ ssize_t part_stat_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct block_device *bdev = dev_to_bdev(dev);
- struct request_queue *q = bdev_get_queue(bdev);
struct disk_stats stat;
unsigned int inflight;
- if (queue_is_mq(q))
- inflight = blk_mq_in_flight(q, bdev);
- else
- inflight = part_in_flight(bdev);
-
+ inflight = part_in_flight(bdev);
if (inflight) {
part_stat_lock();
update_io_ticks(bdev, jiffies, true);
@@ -1256,11 +1251,8 @@ static int diskstats_show(struct seq_file *seqf, void *v)
xa_for_each(&gp->part_tbl, idx, hd) {
if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
continue;
- if (queue_is_mq(gp->queue))
- inflight = blk_mq_in_flight(gp->queue, hd);
- else
- inflight = part_in_flight(hd);
+ inflight = part_in_flight(hd);
if (inflight) {
part_stat_lock();
update_io_ticks(hd, jiffies, true);
--
2.39.2
From: Yu Kuai <[email protected]>
Currently, io_ticks is accounted based on sampling: specifically,
update_io_ticks() always accounts io_ticks by 1 jiffy from
bdev_start_io_acct()/blk_account_io_start(), and the result can be
inaccurate, for example (HZ is 250):
Test script:
fio -filename=/dev/sda -bs=4k -rw=write -direct=1 -name=test -thinktime=4ms
Test result: util is about 90%, while the disk is in fact idle most of
the time.
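As a rough illustration (not part of this patch), a standalone simulation
of the old sampling scheme; the numbers are assumptions: one write issued
every ~4.5ms (4ms thinktime plus service time) and HZ=250, i.e. a 4ms
jiffy.

#include <stdio.h>

int main(void)
{
        const double jiffy_ms = 4.0;            /* HZ=250 */
        const double issue_period_ms = 4.5;     /* ~4ms thinktime + service */
        const double total_ms = 10000.0;        /* simulate 10 seconds */
        long io_ticks = 0;                      /* accounted jiffies */
        long stamp = 0;                         /* last accounted jiffy */
        double t;

        for (t = 0; t < total_ms; t += issue_period_ms) {
                long now = (long)(t / jiffy_ms);

                /*
                 * Old scheme: each IO start charges one full jiffy
                 * whenever jiffies has advanced past the last stamp,
                 * no matter how briefly the disk was actually busy.
                 */
                if (now > stamp) {
                        io_ticks++;
                        stamp = now;
                }
        }

        printf("util ~= %.0f%%\n", 100.0 * io_ticks * jiffy_ms / total_ms);
        return 0;
}

This prints roughly 89%, in line with the ~90% observed above, even though
the device is busy for only a tiny fraction of each period.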
This behaviour was introduced by commit 5b18b5a73760 ("block: delete
part_round_stats and switch to less precise counting"); however, a key
point is easily missed: that commit also improved performance a lot:
Before the commit:
  part_round_stats:
    if (part->stamp != now)
      stats |= 1;
    part_in_flight()
    -> there can be lots of tasks here in 1 jiffy.
    part_round_stats_single()
      __part_stat_add()
      part->stamp = now;

After the commit:
  update_io_ticks:
    stamp = part->bd_stamp;
    if (time_after(now, stamp))
      if (try_cmpxchg())
        __part_stat_add()
        -> only one task can reach here in 1 jiffy.
Hence, in order to account io_ticks precisely, we only need to know
whether there is IO inflight, at most once per jiffy. Note that for
rq-based devices, iterating tags should not be used here because
'tags->lock' is grabbed in blk_mq_find_and_get_req(); hence
part_stat_local_inc/dec() and part_in_flight() are used to track inflight.
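For illustration only, a simplified userspace sketch of this per-cpu
inflight counting pattern; the C11 atomics, the NR_CPUS constant and the
function names below are stand-ins, not the kernel API. Each CPU updates
only its own counter, and the reader sums across CPUs and clamps a
transiently negative total (an IO may start on one CPU and complete on
another) to zero, similar to what part_in_flight() does.

#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS 8

/* stand-in for the per-cpu in_flight[2] counters: [cpu][is_write] */
static _Atomic long in_flight[NR_CPUS][2];

static void io_start(int cpu, int is_write)   /* blk_account_io_start() side */
{
        atomic_fetch_add_explicit(&in_flight[cpu][is_write], 1,
                                  memory_order_relaxed);
}

static void io_done(int cpu, int is_write)    /* completion / merge side */
{
        atomic_fetch_sub_explicit(&in_flight[cpu][is_write], 1,
                                  memory_order_relaxed);
}

static unsigned int total_in_flight(void)     /* part_in_flight() analogue */
{
        long sum = 0;
        int cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
                sum += in_flight[cpu][0] + in_flight[cpu][1];

        return sum > 0 ? (unsigned int)sum : 0;
}

int main(void)
{
        io_start(0, 1);         /* write started on CPU 0 ... */
        io_done(1, 1);          /* ... and completed on CPU 1 */
        io_start(2, 0);         /* a read still inflight on CPU 2 */
        printf("in flight: %u\n", total_in_flight());   /* prints 1 */
        return 0;
}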
The additional overhead is quite small:
- a per-cpu add/dec for each IO on rq-based devices;
- a per-cpu sum once per jiffy;
And it is verified with null-blk that there is no performance degradation
under heavy IO pressure.
Fixes: 5b18b5a73760 ("block: delete part_round_stats and switch to less precise counting")
Signed-off-by: Yu Kuai <[email protected]>
---
block/blk-core.c | 9 +++++----
block/blk-merge.c | 2 ++
block/blk-mq.c | 4 ++++
block/blk.h | 1 +
block/genhd.c | 2 +-
5 files changed, 13 insertions(+), 5 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 47400a4fe851..9ead80d6c6f0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -978,10 +978,11 @@ void update_io_ticks(struct block_device *part, unsigned long now, bool end)
unsigned long stamp;
again:
stamp = READ_ONCE(part->bd_stamp);
- if (unlikely(time_after(now, stamp))) {
- if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now)))
- __part_stat_add(part, io_ticks, end ? now - stamp : 1);
- }
+ if (unlikely(time_after(now, stamp)) &&
+ likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) &&
+ (end || part_in_flight(part)))
+ __part_stat_add(part, io_ticks, now - stamp);
+
if (part->bd_partno) {
part = bdev_whole(part);
goto again;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index f64115d72f3d..8534c35e0497 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -780,6 +780,8 @@ static void blk_account_io_merge_request(struct request *req)
if (blk_do_io_stat(req)) {
part_stat_lock();
part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
+ part_stat_local_dec(req->part,
+ in_flight[op_is_write(req_op(req))]);
part_stat_unlock();
}
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9f677ea85a52..8e01e4b32e10 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -996,6 +996,8 @@ static inline void blk_account_io_done(struct request *req, u64 now)
update_io_ticks(req->part, jiffies, true);
part_stat_inc(req->part, ios[sgrp]);
part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
+ part_stat_local_dec(req->part,
+ in_flight[op_is_write(req_op(req))]);
part_stat_unlock();
}
}
@@ -1018,6 +1020,8 @@ static inline void blk_account_io_start(struct request *req)
part_stat_lock();
update_io_ticks(req->part, jiffies, false);
+ part_stat_local_inc(req->part,
+ in_flight[op_is_write(req_op(req))]);
part_stat_unlock();
}
}
diff --git a/block/blk.h b/block/blk.h
index 0e46c5d30d5a..6e94c10af798 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -366,6 +366,7 @@ static inline bool blk_do_io_stat(struct request *rq)
}
void update_io_ticks(struct block_device *part, unsigned long now, bool end);
+unsigned int part_in_flight(struct block_device *part);
static inline void req_set_nomerge(struct request_queue *q, struct request *req)
{
diff --git a/block/genhd.c b/block/genhd.c
index dec2ee338fb4..8f1163d2d171 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -118,7 +118,7 @@ static void part_stat_read_all(struct block_device *part,
}
}
-static unsigned int part_in_flight(struct block_device *part)
+unsigned int part_in_flight(struct block_device *part)
{
unsigned int inflight = 0;
int cpu;
--
2.39.2
On Thu, 09 May 2024 20:37:15 +0800, Yu Kuai wrote:
> Changes in v2:
> - just replace blk_mq_in_flight() with part_in_flight() for
> diskstats_show() and part_stat_show() in patch 2;
>
> Yu Kuai (2):
> block: support to account io_ticks precisely
> block: fix that util can be greater than 100%
>
> [...]
Applied, thanks!
[1/2] block: support to account io_ticks precisely
commit: 99dc422335d8b2bd4d105797241d3e715bae90e9
[2/2] block: fix that util can be greater than 100%
commit: 7be835694daebbb4adffbc461519081aa0cf28e1
Best regards,
--
Jens Axboe