2022-12-21 04:23:59

by Gulam Mohamed

Subject: [PATCH for-6.2/block V3 1/2] block: Data type conversion for IO accounting

Change the data type of the start and end time IO accounting variables in
the block layer from "unsigned long" to "u64". This is to enable nano-second
granularity, in the next commit, for devices whose latency is less than a
milli-second.

Changes from V2 to V3
=====================
1. Changed the data type of all required variables to u64 as part of this
first patch
2. Created a new setting '2' for iostats in sysfs in the next patch
3. Changed the code to get the ktime values when iostat=2 in the next patch

Signed-off-by: Gulam Mohamed <[email protected]>
---
block/blk-core.c | 24 ++++++++++++------------
block/blk.h | 2 +-
drivers/block/drbd/drbd_int.h | 2 +-
drivers/block/zram/zram_drv.c | 4 ++--
drivers/md/bcache/request.c | 10 +++++-----
drivers/md/dm-core.h | 2 +-
drivers/md/dm.c | 2 +-
drivers/md/md.h | 2 +-
drivers/md/raid1.h | 2 +-
drivers/md/raid10.h | 2 +-
drivers/md/raid5.c | 2 +-
drivers/nvdimm/btt.c | 2 +-
drivers/nvdimm/pmem.c | 2 +-
include/linux/blk_types.h | 2 +-
include/linux/blkdev.h | 12 ++++++------
include/linux/part_stat.h | 2 +-
16 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 8ab21dd01cd1..5670032fe932 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -927,13 +927,13 @@ int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
}
EXPORT_SYMBOL_GPL(iocb_bio_iopoll);

-void update_io_ticks(struct block_device *part, unsigned long now, bool end)
+void update_io_ticks(struct block_device *part, u64 now, bool end)
{
- unsigned long stamp;
+ u64 stamp;
again:
stamp = READ_ONCE(part->bd_stamp);
- if (unlikely(time_after(now, stamp))) {
- if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now)))
+ if (unlikely(time_after64(now, stamp))) {
+ if (likely(try_cmpxchg64(&part->bd_stamp, &stamp, now)))
__part_stat_add(part, io_ticks, end ? now - stamp : 1);
}
if (part->bd_partno) {
@@ -942,9 +942,9 @@ void update_io_ticks(struct block_device *part, unsigned long now, bool end)
}
}

-unsigned long bdev_start_io_acct(struct block_device *bdev,
- unsigned int sectors, enum req_op op,
- unsigned long start_time)
+u64 bdev_start_io_acct(struct block_device *bdev,
+ unsigned int sectors, enum req_op op,
+ u64 start_time)
{
const int sgrp = op_stat_group(op);

@@ -965,7 +965,7 @@ EXPORT_SYMBOL(bdev_start_io_acct);
*
* Returns the start time that should be passed back to bio_end_io_acct().
*/
-unsigned long bio_start_io_acct(struct bio *bio)
+u64 bio_start_io_acct(struct bio *bio)
{
return bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio),
bio_op(bio), jiffies);
@@ -973,11 +973,11 @@ unsigned long bio_start_io_acct(struct bio *bio)
EXPORT_SYMBOL_GPL(bio_start_io_acct);

void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
- unsigned long start_time)
+ u64 start_time)
{
const int sgrp = op_stat_group(op);
- unsigned long now = READ_ONCE(jiffies);
- unsigned long duration = now - start_time;
+ u64 now = READ_ONCE(jiffies);
+ u64 duration = (unsigned long)now -(unsigned long) start_time;

part_stat_lock();
update_io_ticks(bdev, now, true);
@@ -987,7 +987,7 @@ void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
}
EXPORT_SYMBOL(bdev_end_io_acct);

-void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
+void bio_end_io_acct_remapped(struct bio *bio, u64 start_time,
struct block_device *orig_bdev)
{
bdev_end_io_acct(orig_bdev, bio_op(bio), start_time);
diff --git a/block/blk.h b/block/blk.h
index 8900001946c7..8997435ad4a0 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -341,7 +341,7 @@ static inline bool blk_do_io_stat(struct request *rq)
return (rq->rq_flags & RQF_IO_STAT) && !blk_rq_is_passthrough(rq);
}

-void update_io_ticks(struct block_device *part, unsigned long now, bool end);
+void update_io_ticks(struct block_device *part, u64 now, bool end);

static inline void req_set_nomerge(struct request_queue *q, struct request *req)
{
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index ae713338aa46..8e4d3b2eb99d 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -236,7 +236,7 @@ struct drbd_request {
struct list_head req_pending_local;

/* for generic IO accounting */
- unsigned long start_jif;
+ u64 start_jif;

/* for DRBD internal statistics */

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 966aab902d19..da28eb83e6ed 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1581,7 +1581,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
u32 index;
struct bio_vec bvec;
struct bvec_iter iter;
- unsigned long start_time;
+ u64 start_time;

index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
offset = (bio->bi_iter.bi_sector &
@@ -1662,7 +1662,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
u32 index;
struct zram *zram;
struct bio_vec bv;
- unsigned long start_time;
+ u64 start_time;

if (PageTransHuge(page))
return -ENOTSUPP;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 39c7b607f8aa..1f9bd20dcdcf 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -476,7 +476,7 @@ struct search {
unsigned int cache_missed:1;

struct block_device *orig_bdev;
- unsigned long start_time;
+ u64 start_time;

struct btree_op op;
struct data_insert_op iop;
@@ -714,7 +714,7 @@ static void search_free(struct closure *cl)

static inline struct search *search_alloc(struct bio *bio,
struct bcache_device *d, struct block_device *orig_bdev,
- unsigned long start_time)
+ u64 start_time)
{
struct search *s;

@@ -1065,7 +1065,7 @@ static void cached_dev_nodata(struct closure *cl)

struct detached_dev_io_private {
struct bcache_device *d;
- unsigned long start_time;
+ u64 start_time;
bio_end_io_t *bi_end_io;
void *bi_private;
struct block_device *orig_bdev;
@@ -1094,7 +1094,7 @@ static void detached_dev_end_io(struct bio *bio)
}

static void detached_dev_do_request(struct bcache_device *d, struct bio *bio,
- struct block_device *orig_bdev, unsigned long start_time)
+ struct block_device *orig_bdev, u64 start_time)
{
struct detached_dev_io_private *ddip;
struct cached_dev *dc = container_of(d, struct cached_dev, disk);
@@ -1173,7 +1173,7 @@ void cached_dev_submit_bio(struct bio *bio)
struct block_device *orig_bdev = bio->bi_bdev;
struct bcache_device *d = orig_bdev->bd_disk->private_data;
struct cached_dev *dc = container_of(d, struct cached_dev, disk);
- unsigned long start_time;
+ u64 start_time;
int rw = bio_data_dir(bio);

if (unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) ||
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 6c6bd24774f2..e620fd878b08 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -284,7 +284,7 @@ struct dm_io {
unsigned short magic;
blk_short_t flags;
spinlock_t lock;
- unsigned long start_time;
+ u64 start_time;
void *data;
struct dm_io *next;
struct dm_stats_aux stats_aux;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index e1ea3a7bd9d9..011a85ea40da 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -494,7 +494,7 @@ static bool bio_is_flush_with_data(struct bio *bio)
static void dm_io_acct(struct dm_io *io, bool end)
{
struct dm_stats_aux *stats_aux = &io->stats_aux;
- unsigned long start_time = io->start_time;
+ u64 start_time = io->start_time;
struct mapped_device *md = io->md;
struct bio *bio = io->orig_bio;
unsigned int sectors;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 554a9026669a..df73c1d1d960 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -711,7 +711,7 @@ struct md_thread {

struct md_io_acct {
struct bio *orig_bio;
- unsigned long start_time;
+ u64 start_time;
struct bio bio_clone;
};

diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index ebb6788820e7..0fb5a1148745 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -157,7 +157,7 @@ struct r1bio {
sector_t sector;
int sectors;
unsigned long state;
- unsigned long start_time;
+ u64 start_time;
struct mddev *mddev;
/*
* original bio going to /dev/mdx
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 8c072ce0bc54..4cf3eec89bf3 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -123,7 +123,7 @@ struct r10bio {
sector_t sector; /* virtual sector number */
int sectors;
unsigned long state;
- unsigned long start_time;
+ u64 start_time;
struct mddev *mddev;
/*
* original bio going to /dev/mdx
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7b820b81d8c2..8f4364f4bda0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5474,7 +5474,7 @@ static void raid5_align_endio(struct bio *bi)
struct r5conf *conf;
struct md_rdev *rdev;
blk_status_t error = bi->bi_status;
- unsigned long start_time = md_io_acct->start_time;
+ u64 start_time = md_io_acct->start_time;

bio_put(bi);

diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 0297b7882e33..8fc1d5da747c 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1442,7 +1442,7 @@ static void btt_submit_bio(struct bio *bio)
struct bio_integrity_payload *bip = bio_integrity(bio);
struct btt *btt = bio->bi_bdev->bd_disk->private_data;
struct bvec_iter iter;
- unsigned long start;
+ u64 start;
struct bio_vec bvec;
int err = 0;
bool do_acct;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 96e6e9a5f235..b5b7a709e1ab 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -202,7 +202,7 @@ static void pmem_submit_bio(struct bio *bio)
int ret = 0;
blk_status_t rc = 0;
bool do_acct;
- unsigned long start;
+ u64 start;
struct bio_vec bvec;
struct bvec_iter iter;
struct pmem_device *pmem = bio->bi_bdev->bd_disk->private_data;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 99be590f952f..d3386ac3b470 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -41,7 +41,7 @@ struct block_device {
sector_t bd_start_sect;
sector_t bd_nr_sectors;
struct disk_stats __percpu *bd_stats;
- unsigned long bd_stamp;
+ u64 bd_stamp;
bool bd_read_only; /* read-only policy */
dev_t bd_dev;
atomic_t bd_openers;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2db2ad72af0f..ca94d690d292 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1433,14 +1433,14 @@ static inline void blk_wake_io_task(struct task_struct *waiter)
wake_up_process(waiter);
}

-unsigned long bdev_start_io_acct(struct block_device *bdev,
+u64 bdev_start_io_acct(struct block_device *bdev,
unsigned int sectors, enum req_op op,
- unsigned long start_time);
+ u64 start_time);
void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
- unsigned long start_time);
+ u64 start_time);

-unsigned long bio_start_io_acct(struct bio *bio);
-void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
+u64 bio_start_io_acct(struct bio *bio);
+void bio_end_io_acct_remapped(struct bio *bio, u64 start_time,
struct block_device *orig_bdev);

/**
@@ -1448,7 +1448,7 @@ void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
* @bio: bio to end account for
* @start_time: start time returned by bio_start_io_acct()
*/
-static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time)
+static inline void bio_end_io_acct(struct bio *bio, u64 start_time)
{
return bio_end_io_acct_remapped(bio, start_time, bio->bi_bdev);
}
diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
index abeba356bc3f..85c50235693c 100644
--- a/include/linux/part_stat.h
+++ b/include/linux/part_stat.h
@@ -10,7 +10,7 @@ struct disk_stats {
unsigned long sectors[NR_STAT_GROUPS];
unsigned long ios[NR_STAT_GROUPS];
unsigned long merges[NR_STAT_GROUPS];
- unsigned long io_ticks;
+ u64 io_ticks;
local_t in_flight[2];
};

--
2.31.1


2022-12-21 05:18:54

by Gulam Mohamed

Subject: [PATCH for-6.2/block V3 2/2] block: Change the granularity of io ticks from ms to ns

Problem Description
===================
The "iostat" user-space utility was showing %util as 100% for the disks
which has latencies less than a milli-second i.e for latencies in the
range of micro-seconds and below.

Root Cause
==========
The IO accounting in the block layer is currently done by updating
io_ticks in jiffies, which has milli-second granularity. Due to this,
for devices with IO latencies of less than a milli-second, the latency
is accounted as 1 milli-second even though it is in the range of
micro-seconds. This causes the iostat command to incorrectly show %util
as 100%.
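
For illustration (numbers assumed, not taken from the measurements
below): with HZ=1000, iodepth=1 and a ~1ms think time between ~50us IOs,
roughly one IO starts in every jiffy, so update_io_ticks() adds one tick
per jiffy. Over a 1-second interval io_ticks therefore advances by about
1000ms while the device was actually busy for only about 50ms, and
iostat computes %util = 1000/1000 = 100% instead of roughly 5%.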

Recreating the issue
====================
Setup
-----
Devices: NVMe 24 devices
Model number: 4610 (Intel)

fio
---
[global]
bs=4K
iodepth=1
direct=1
ioengine=libaio
group_reporting
time_based
runtime=100
thinktime=1ms
numjobs=1
name=raw-write
rw=randrw
ignore_error=EIO:EIO
[job1]
filename=/dev/nvme0n1

iostat o/p
----------

Device %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
nvme3n1 0.00 0.05 0.00 75.38 0.50 0.00 0.00 100.00

Device %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
nvme3n1 0.00 0.05 0.00 74.74 0.50 0.00 0.00 100.00

Solution
========
Use ktime_get_ns() to update the disk_stats io_ticks so that io_ticks
is updated at every IO start and end time.

Issues using ktime
==================

Using ktime_get_ns() has a performance overhead, as it reads the clock
every time it is called. The following test environment was used by
Jens Axboe to run t/io_uring, and it showed a high performance drop.

Devices
-------
SSDs: P5800X
No of devices: 24

io_uring config
---------------
Polled IO
iostats: Enabled
Reads: Random
Block Size: 512
QDepth: 128
Batch Submit: 32
Batch Complete: 32
No of Threads: 24

With the above environment and the ktime patch, there was a performance
drop of ~25% compared to iostats disabled and ~19% compared to the
current iostats-enabled path. This performance drop is too high.

Suggestion from Jens Axboe
==========================
Jens Axboe suggested splitting the bigger patch into two, as follows:

1. In the first patch, change all the types from unsigned long to u64; that
can be done while retaining jiffies.

2. In the second patch, add an iostats == 2 setting, which enables the higher
resolution mode using ktime. We'd still default to 1, i.e. lower granularity
iostats enabled.

Fix details
===========
1. Use ktime_get_ns() to get the current time in nano-seconds and use it to
update the io_ticks for the start and end time stamps in the block layer
for io accounting (see the usage sketch after this list)

2. Create a new setting '2' in sysfs for the iostats variable, i.e. for
/sys/block/<device-name>/queue/iostats, to enable io accounting with
nano-second (ktime) granularity. This setting should be enabled only if
iostats are needed in high resolution mode, as it comes with a high
performance cost

3. The previously available settings, 0 and 1, remain for disabling and
enabling io accounting with milli-second (jiffies) granularity
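
Usage sketch (illustrative, not part of the patch): a minimal example of
how a bio-based driver would consume the accounting helpers after this
change, modeled on the zram_rw_page() hunk below. Here bdev and bio
stand for the driver's struct block_device and struct bio; the start
time is obtained in whatever unit the queue is configured for
(nano-seconds when iostat=2, jiffies otherwise) and is passed back
unchanged on completion:

        struct request_queue *q = bdev_get_queue(bdev);
        u64 start_time;

        /* start accounting in the queue's configured time unit */
        start_time = bdev_start_io_acct(bdev, bio_sectors(bio), bio_op(bio),
                                        blk_get_iostat_ticks(q));

        /* ... submit and complete the I/O ... */

        /* end accounting with the same start time value */
        bdev_end_io_acct(bdev, bio_op(bio), start_time);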

Testing
=======
Ran the t/io_uring command with the following setup:

Devices
-------
SSDs: P4610
No of devices: 8

io_uring config
---------------
Polled IO
iostats: Enabled
Reads: Random
Block Size: 512
QDepth: 128
Batch Submit: 32
Batch Complete: 32
No of Threads: 24

io_uring o/p
------------
iostat=0, with patch: Maximum IOPS=10.09M
iostat=1, with patch: Maximum IOPS=9.84M
iostat=2, with patch: Maximum IOPS=9.48M

Changes from V2 to V3
=====================
1. Changed the data type of all required variables to u64 as a first patch
2. Created a new setting '2' for iostats in sysfs in this patch
3. Changed the code to get the ktime values when iostat=2, in this patch

Signed-off-by: Gulam Mohamed <[email protected]>
---
block/blk-core.c | 26 +++++++++++++++++----
block/blk-mq.c | 4 ++--
block/blk-sysfs.c | 39 ++++++++++++++++++++++++++++++-
block/genhd.c | 18 ++++++++++----
drivers/block/drbd/drbd_debugfs.c | 12 ++++++++--
drivers/block/zram/zram_drv.c | 3 ++-
drivers/md/dm.c | 13 +++++++++--
include/linux/blkdev.h | 4 ++++
8 files changed, 103 insertions(+), 16 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 5670032fe932..0b5e4eb909a5 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -927,6 +927,18 @@ int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
}
EXPORT_SYMBOL_GPL(iocb_bio_iopoll);

+/*
+ * Get the time based upon the available granularity for io accounting
+ * If the resolution mode is set to precise (2) i.e
+ * (/sys/block/<device>/queue/iostats = 2), then this will return time
+ * in nano-seconds else this will return time in jiffies
+ */
+u64 blk_get_iostat_ticks(struct request_queue *q)
+{
+ return (blk_queue_precise_io_stat(q) ? ktime_get_ns() : jiffies);
+}
+EXPORT_SYMBOL_GPL(blk_get_iostat_ticks);
+
void update_io_ticks(struct block_device *part, u64 now, bool end)
{
u64 stamp;
@@ -968,20 +980,26 @@ EXPORT_SYMBOL(bdev_start_io_acct);
u64 bio_start_io_acct(struct bio *bio)
{
return bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio),
- bio_op(bio), jiffies);
+ bio_op(bio),
+ blk_get_iostat_ticks(bio->bi_bdev->bd_queue));
}
EXPORT_SYMBOL_GPL(bio_start_io_acct);

void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
u64 start_time)
{
+ u64 now;
+ u64 duration;
+ struct request_queue *q = bdev_get_queue(bdev);
const int sgrp = op_stat_group(op);
- u64 now = READ_ONCE(jiffies);
- u64 duration = (unsigned long)now -(unsigned long) start_time;
+ now = blk_get_iostat_ticks(q);;
+ duration = (unsigned long)now -(unsigned long) start_time;

part_stat_lock();
update_io_ticks(bdev, now, true);
- part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration));
+ part_stat_add(bdev, nsecs[sgrp],
+ (blk_queue_precise_io_stat(q) ? duration :
+ jiffies_to_nsecs(duration)));
part_stat_local_dec(bdev, in_flight[op_is_write(op)]);
part_stat_unlock();
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4e6b3ccd4989..5318bf87f17e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -975,7 +975,7 @@ static void __blk_account_io_done(struct request *req, u64 now)
const int sgrp = op_stat_group(req_op(req));

part_stat_lock();
- update_io_ticks(req->part, jiffies, true);
+ update_io_ticks(req->part, blk_get_iostat_ticks(req->q), true);
part_stat_inc(req->part, ios[sgrp]);
part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
part_stat_unlock();
@@ -1007,7 +1007,7 @@ static void __blk_account_io_start(struct request *rq)
rq->part = rq->q->disk->part0;

part_stat_lock();
- update_io_ticks(rq->part, jiffies, false);
+ update_io_ticks(rq->part, blk_get_iostat_ticks(rq->q), false);
part_stat_unlock();
}

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 93d9e9c9a6ea..e7959782ce59 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -307,7 +307,6 @@ queue_##name##_store(struct request_queue *q, const char *page, size_t count) \

QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1);
QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0);
-QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0);
QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0);
#undef QUEUE_SYSFS_BIT_FNS

@@ -363,6 +362,44 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
return ret;
}

+static ssize_t queue_iostats_show(struct request_queue *q, char *page)
+{
+ bool iostat = blk_queue_io_stat(q);
+ bool precise_iostat = blk_queue_precise_io_stat(q);
+
+ return queue_var_show(iostat << precise_iostat, page);
+}
+
+static ssize_t queue_iostats_store(struct request_queue *q, const char *page,
+ size_t count)
+{
+ unsigned long val;
+ ssize_t ret = -EINVAL;
+
+ ret = queue_var_store(&val, page, count);
+
+ if (ret < 0)
+ return ret;
+
+ if (val == 2) {
+ blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q);
+ blk_queue_flag_set(QUEUE_FLAG_PRECISE_IO_STAT, q);
+ q->disk->part0->bd_stamp = 0;
+ }
+ else if (val == 1) {
+ blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q);
+ blk_queue_flag_clear(QUEUE_FLAG_PRECISE_IO_STAT, q);
+ q->disk->part0->bd_stamp = 0;
+ }
+ else if (val == 0) {
+ blk_queue_flag_clear(QUEUE_FLAG_IO_STAT, q);
+ blk_queue_flag_clear(QUEUE_FLAG_PRECISE_IO_STAT, q);
+ q->disk->part0->bd_stamp = 0;
+ }
+
+ return ret;
+}
+
static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
{
bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags);
diff --git a/block/genhd.c b/block/genhd.c
index 03a96d6473e1..d034219a4683 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -951,6 +951,7 @@ ssize_t part_stat_show(struct device *dev,
struct request_queue *q = bdev_get_queue(bdev);
struct disk_stats stat;
unsigned int inflight;
+ u64 stat_ioticks;

if (queue_is_mq(q))
inflight = blk_mq_in_flight(q, bdev);
@@ -959,10 +960,13 @@ ssize_t part_stat_show(struct device *dev,

if (inflight) {
part_stat_lock();
- update_io_ticks(bdev, jiffies, true);
+ update_io_ticks(bdev, blk_get_iostat_ticks(q), true);
part_stat_unlock();
}
part_stat_read_all(bdev, &stat);
+ stat_ioticks = (blk_queue_precise_io_stat(q) ?
+ div_u64(stat.io_ticks, NSEC_PER_MSEC) :
+ jiffies_to_msecs(stat.io_ticks));
return sprintf(buf,
"%8lu %8lu %8llu %8u "
"%8lu %8lu %8llu %8u "
@@ -979,7 +983,7 @@ ssize_t part_stat_show(struct device *dev,
(unsigned long long)stat.sectors[STAT_WRITE],
(unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC),
inflight,
- jiffies_to_msecs(stat.io_ticks),
+ (unsigned int)stat_ioticks,
(unsigned int)div_u64(stat.nsecs[STAT_READ] +
stat.nsecs[STAT_WRITE] +
stat.nsecs[STAT_DISCARD] +
@@ -1217,6 +1221,8 @@ static int diskstats_show(struct seq_file *seqf, void *v)
unsigned int inflight;
struct disk_stats stat;
unsigned long idx;
+ struct request_queue *q;
+ u64 stat_ioticks;

/*
if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
@@ -1235,12 +1241,16 @@ static int diskstats_show(struct seq_file *seqf, void *v)
else
inflight = part_in_flight(hd);

+ q = bdev_get_queue(hd);
if (inflight) {
part_stat_lock();
- update_io_ticks(hd, jiffies, true);
+ update_io_ticks(hd, blk_get_iostat_ticks(q), true);
part_stat_unlock();
}
part_stat_read_all(hd, &stat);
+ stat_ioticks = (blk_queue_precise_io_stat(q) ?
+ div_u64(stat.io_ticks, NSEC_PER_MSEC) :
+ jiffies_to_msecs(stat.io_ticks));
seq_printf(seqf, "%4d %7d %pg "
"%lu %lu %lu %u "
"%lu %lu %lu %u "
@@ -1260,7 +1270,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
(unsigned int)div_u64(stat.nsecs[STAT_WRITE],
NSEC_PER_MSEC),
inflight,
- jiffies_to_msecs(stat.io_ticks),
+ (unsigned int)stat_ioticks,
(unsigned int)div_u64(stat.nsecs[STAT_READ] +
stat.nsecs[STAT_WRITE] +
stat.nsecs[STAT_DISCARD] +
diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index a72c096aa5b1..af193bcc4f3a 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -97,6 +97,12 @@ static void seq_print_one_request(struct seq_file *m, struct drbd_request *req,
{
/* change anything here, fixup header below! */
unsigned int s = req->rq_state;
+ unsigned long start_time;
+ struct request_queue *q = req->device->rq_queue;
+
+ start_time = (blk_queue_precise_io_stat(q) ?
+ nsecs_to_jiffies(req->start_jif) :
+ req->start_jif);

#define RQ_HDR_1 "epoch\tsector\tsize\trw"
seq_printf(m, "0x%x\t%llu\t%u\t%s",
@@ -105,7 +111,7 @@ static void seq_print_one_request(struct seq_file *m, struct drbd_request *req,
(s & RQ_WRITE) ? "W" : "R");

#define RQ_HDR_2 "\tstart\tin AL\tsubmit"
- seq_printf(m, "\t%d", jiffies_to_msecs(now - req->start_jif));
+ seq_printf(m, "\t%d", jiffies_to_msecs(now - start_time));
seq_print_age_or_dash(m, s & RQ_IN_ACT_LOG, now - req->in_actlog_jif);
seq_print_age_or_dash(m, s & RQ_LOCAL_PENDING, now - req->pre_submit_jif);

@@ -171,7 +177,9 @@ static void seq_print_waiting_for_AL(struct seq_file *m, struct drbd_resource *r
/* if the oldest request does not wait for the activity log
* it is not interesting for us here */
if (req && !(req->rq_state & RQ_IN_ACT_LOG))
- jif = req->start_jif;
+ jif = (blk_queue_precise_io_stat(device->rq_queue) ?
+ nsecs_to_jiffies(req->start_jif):
+ req->start_jif);
else
req = NULL;
spin_unlock_irq(&device->resource->req_lock);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index da28eb83e6ed..8e22487de7de 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1663,6 +1663,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
struct zram *zram;
struct bio_vec bv;
u64 start_time;
+ struct request_queue *q = bdev_get_queue(bdev);

if (PageTransHuge(page))
return -ENOTSUPP;
@@ -1682,7 +1683,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
bv.bv_offset = 0;

start_time = bdev_start_io_acct(bdev->bd_disk->part0,
- SECTORS_PER_PAGE, op, jiffies);
+ SECTORS_PER_PAGE, op, blk_get_iostat_ticks(q));
ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL);
bdev_end_io_acct(bdev->bd_disk->part0, op, start_time);
out:
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 011a85ea40da..1bb58a0b8cd1 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -482,7 +482,11 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,

u64 dm_start_time_ns_from_clone(struct bio *bio)
{
- return jiffies_to_nsecs(clone_to_tio(bio)->io->start_time);
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ u64 start_time_ns = (blk_queue_precise_io_stat(q) ?
+ clone_to_tio(bio)->io->start_time :
+ jiffies_to_nsecs(clone_to_tio(bio)->io->start_time));
+ return start_time_ns;
}
EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);

@@ -498,6 +502,11 @@ static void dm_io_acct(struct dm_io *io, bool end)
struct mapped_device *md = io->md;
struct bio *bio = io->orig_bio;
unsigned int sectors;
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+
+ u64 start_time = (blk_queue_precise_io_stat(q) ?
+ nsecs_to_jiffies(io->start_time) :
+ io->start_time);

/*
* If REQ_PREFLUSH set, don't account payload, it will be
@@ -589,7 +598,7 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
io->orig_bio = bio;
io->md = md;
spin_lock_init(&io->lock);
- io->start_time = jiffies;
+ io->start_time = blk_get_iostat_ticks(bio->bi_bdev->bd_queue);
io->flags = 0;

if (static_branch_unlikely(&stats_enabled))
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ca94d690d292..0543a536c6e5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -570,6 +570,7 @@ struct request_queue {
#define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */
#define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */
#define QUEUE_FLAG_SKIP_TAGSET_QUIESCE 31 /* quiesce_tagset skip the queue*/
+#define QUEUE_FLAG_PRECISE_IO_STAT 32 /* To enable precise io accounting */

#define QUEUE_FLAG_MQ_DEFAULT ((1UL << QUEUE_FLAG_IO_STAT) | \
(1UL << QUEUE_FLAG_SAME_COMP) | \
@@ -578,6 +579,7 @@ struct request_queue {
void blk_queue_flag_set(unsigned int flag, struct request_queue *q);
void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
+u64 blk_get_iostat_ticks(struct request_queue *q);

#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
#define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
@@ -589,6 +591,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
#define blk_queue_stable_writes(q) \
test_bit(QUEUE_FLAG_STABLE_WRITES, &(q)->queue_flags)
#define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
+#define blk_queue_precise_io_stat(q) \
+ test_bit(QUEUE_FLAG_PRECISE_IO_STAT, &(q)->queue_flags)
#define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
#define blk_queue_zone_resetall(q) \
test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags)
--
2.31.1

2022-12-21 10:24:22

by kernel test robot

Subject: Re: [PATCH for-6.2/block V3 1/2] block: Data type conversion for IO accounting

Hi Gulam,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on axboe-block/for-next]
[also build test ERROR on song-md/md-next linus/master next-20221220]
[cannot apply to device-mapper-dm/for-next v6.1]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Gulam-Mohamed/block-Data-type-conversion-for-IO-accounting/20221221-121052
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link: https://lore.kernel.org/r/20221221040506.1174644-1-gulam.mohamed%40oracle.com
patch subject: [PATCH for-6.2/block V3 1/2] block: Data type conversion for IO accounting
config: riscv-randconfig-r042-20221220
compiler: riscv32-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/254be08cd76a258e2d0489fac716308b416dfe6b
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Gulam-Mohamed/block-Data-type-conversion-for-IO-accounting/20221221-121052
git checkout 254be08cd76a258e2d0489fac716308b416dfe6b
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=riscv olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=riscv SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>

All errors (new ones prefixed by >>):

block/blk-core.c: Assembler messages:
>> block/blk-core.c:936: Error: unrecognized opcode `lr.d s2,0(a3)'
>> block/blk-core.c:938: Error: unrecognized opcode `sc.d.rl a1,a6,0(a3)'


vim +936 block/blk-core.c

929
930 void update_io_ticks(struct block_device *part, u64 now, bool end)
931 {
932 u64 stamp;
933 again:
934 stamp = READ_ONCE(part->bd_stamp);
935 if (unlikely(time_after64(now, stamp))) {
> 936 if (likely(try_cmpxchg64(&part->bd_stamp, &stamp, now)))
937 __part_stat_add(part, io_ticks, end ? now - stamp : 1);
> 938 }
939 if (part->bd_partno) {
940 part = bdev_whole(part);
941 goto again;
942 }
943 }
944

--
0-DAY CI Kernel Test Service
https://01.org/lkp



2022-12-21 16:32:07

by kernel test robot

Subject: Re: [PATCH for-6.2/block V3 2/2] block: Change the granularity of io ticks from ms to ns

Hi Gulam,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on axboe-block/for-next]
[also build test ERROR on song-md/md-next linus/master next-20221220]
[cannot apply to device-mapper-dm/for-next v6.1]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Gulam-Mohamed/block-Data-type-conversion-for-IO-accounting/20221221-121052
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link: https://lore.kernel.org/r/20221221040506.1174644-2-gulam.mohamed%40oracle.com
patch subject: [PATCH for-6.2/block V3 2/2] block: Change the granularity of io ticks from ms to ns
config: x86_64-rhel-8.3-func
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
reproduce (this is a W=1 build):
# https://github.com/intel-lab-lkp/linux/commit/afff703cb450ec3806aa1a27406d21fc4eaa1f43
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Gulam-Mohamed/block-Data-type-conversion-for-IO-accounting/20221221-121052
git checkout afff703cb450ec3806aa1a27406d21fc4eaa1f43
# save the config file
mkdir build_dir && cp config build_dir/.config
make W=1 O=build_dir ARCH=x86_64 olddefconfig
make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash drivers/md/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>

All errors (new ones prefixed by >>):

drivers/md/dm.c: In function 'dm_io_acct':
>> drivers/md/dm.c:507:13: error: redefinition of 'start_time'
507 | u64 start_time = (blk_queue_precise_io_stat(q) ?
| ^~~~~~~~~~
drivers/md/dm.c:501:13: note: previous definition of 'start_time' with type 'u64' {aka 'long long unsigned int'}
501 | u64 start_time = io->start_time;
| ^~~~~~~~~~


vim +/start_time +507 drivers/md/dm.c

497
498 static void dm_io_acct(struct dm_io *io, bool end)
499 {
500 struct dm_stats_aux *stats_aux = &io->stats_aux;
501 u64 start_time = io->start_time;
502 struct mapped_device *md = io->md;
503 struct bio *bio = io->orig_bio;
504 unsigned int sectors;
505 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
506
> 507 u64 start_time = (blk_queue_precise_io_stat(q) ?
508 nsecs_to_jiffies(io->start_time) :
509 io->start_time);
510
511 /*
512 * If REQ_PREFLUSH set, don't account payload, it will be
513 * submitted (and accounted) after this flush completes.
514 */
515 if (bio_is_flush_with_data(bio))
516 sectors = 0;
517 else if (likely(!(dm_io_flagged(io, DM_IO_WAS_SPLIT))))
518 sectors = bio_sectors(bio);
519 else
520 sectors = io->sectors;
521
522 if (!end)
523 bdev_start_io_acct(bio->bi_bdev, sectors, bio_op(bio),
524 start_time);
525 else
526 bdev_end_io_acct(bio->bi_bdev, bio_op(bio), start_time);
527
528 if (static_branch_unlikely(&stats_enabled) &&
529 unlikely(dm_stats_used(&md->stats))) {
530 sector_t sector;
531
532 if (likely(!dm_io_flagged(io, DM_IO_WAS_SPLIT)))
533 sector = bio->bi_iter.bi_sector;
534 else
535 sector = bio_end_sector(bio) - io->sector_offset;
536
537 dm_stats_account_io(&md->stats, bio_data_dir(bio),
538 sector, sectors,
539 end, start_time, stats_aux);
540 }
541 }
542

--
0-DAY CI Kernel Test Service
https://01.org/lkp



2022-12-21 16:55:58

by kernel test robot

Subject: Re: [PATCH for-6.2/block V3 1/2] block: Data type conversion for IO accounting

Hi Gulam,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on axboe-block/for-next]
[also build test ERROR on song-md/md-next linus/master next-20221220]
[cannot apply to device-mapper-dm/for-next v6.1]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Gulam-Mohamed/block-Data-type-conversion-for-IO-accounting/20221221-121052
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link: https://lore.kernel.org/r/20221221040506.1174644-1-gulam.mohamed%40oracle.com
patch subject: [PATCH for-6.2/block V3 1/2] block: Data type conversion for IO accounting
config: mips-maltaaprp_defconfig
compiler: clang version 16.0.0 (https://github.com/llvm/llvm-project 98b13979fb05f3ed288a900deb843e7b27589e58)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# install mips cross compiling tool for clang build
# apt-get install binutils-mipsel-linux-gnu
# https://github.com/intel-lab-lkp/linux/commit/254be08cd76a258e2d0489fac716308b416dfe6b
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Gulam-Mohamed/block-Data-type-conversion-for-IO-accounting/20221221-121052
git checkout 254be08cd76a258e2d0489fac716308b416dfe6b
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=mips olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=mips SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>

All errors (new ones prefixed by >>):

>> block/blk-core.c:936:14: error: call to '__cmpxchg64_unsupported' declared with 'error' attribute: cmpxchg64 not available; cpu_has_64bits may be false
if (likely(try_cmpxchg64(&part->bd_stamp, &stamp, now)))
^
include/linux/atomic/atomic-instrumented.h:2016:2: note: expanded from macro 'try_cmpxchg64'
arch_try_cmpxchg64(__ai_ptr, __ai_oldp, __VA_ARGS__); \
^
include/linux/atomic/atomic-arch-fallback.h:161:9: note: expanded from macro 'arch_try_cmpxchg64'
___r = arch_cmpxchg64((_ptr), ___o, (_new)); \
^
arch/mips/include/asm/cmpxchg.h:315:11: note: expanded from macro 'arch_cmpxchg64'
__res = __cmpxchg64_unsupported(); \
^
>> arch/mips/include/asm/cmpxchg.h:256:3: error: instruction requires a CPU feature not currently enabled
" dsra %M0, %L0, 32 \n"
^
<inline asm>:6:2: note: instantiated into assembly here
dsra $4, $2, 32
^
In file included from block/blk-core.c:16:
In file included from include/linux/module.h:13:
In file included from include/linux/stat.h:19:
In file included from include/linux/time.h:60:
In file included from include/linux/time32.h:13:
In file included from include/linux/timex.h:67:
In file included from arch/mips/include/asm/timex.h:19:
In file included from arch/mips/include/asm/cpu-type.h:12:
In file included from include/linux/smp.h:13:
In file included from include/linux/cpumask.h:13:
In file included from include/linux/atomic.h:7:
In file included from arch/mips/include/asm/atomic.h:23:
arch/mips/include/asm/cmpxchg.h:270:3: error: instruction requires a CPU feature not currently enabled
" dins %L1, %M5, 32, 32 \n"
^
<inline asm>:11:2: note: instantiated into assembly here
dins $5, $16, 32, 32
^
>> block/blk-core.c:936:14: error: call to '__cmpxchg64_unsupported' declared with 'error' attribute: cmpxchg64 not available; cpu_has_64bits may be false
if (likely(try_cmpxchg64(&part->bd_stamp, &stamp, now)))
^
include/linux/atomic/atomic-instrumented.h:2016:2: note: expanded from macro 'try_cmpxchg64'
arch_try_cmpxchg64(__ai_ptr, __ai_oldp, __VA_ARGS__); \
^
include/linux/atomic/atomic-arch-fallback.h:161:9: note: expanded from macro 'arch_try_cmpxchg64'
___r = arch_cmpxchg64((_ptr), ___o, (_new)); \
^
arch/mips/include/asm/cmpxchg.h:315:11: note: expanded from macro 'arch_cmpxchg64'
__res = __cmpxchg64_unsupported(); \
^
>> arch/mips/include/asm/cmpxchg.h:256:3: error: instruction requires a CPU feature not currently enabled
" dsra %M0, %L0, 32 \n"
^
<inline asm>:6:2: note: instantiated into assembly here
dsra $4, $2, 32
^
In file included from block/blk-core.c:16:
In file included from include/linux/module.h:13:
In file included from include/linux/stat.h:19:
In file included from include/linux/time.h:60:
In file included from include/linux/time32.h:13:
In file included from include/linux/timex.h:67:
In file included from arch/mips/include/asm/timex.h:19:
In file included from arch/mips/include/asm/cpu-type.h:12:
In file included from include/linux/smp.h:13:
In file included from include/linux/cpumask.h:13:
In file included from include/linux/atomic.h:7:
In file included from arch/mips/include/asm/atomic.h:23:
arch/mips/include/asm/cmpxchg.h:270:3: error: instruction requires a CPU feature not currently enabled
" dins %L1, %M5, 32, 32 \n"
^
<inline asm>:11:2: note: instantiated into assembly here
dins $5, $16, 32, 32
^
>> block/blk-core.c:936:14: error: call to '__cmpxchg64_unsupported' declared with 'error' attribute: cmpxchg64 not available; cpu_has_64bits may be false
if (likely(try_cmpxchg64(&part->bd_stamp, &stamp, now)))
^
include/linux/atomic/atomic-instrumented.h:2016:2: note: expanded from macro 'try_cmpxchg64'
arch_try_cmpxchg64(__ai_ptr, __ai_oldp, __VA_ARGS__); \
^
include/linux/atomic/atomic-arch-fallback.h:161:9: note: expanded from macro 'arch_try_cmpxchg64'
___r = arch_cmpxchg64((_ptr), ___o, (_new)); \
^
arch/mips/include/asm/cmpxchg.h:315:11: note: expanded from macro 'arch_cmpxchg64'
__res = __cmpxchg64_unsupported(); \
^
>> arch/mips/include/asm/cmpxchg.h:256:3: error: instruction requires a CPU feature not currently enabled
" dsra %M0, %L0, 32 \n"
^
<inline asm>:6:2: note: instantiated into assembly here
dsra $4, $2, 32
^
In file included from block/blk-core.c:16:
In file included from include/linux/module.h:13:
In file included from include/linux/stat.h:19:
In file included from include/linux/time.h:60:
In file included from include/linux/time32.h:13:
In file included from include/linux/timex.h:67:
In file included from arch/mips/include/asm/timex.h:19:
In file included from arch/mips/include/asm/cpu-type.h:12:
In file included from include/linux/smp.h:13:
In file included from include/linux/cpumask.h:13:
In file included from include/linux/atomic.h:7:
In file included from arch/mips/include/asm/atomic.h:23:
arch/mips/include/asm/cmpxchg.h:270:3: error: instruction requires a CPU feature not currently enabled
" dins %L1, %M5, 32, 32 \n"
^
<inline asm>:11:2: note: instantiated into assembly here
dins $5, $8, 32, 32
^
9 errors generated.


vim +936 block/blk-core.c

929
930 void update_io_ticks(struct block_device *part, u64 now, bool end)
931 {
932 u64 stamp;
933 again:
934 stamp = READ_ONCE(part->bd_stamp);
935 if (unlikely(time_after64(now, stamp))) {
> 936 if (likely(try_cmpxchg64(&part->bd_stamp, &stamp, now)))
937 __part_stat_add(part, io_ticks, end ? now - stamp : 1);
938 }
939 if (part->bd_partno) {
940 part = bdev_whole(part);
941 goto again;
942 }
943 }
944

--
0-DAY CI Kernel Test Service
https://01.org/lkp



2022-12-21 17:20:17

by Keith Busch

Subject: Re: [PATCH for-6.2/block V3 2/2] block: Change the granularity of io ticks from ms to ns

On Wed, Dec 21, 2022 at 04:05:06AM +0000, Gulam Mohamed wrote:
> +u64 blk_get_iostat_ticks(struct request_queue *q)
> +{
> + return (blk_queue_precise_io_stat(q) ? ktime_get_ns() : jiffies);
> +}
> +EXPORT_SYMBOL_GPL(blk_get_iostat_ticks);
> +
> void update_io_ticks(struct block_device *part, u64 now, bool end)
> {
> u64 stamp;
> @@ -968,20 +980,26 @@ EXPORT_SYMBOL(bdev_start_io_acct);
> u64 bio_start_io_acct(struct bio *bio)
> {
> return bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio),
> - bio_op(bio), jiffies);
> + bio_op(bio),
> + blk_get_iostat_ticks(bio->bi_bdev->bd_queue));
> }
> EXPORT_SYMBOL_GPL(bio_start_io_acct);
>
> void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
> u64 start_time)
> {
> + u64 now;
> + u64 duration;
> + struct request_queue *q = bdev_get_queue(bdev);
> const int sgrp = op_stat_group(op);
> - u64 now = READ_ONCE(jiffies);
> - u64 duration = (unsigned long)now -(unsigned long) start_time;
> + now = blk_get_iostat_ticks(q);;

I don't think you can rely on the blk_queue_precise_io_stat() flag in
the completion side. The user can toggle this with data in flight, which
means the completion may use different tick units than the start. I
think you'll need to add a flag to the request in the start accounting
side to know which method to use for the completion.

> @@ -951,6 +951,7 @@ ssize_t part_stat_show(struct device *dev,
> struct request_queue *q = bdev_get_queue(bdev);
> struct disk_stats stat;
> unsigned int inflight;
> + u64 stat_ioticks;
>
> if (queue_is_mq(q))
> inflight = blk_mq_in_flight(q, bdev);
> @@ -959,10 +960,13 @@ ssize_t part_stat_show(struct device *dev,
>
> if (inflight) {
> part_stat_lock();
> - update_io_ticks(bdev, jiffies, true);
> + update_io_ticks(bdev, blk_get_iostat_ticks(q), true);
> part_stat_unlock();
> }
> part_stat_read_all(bdev, &stat);
> + stat_ioticks = (blk_queue_precise_io_stat(q) ?
> + div_u64(stat.io_ticks, NSEC_PER_MSEC) :
> + jiffies_to_msecs(stat.io_ticks));


With the user able to change the precision at will, I think these
io_ticks need to have a consistent unit size. Mixing jiffies and nsecs
is going to create garbage stats output. Could existing io_ticks using
jiffies be converted to jiffies_to_nsecs() so that you only have one
unit to consider?

2022-12-21 23:13:55

by kernel test robot

Subject: Re: [PATCH for-6.2/block V3 2/2] block: Change the granularity of io ticks from ms to ns

Hi Gulam,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on axboe-block/for-next]
[also build test ERROR on song-md/md-next linus/master next-20221220]
[cannot apply to device-mapper-dm/for-next v6.1]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Gulam-Mohamed/block-Data-type-conversion-for-IO-accounting/20221221-121052
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link: https://lore.kernel.org/r/20221221040506.1174644-2-gulam.mohamed%40oracle.com
patch subject: [PATCH for-6.2/block V3 2/2] block: Change the granularity of io ticks from ms to ns
config: i386-randconfig-a014-20221219
compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/afff703cb450ec3806aa1a27406d21fc4eaa1f43
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Gulam-Mohamed/block-Data-type-conversion-for-IO-accounting/20221221-121052
git checkout afff703cb450ec3806aa1a27406d21fc4eaa1f43
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 SHELL=/bin/bash drivers/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>

All errors (new ones prefixed by >>):

>> drivers/md/dm.c:507:6: error: redefinition of 'start_time'
u64 start_time = (blk_queue_precise_io_stat(q) ?
^
drivers/md/dm.c:501:6: note: previous definition is here
u64 start_time = io->start_time;
^
1 error generated.


vim +/start_time +507 drivers/md/dm.c

497
498 static void dm_io_acct(struct dm_io *io, bool end)
499 {
500 struct dm_stats_aux *stats_aux = &io->stats_aux;
501 u64 start_time = io->start_time;
502 struct mapped_device *md = io->md;
503 struct bio *bio = io->orig_bio;
504 unsigned int sectors;
505 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
506
> 507 u64 start_time = (blk_queue_precise_io_stat(q) ?
508 nsecs_to_jiffies(io->start_time) :
509 io->start_time);
510
511 /*
512 * If REQ_PREFLUSH set, don't account payload, it will be
513 * submitted (and accounted) after this flush completes.
514 */
515 if (bio_is_flush_with_data(bio))
516 sectors = 0;
517 else if (likely(!(dm_io_flagged(io, DM_IO_WAS_SPLIT))))
518 sectors = bio_sectors(bio);
519 else
520 sectors = io->sectors;
521
522 if (!end)
523 bdev_start_io_acct(bio->bi_bdev, sectors, bio_op(bio),
524 start_time);
525 else
526 bdev_end_io_acct(bio->bi_bdev, bio_op(bio), start_time);
527
528 if (static_branch_unlikely(&stats_enabled) &&
529 unlikely(dm_stats_used(&md->stats))) {
530 sector_t sector;
531
532 if (likely(!dm_io_flagged(io, DM_IO_WAS_SPLIT)))
533 sector = bio->bi_iter.bi_sector;
534 else
535 sector = bio_end_sector(bio) - io->sector_offset;
536
537 dm_stats_account_io(&md->stats, bio_data_dir(bio),
538 sector, sectors,
539 end, start_time, stats_aux);
540 }
541 }
542

--
0-DAY CI Kernel Test Service
https://01.org/lkp



2022-12-22 15:52:27

by kernel test robot

Subject: Re: [PATCH for-6.2/block V3 1/2] block: Data type conversion for IO accounting

Hi Gulam,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on axboe-block/for-next]
[also build test ERROR on song-md/md-next linus/master next-20221220]
[cannot apply to device-mapper-dm/for-next v6.1]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Gulam-Mohamed/block-Data-type-conversion-for-IO-accounting/20221221-121052
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link: https://lore.kernel.org/r/20221221040506.1174644-1-gulam.mohamed%40oracle.com
patch subject: [PATCH for-6.2/block V3 1/2] block: Data type conversion for IO accounting
config: riscv-randconfig-r042-20221218
compiler: riscv32-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/254be08cd76a258e2d0489fac716308b416dfe6b
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Gulam-Mohamed/block-Data-type-conversion-for-IO-accounting/20221221-121052
git checkout 254be08cd76a258e2d0489fac716308b416dfe6b
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=riscv olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=riscv SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>

All errors (new ones prefixed by >>):

block/blk-core.c: Assembler messages:
>> block/blk-core.c:936: Error: unrecognized opcode `lr.d a6,0(a5)'
>> block/blk-core.c:938: Error: unrecognized opcode `sc.d.rl a3,s2,0(a5)'


vim +936 block/blk-core.c

929
930 void update_io_ticks(struct block_device *part, u64 now, bool end)
931 {
932 u64 stamp;
933 again:
934 stamp = READ_ONCE(part->bd_stamp);
935 if (unlikely(time_after64(now, stamp))) {
> 936 if (likely(try_cmpxchg64(&part->bd_stamp, &stamp, now)))
937 __part_stat_add(part, io_ticks, end ? now - stamp : 1);
> 938 }
939 if (part->bd_partno) {
940 part = bdev_whole(part);
941 goto again;
942 }
943 }
944

--
0-DAY CI Kernel Test Service
https://01.org/lkp



2022-12-23 15:07:33

by Gulam Mohamed

Subject: RE: [PATCH for-6.2/block V3 2/2] block: Change the granularity of io ticks from ms to ns

Hi Keith,

Thanks for reviewing this request. Can you please see my inline comments?

Regards,
Gulam Mohamed.

-----Original Message-----
From: Keith Busch <[email protected]>
Sent: Wednesday, December 21, 2022 10:39 PM
To: Gulam Mohamed <[email protected]>
Cc: [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; Junxiao Bi <[email protected]>; Martin Petersen <[email protected]>; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]; Konrad Wilk <[email protected]>; Joe Jin <[email protected]>; Rajesh Sivaramasubramaniom <[email protected]>; Shminderjit Singh <[email protected]>
Subject: Re: [PATCH for-6.2/block V3 2/2] block: Change the granularity of io ticks from ms to ns

On Wed, Dec 21, 2022 at 04:05:06AM +0000, Gulam Mohamed wrote:
> +u64 blk_get_iostat_ticks(struct request_queue *q) {
> + return (blk_queue_precise_io_stat(q) ? ktime_get_ns() :
> +jiffies); } EXPORT_SYMBOL_GPL(blk_get_iostat_ticks);
> +
> void update_io_ticks(struct block_device *part, u64 now, bool end) {
> u64 stamp;
> @@ -968,20 +980,26 @@ EXPORT_SYMBOL(bdev_start_io_acct);
> u64 bio_start_io_acct(struct bio *bio) {
> return bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio),
> - bio_op(bio), jiffies);
> + bio_op(bio),
> + blk_get_iostat_ticks(bio->bi_bdev->bd_queue));
> }
> EXPORT_SYMBOL_GPL(bio_start_io_acct);
>
> void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
> u64 start_time)
> {
> + u64 now;
> + u64 duration;
> + struct request_queue *q = bdev_get_queue(bdev);
> const int sgrp = op_stat_group(op);
> - u64 now = READ_ONCE(jiffies);
> - u64 duration = (unsigned long)now -(unsigned long) start_time;
> + now = blk_get_iostat_ticks(q);;

I don't think you can rely on the blk_queue_precise_io_stat() flag in the completion side. The user can toggle this with data in flight, which means the completion may use different tick units than the start. I think you'll need to add a flag to the request in the start accounting side to know which method to use for the completion.

[GULAM]: As per my understanding, this may work for a single request_queue implementation. But for multi-queue, request-based accounting may be an issue, as there is a separate queue for each CPU while the time stamp recorded for the block device is a global one. Also, the issue you mentioned, where start and end accounting may update the ticks in different
units for in-flight IOs, would only last for a short while. So, even if it works for MQ, I am trying to understand how feasible it is to make this request-based change for an issue that may exist only for a moment.
Can you please correct me if I am wrong and elaborate on your suggestion so that I can understand it properly?

> @@ -951,6 +951,7 @@ ssize_t part_stat_show(struct device *dev,
> struct request_queue *q = bdev_get_queue(bdev);
> struct disk_stats stat;
> unsigned int inflight;
> + u64 stat_ioticks;
>
> if (queue_is_mq(q))
> inflight = blk_mq_in_flight(q, bdev); @@ -959,10 +960,13 @@ ssize_t
> part_stat_show(struct device *dev,
>
> if (inflight) {
> part_stat_lock();
> - update_io_ticks(bdev, jiffies, true);
> + update_io_ticks(bdev, blk_get_iostat_ticks(q), true);
> part_stat_unlock();
> }
> part_stat_read_all(bdev, &stat);
> + stat_ioticks = (blk_queue_precise_io_stat(q) ?
> + div_u64(stat.io_ticks, NSEC_PER_MSEC) :
> + jiffies_to_msecs(stat.io_ticks));


With the user able to change the precision at will, I think these io_ticks need to have a consistent unit size. Mixing jiffies and nsecs is going to create garbage stats output. Could existing io_ticks using jiffies be converted to jiffies_to_nsecs() so that you only have one unit to consider?
[GULAM]: I am not sure this will work, as we would just be multiplying by 1000000 to convert jiffies to nano-seconds.



2022-12-24 02:38:33

by kernel test robot

Subject: Re: [PATCH for-6.2/block V3 2/2] block: Change the granularity of io ticks from ms to ns

Hi Gulam,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on axboe-block/for-next]
[also build test WARNING on song-md/md-next linus/master next-20221220]
[cannot apply to device-mapper-dm/for-next v6.1]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Gulam-Mohamed/block-Data-type-conversion-for-IO-accounting/20221221-121052
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link: https://lore.kernel.org/r/20221221040506.1174644-2-gulam.mohamed%40oracle.com
patch subject: [PATCH for-6.2/block V3 2/2] block: Change the granularity of io ticks from ms to ns
config: loongarch-randconfig-c44-20221218
compiler: loongarch64-linux-gcc (GCC) 12.1.0

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>

cocci warnings: (new ones prefixed by >>)
>> block/blk-core.c:995:31-32: Unneeded semicolon

--
0-DAY CI Kernel Test Service
https://01.org/lkp



2022-12-25 11:51:34

by Sagi Grimberg

Subject: Re: [PATCH for-6.2/block V3 1/2] block: Data type conversion for IO accounting



On 12/21/22 06:05, Gulam Mohamed wrote:
> Change the data type of the start and end time IO accounting variables in
> the block layer from "unsigned long" to "u64". This is to enable nano-second
> granularity, in the next commit, for devices whose latency is less than a
> milli-second.
>
> Changes from V2 to V3
> =====================
> 1. Changed the data type of all required variables to u64 as part of this
> first patch
> 2. Created a new setting '2' for iostats in sysfs in the next patch
> 3. Changed the code to get the ktime values when iostat=2 in the next patch
>
> Signed-off-by: Gulam Mohamed <[email protected]>
> ---
> block/blk-core.c | 24 ++++++++++++------------
> block/blk.h | 2 +-
> drivers/block/drbd/drbd_int.h | 2 +-
> drivers/block/zram/zram_drv.c | 4 ++--
> drivers/md/bcache/request.c | 10 +++++-----
> drivers/md/dm-core.h | 2 +-
> drivers/md/dm.c | 2 +-
> drivers/md/md.h | 2 +-
> drivers/md/raid1.h | 2 +-
> drivers/md/raid10.h | 2 +-
> drivers/md/raid5.c | 2 +-
> drivers/nvdimm/btt.c | 2 +-
> drivers/nvdimm/pmem.c | 2 +-
> include/linux/blk_types.h | 2 +-
> include/linux/blkdev.h | 12 ++++++------
> include/linux/part_stat.h | 2 +-

nvme-mpath now also has stats, so struct nvme_request should also be
updated.

2023-01-17 19:42:41

by Mike Snitzer

[permalink] [raw]
Subject: Re: [PATCH for-6.2/block V3 2/2] block: Change the granularity of io ticks from ms to ns

[comments/questions inlined below]

On Tue, Dec 20 2022 at 11:05P -0500,
Gulam Mohamed <[email protected]> wrote:

> Problem Desc
> ============
> The "iostat" user-space utility was showing %util as 100% for the disks
> which has latencies less than a milli-second i.e for latencies in the
> range of micro-seconds and below.
>
> Root Cause
> ==========
> The IO accounting in block layer is currently done by updating the
> io_ticks in jiffies which is of milli-seconds granularity. Due to this,
> for the devices with IO latencies less than a milli-second, the latency
> will be accounted as 1 milli-second even though it is in the range of
> micro-seconds. This was causing the iostat command to show %util
> as 100% which is incorrect.
>
> Recreating the issue
> ========================
> Setup
> -----
> Devices: NVMe 24 devices
> Model number: 4610 (Intel)
>
> fio
> ---
> [global]
> bs=4K
> iodepth=1
> direct=1
> ioengine=libaio
> group_reporting
> time_based
> runtime=100
> thinktime=1ms
> numjobs=1
> name=raw-write
> rw=randrw
> ignore_error=EIO:EIO
> [job1]
> filename=/dev/nvme0n1
>
> iostat o/p
> ----------
>
> Device %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
> nvme3n1 0.00 0.05 0.00 75.38 0.50 0.00 0.00 100.00
>
> Device %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
> nvme3n1 0.00 0.05 0.00 74.74 0.50 0.00 0.00 100.00
>
> Solution
> ========
> Use ktime_get_ns() to update the disk_stats io_ticks so that the io_ticks
> are updated for every io start and end times.
>
> Issues using ktime
> ==================
>
> Using ktime_get_ns() has a performance overhead, as ktime goes ahead
> and reads the clock every time it is called. The following test environment
> was used by Jens Axboe, on which t/io_uring was run and showed a
> high performance drop.
>
> Devices
> -------
> SSDs: P5800X
> No of devices: 24
>
> io_uring config
> ---------------
> Polled IO
> iostats: Enabled
> Reads: Random
> Block Size: 512
> QDepth: 128
> Batch Submit: 32
> Batch Complete: 32
> No of Threads: 24
>
> With the above environment and the ktime patch, there was a performance
> drop of ~25% compared to iostats disabled and a ~19% drop compared to the
> current iostats-enabled path. This performance drop is high.
>
> Suggestion from Jens Axboe
> ==========================
> Jens Axboe suggested to split the bigger patch into two as follows:
>
> 1. In first patch, change all the types from unsigned long to u64, that
> can be done while retaining jiffies.
>
> 2. In second patch, add an iostats == 2 setting, which enables the higher
> resolution mode using ktime. We'd still default to 1, lower granularity
> iostats enabled.
>
> Fix details
> ===========
> 1. Use ktime_get_ns() to get the current nano-seconds to update the
> io_ticks for start and end time stamps in block layer for io accounting
>
> 2. Create a new setting '2' in sysfs for the iostats variable, i.e. for
> /sys/block/<device-name>/queue/iostats, to enable iostat (io
> accounting) with nano-seconds (ktime) granularity. This setting
> should be enabled only if iostat is needed in high-resolution
> mode, as it incurs a high performance cost
>
> 3. Earlier available settings were 0 and 1 for disable and enable io
> accounting with milli-seconds granularity (jiffies)
>
> Testing
> =======
> Ran the t/io_uring command with following setup:
>
> Devices
> -------
> SSDs: P4610
> No of devices: 8
>
> io_uring config
> ---------------
> Polled IO
> iostats: Enabled
> Reads: Random
> Block Size: 512
> QDepth: 128
> Batch Submit: 32
> Batch Complete: 32
> No of Threads: 24
>
> io_uring o/p
> ------------
> iostat=0, with patch: Maximum IOPS=10.09M
> iostat=1, with patch: Maximum IOPS=9.84M
> iostat=2, with patch: Maximum IOPS=9.48M
>
> Changes from V2 to V3
> =====================
> 1. Changed all the required variables data-type to u64 as a first patch
> 2. Create a new setting '2' for iostats in sysfs in this patch
> 3. Change the code to get the ktime values when iostat=2, in this patch
>
> Signed-off-by: Gulam Mohamed <[email protected]>
> ---
> block/blk-core.c | 26 +++++++++++++++++----
> block/blk-mq.c | 4 ++--
> block/blk-sysfs.c | 39 ++++++++++++++++++++++++++++++-
> block/genhd.c | 18 ++++++++++----
> drivers/block/drbd/drbd_debugfs.c | 12 ++++++++--
> drivers/block/zram/zram_drv.c | 3 ++-
> drivers/md/dm.c | 13 +++++++++--
> include/linux/blkdev.h | 4 ++++
> 8 files changed, 103 insertions(+), 16 deletions(-)
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 5670032fe932..0b5e4eb909a5 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -927,6 +927,18 @@ int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
> }
> EXPORT_SYMBOL_GPL(iocb_bio_iopoll);
>
> +/*
> + * Get the time based upon the available granularity for io accounting
> + * If the resolution mode is set to precise (2) i.e
> + * (/sys/block/<device>/queue/iostats = 2), then this will return time
> + * in nano-seconds else this will return time in jiffies
> + */
> +u64 blk_get_iostat_ticks(struct request_queue *q)
> +{
> + return (blk_queue_precise_io_stat(q) ? ktime_get_ns() : jiffies);
> +}
> +EXPORT_SYMBOL_GPL(blk_get_iostat_ticks);
> +

Would prefer to see blk_get_iostat_ticks tagged with 'static inline'
and moved to blkdev.h (obviously below blk_queue_precise_io_stat).
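
For reference, a sketch of what that might look like in include/linux/blkdev.h, assuming the helper otherwise stays as proposed:

	static inline u64 blk_get_iostat_ticks(struct request_queue *q)
	{
		/* ns when precise iostats are enabled, jiffies otherwise */
		return blk_queue_precise_io_stat(q) ? ktime_get_ns() : jiffies;
	}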

Also, just a general comment: I've historically been a bigger (ab)user
of jump_labels to avoid excess branching costs per IO, e.g. see commit
442761fd2b29 ("dm: conditionally enable branching for less used features").
It could be there is an opportunity for a follow-on patch that
leverages them? Or should that just be left to each driver to decide
to do with its own resources (rather than have block core provide that)?

(if nothing in the system ever uses QUEUE_FLAG_PRECISE_IO_STAT then
it'd be nice to avoid needless branching).
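
Purely as a sketch of the jump_label idea (the key name and where it gets incremented are hypothetical, and having block core own the key is only one option):

	DEFINE_STATIC_KEY_FALSE(precise_iostat_key);

	/* e.g. bumped from queue_iostats_store() when a queue switches to mode 2 */
	void blk_precise_iostat_enable(void)
	{
		static_branch_inc(&precise_iostat_key);
	}

	static inline u64 blk_get_iostat_ticks(struct request_queue *q)
	{
		/* skips the per-queue flag test entirely while no queue uses mode 2 */
		if (static_branch_unlikely(&precise_iostat_key) &&
		    blk_queue_precise_io_stat(q))
			return ktime_get_ns();
		return jiffies;
	}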

<snip>

> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 011a85ea40da..1bb58a0b8cd1 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -482,7 +482,11 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
>
> u64 dm_start_time_ns_from_clone(struct bio *bio)
> {
> - return jiffies_to_nsecs(clone_to_tio(bio)->io->start_time);
> + struct request_queue *q = bdev_get_queue(bio->bi_bdev);
> + u64 start_time_ns = (blk_queue_precise_io_stat(q) ?
> + clone_to_tio(bio)->io->start_time :
> + jiffies_to_nsecs(clone_to_tio(bio)->io->start_time));
> + return start_time_ns;
> }
> EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
>
> @@ -498,6 +502,11 @@ static void dm_io_acct(struct dm_io *io, bool end)
> struct mapped_device *md = io->md;
> struct bio *bio = io->orig_bio;
> unsigned int sectors;
> + struct request_queue *q = bdev_get_queue(bio->bi_bdev);
> +
> + u64 start_time = (blk_queue_precise_io_stat(q) ?
> + nsecs_to_jiffies(io->start_time) :
> + io->start_time);
>
> /*
> * If REQ_PREFLUSH set, don't account payload, it will be
> @@ -589,7 +598,7 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
> io->orig_bio = bio;
> io->md = md;
> spin_lock_init(&io->lock);
> - io->start_time = jiffies;
> + io->start_time = blk_get_iostat_ticks(bio->bi_bdev->bd_queue);
> io->flags = 0;
>
> if (static_branch_unlikely(&stats_enabled))

The above seems correct, by checking the top-level DM device's
disposition on QUEUE_FLAG_PRECISE_IO_STAT, but I'm now wondering
if there should be code that ensures consistency across stacked
devices (which DM is the biggest creator/consumer of)?

dm_table_set_restrictions() deals with such inconsistencies at DM
device creation time (so basically: if any device in the DM table has
QUEUE_FLAG_PRECISE_IO_STAT set then the top-level DM device should
inherit that queue flag). A user could still override it but at least
the default will be sane initially.
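
A rough sketch of that inheritance in dm_table_set_restrictions(), modeled on the existing *_dev_attr callbacks (device_precise_iostat() is a hypothetical helper, not something already in the tree):

	static int device_precise_iostat(struct dm_target *ti, struct dm_dev *dev,
					 sector_t start, sector_t len, void *data)
	{
		return blk_queue_precise_io_stat(bdev_get_queue(dev->bdev));
	}

	/* in dm_table_set_restrictions(): inherit the flag if any member has it */
	if (dm_table_any_dev_attr(t, device_precise_iostat, NULL))
		blk_queue_flag_set(QUEUE_FLAG_PRECISE_IO_STAT, q);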

Mike