Hi Jens,
they series contains various improvement for block I/O accounting. The
first bunch of patches switch the bio based drivers to better accounting
helpers compared to the current mess. The end contains a fix and various
performanc improvements. Most of this comes from a series Konstantin
sent a few weeks ago, rebased on changes that landed in your tree since
and my change to always use the percpu version of the disk stats.
Switch bcache to use the nicer bio accounting helpers, and call the
routines where we also sample the start time to give coherent accounting
results.
Signed-off-by: Christoph Hellwig <[email protected]>
---
drivers/md/bcache/request.c | 18 ++++--------------
1 file changed, 4 insertions(+), 14 deletions(-)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 77d1a26975174..22b483527176b 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -668,9 +668,7 @@ static void backing_request_endio(struct bio *bio)
static void bio_complete(struct search *s)
{
if (s->orig_bio) {
- generic_end_io_acct(s->d->disk->queue, bio_op(s->orig_bio),
- &s->d->disk->part0, s->start_time);
-
+ bio_end_io_acct(s->orig_bio, s->start_time);
trace_bcache_request_end(s->d, s->orig_bio);
s->orig_bio->bi_status = s->iop.status;
bio_endio(s->orig_bio);
@@ -730,7 +728,7 @@ static inline struct search *search_alloc(struct bio *bio,
s->recoverable = 1;
s->write = op_is_write(bio_op(bio));
s->read_dirty_data = 0;
- s->start_time = jiffies;
+ s->start_time = bio_start_io_acct(bio);
s->iop.c = d->c;
s->iop.bio = NULL;
@@ -1082,8 +1080,7 @@ static void detached_dev_end_io(struct bio *bio)
bio->bi_end_io = ddip->bi_end_io;
bio->bi_private = ddip->bi_private;
- generic_end_io_acct(ddip->d->disk->queue, bio_op(bio),
- &ddip->d->disk->part0, ddip->start_time);
+ bio_end_io_acct(bio, ddip->start_time);
if (bio->bi_status) {
struct cached_dev *dc = container_of(ddip->d,
@@ -1108,7 +1105,7 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
*/
ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO);
ddip->d = d;
- ddip->start_time = jiffies;
+ ddip->start_time = bio_start_io_acct(bio);
ddip->bi_end_io = bio->bi_end_io;
ddip->bi_private = bio->bi_private;
bio->bi_end_io = detached_dev_end_io;
@@ -1190,11 +1187,6 @@ blk_qc_t cached_dev_make_request(struct request_queue *q, struct bio *bio)
}
}
- generic_start_io_acct(q,
- bio_op(bio),
- bio_sectors(bio),
- &d->disk->part0);
-
bio_set_dev(bio, dc->bdev);
bio->bi_iter.bi_sector += dc->sb.data_offset;
@@ -1311,8 +1303,6 @@ blk_qc_t flash_dev_make_request(struct request_queue *q, struct bio *bio)
return BLK_QC_T_NONE;
}
- generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0);
-
s = search_alloc(bio, d);
cl = &s->cl;
bio = &s->bio.bio;
--
2.26.2
Switch dm to use the nicer bio accounting helpers.
Signed-off-by: Christoph Hellwig <[email protected]>
---
drivers/md/dm.c | 9 ++-------
1 file changed, 2 insertions(+), 7 deletions(-)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f215b86664484..3f39fa1ac756e 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -681,11 +681,7 @@ static void start_io_acct(struct dm_io *io)
struct mapped_device *md = io->md;
struct bio *bio = io->orig_bio;
- io->start_time = jiffies;
-
- generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
- &dm_disk(md)->part0);
-
+ io->start_time = bio_start_io_acct(bio);
if (unlikely(dm_stats_used(&md->stats)))
dm_stats_account_io(&md->stats, bio_data_dir(bio),
bio->bi_iter.bi_sector, bio_sectors(bio),
@@ -698,8 +694,7 @@ static void end_io_acct(struct dm_io *io)
struct bio *bio = io->orig_bio;
unsigned long duration = jiffies - io->start_time;
- generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
- io->start_time);
+ bio_end_io_acct(bio, io->start_time);
if (unlikely(dm_stats_used(&md->stats)))
dm_stats_account_io(&md->stats, bio_data_dir(bio),
--
2.26.2
Switch dm to use the nicer bio accounting helpers.
Signed-off-by: Christoph Hellwig <[email protected]>
---
drivers/nvdimm/blk.c | 6 ++++--
drivers/nvdimm/btt.c | 6 ++++--
drivers/nvdimm/nd.h | 19 -------------------
drivers/nvdimm/pmem.c | 6 ++++--
4 files changed, 12 insertions(+), 25 deletions(-)
diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index 43751fab9d36a..036e23aef9b04 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -178,7 +178,9 @@ static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio)
bip = bio_integrity(bio);
nsblk = q->queuedata;
rw = bio_data_dir(bio);
- do_acct = nd_iostat_start(bio, &start);
+ do_acct = blk_queue_io_stat(bio->bi_disk->queue);
+ if (do_acct)
+ start = bio_start_io_acct(bio);
bio_for_each_segment(bvec, bio, iter) {
unsigned int len = bvec.bv_len;
@@ -195,7 +197,7 @@ static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio)
}
}
if (do_acct)
- nd_iostat_end(bio, start);
+ bio_end_io_acct(bio, start);
bio_endio(bio);
return BLK_QC_T_NONE;
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 3b09419218d6f..90c0c4bbe77b9 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1452,7 +1452,9 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio)
if (!bio_integrity_prep(bio))
return BLK_QC_T_NONE;
- do_acct = nd_iostat_start(bio, &start);
+ do_acct = blk_queue_io_stat(bio->bi_disk->queue);
+ if (do_acct)
+ start = bio_start_io_acct(bio);
bio_for_each_segment(bvec, bio, iter) {
unsigned int len = bvec.bv_len;
@@ -1477,7 +1479,7 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio)
}
}
if (do_acct)
- nd_iostat_end(bio, start);
+ bio_end_io_acct(bio, start);
bio_endio(bio);
return BLK_QC_T_NONE;
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 85dbb2a322b9b..85c1ae813ea31 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -396,25 +396,6 @@ static inline int nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
#endif
int nd_blk_region_init(struct nd_region *nd_region);
int nd_region_activate(struct nd_region *nd_region);
-void __nd_iostat_start(struct bio *bio, unsigned long *start);
-static inline bool nd_iostat_start(struct bio *bio, unsigned long *start)
-{
- struct gendisk *disk = bio->bi_disk;
-
- if (!blk_queue_io_stat(disk->queue))
- return false;
-
- *start = jiffies;
- generic_start_io_acct(disk->queue, bio_op(bio), bio_sectors(bio),
- &disk->part0);
- return true;
-}
-static inline void nd_iostat_end(struct bio *bio, unsigned long start)
-{
- struct gendisk *disk = bio->bi_disk;
-
- generic_end_io_acct(disk->queue, bio_op(bio), &disk->part0, start);
-}
static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector,
unsigned int len)
{
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 2df6994acf836..97f948f8f4e62 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -202,7 +202,9 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
if (bio->bi_opf & REQ_PREFLUSH)
ret = nvdimm_flush(nd_region, bio);
- do_acct = nd_iostat_start(bio, &start);
+ do_acct = blk_queue_io_stat(bio->bi_disk->queue);
+ if (do_acct)
+ start = bio_start_io_acct(bio);
bio_for_each_segment(bvec, bio, iter) {
if (op_is_write(bio_op(bio)))
rc = pmem_do_write(pmem, bvec.bv_page, bvec.bv_offset,
@@ -216,7 +218,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
}
}
if (do_acct)
- nd_iostat_end(bio, start);
+ bio_end_io_acct(bio, start);
if (bio->bi_opf & REQ_FUA)
ret = nvdimm_flush(nd_region, bio);
--
2.26.2
All callers are in blk-core.c, so move update_io_ticks over.
Signed-off-by: Christoph Hellwig <[email protected]>
---
block/bio.c | 16 ----------------
block/blk-core.c | 15 +++++++++++++++
block/blk.h | 1 -
3 files changed, 15 insertions(+), 17 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index 3e89c7b37855a..5235da6434aab 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1376,22 +1376,6 @@ void bio_check_pages_dirty(struct bio *bio)
schedule_work(&bio_dirty_work);
}
-void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
-{
- unsigned long stamp;
-again:
- stamp = READ_ONCE(part->stamp);
- if (unlikely(stamp != now)) {
- if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) {
- __part_stat_add(part, io_ticks, end ? now - stamp : 1);
- }
- }
- if (part->partno) {
- part = &part_to_disk(part)->part0;
- goto again;
- }
-}
-
static inline bool bio_remaining_done(struct bio *bio)
{
/*
diff --git a/block/blk-core.c b/block/blk-core.c
index 8973104f88d90..c1675d43c2da0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1381,6 +1381,21 @@ unsigned int blk_rq_err_bytes(const struct request *rq)
}
EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
+static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
+{
+ unsigned long stamp;
+again:
+ stamp = READ_ONCE(part->stamp);
+ if (unlikely(stamp != now)) {
+ if (likely(cmpxchg(&part->stamp, stamp, now) == stamp))
+ __part_stat_add(part, io_ticks, end ? now - stamp : 1);
+ }
+ if (part->partno) {
+ part = &part_to_disk(part)->part0;
+ goto again;
+ }
+}
+
static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
if (req->part && blk_do_io_stat(req)) {
diff --git a/block/blk.h b/block/blk.h
index 5db4ec1e85f7b..bdf5e94467aa2 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -344,7 +344,6 @@ void blk_queue_free_zone_bitmaps(struct request_queue *q);
static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {}
#endif
-void update_io_ticks(struct hd_struct *part, unsigned long now, bool end);
struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector);
int blk_alloc_devt(struct hd_struct *part, dev_t *devt);
--
2.26.2
From: Konstantin Khlebnikov <[email protected]>
Move the non-"new_io" branch of blk_account_io_start() into separate
function. Fix merge accounting for discards (they were counted as write
merges).
The new blk_account_io_merge_bio() doesn't call update_io_ticks() unlike
blk_account_io_start(), as there is no reason for that.
Signed-off-by: Konstantin Khlebnikov <[email protected]>
[hch: rebased]
Signed-off-by: Christoph Hellwig <[email protected]>
---
block/blk-core.c | 25 ++++++++++++++++---------
block/blk-exec.c | 2 +-
block/blk-mq.c | 2 +-
block/blk.h | 2 +-
4 files changed, 19 insertions(+), 12 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index c1675d43c2da0..bf2f7d4bc0c1c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -636,6 +636,16 @@ void blk_put_request(struct request *req)
}
EXPORT_SYMBOL(blk_put_request);
+static void blk_account_io_merge_bio(struct request *req)
+{
+ if (!blk_do_io_stat(req))
+ return;
+
+ part_stat_lock();
+ part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
+ part_stat_unlock();
+}
+
bool bio_attempt_back_merge(struct request *req, struct bio *bio,
unsigned int nr_segs)
{
@@ -656,7 +666,7 @@ bool bio_attempt_back_merge(struct request *req, struct bio *bio,
bio_crypt_free_ctx(bio);
- blk_account_io_start(req, false);
+ blk_account_io_merge_bio(req);
return true;
}
@@ -682,7 +692,7 @@ bool bio_attempt_front_merge(struct request *req, struct bio *bio,
bio_crypt_do_front_merge(req, bio);
- blk_account_io_start(req, false);
+ blk_account_io_merge_bio(req);
return true;
}
@@ -704,7 +714,7 @@ bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
req->__data_len += bio->bi_iter.bi_size;
req->nr_phys_segments = segments + 1;
- blk_account_io_start(req, false);
+ blk_account_io_merge_bio(req);
return true;
no_merge:
req_set_nomerge(q, req);
@@ -1329,7 +1339,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
return BLK_STS_IOERR;
if (blk_queue_io_stat(q))
- blk_account_io_start(rq, true);
+ blk_account_io_start(rq);
/*
* Since we have a scheduler attached on the top device,
@@ -1433,16 +1443,13 @@ void blk_account_io_done(struct request *req, u64 now)
}
}
-void blk_account_io_start(struct request *rq, bool new_io)
+void blk_account_io_start(struct request *rq)
{
if (!blk_do_io_stat(rq))
return;
part_stat_lock();
- if (!new_io)
- part_stat_inc(rq->part, merges[rq_data_dir(rq)]);
- else
- rq->part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
+ rq->part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
update_io_ticks(rq->part, jiffies, false);
part_stat_unlock();
}
diff --git a/block/blk-exec.c b/block/blk-exec.c
index e20a852ae432d..85324d53d072f 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -55,7 +55,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
rq->rq_disk = bd_disk;
rq->end_io = done;
- blk_account_io_start(rq, true);
+ blk_account_io_start(rq);
/*
* don't check dying flag for MQ because the request won't
diff --git a/block/blk-mq.c b/block/blk-mq.c
index cac11945f6023..c606c74463ccd 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1822,7 +1822,7 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
blk_rq_bio_prep(rq, bio, nr_segs);
blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
- blk_account_io_start(rq, true);
+ blk_account_io_start(rq);
}
static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
diff --git a/block/blk.h b/block/blk.h
index 0ecba2ab383d6..428f7e5d70a86 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -185,7 +185,7 @@ bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **same_queue_rq);
-void blk_account_io_start(struct request *req, bool new_io);
+void blk_account_io_start(struct request *req);
void blk_account_io_done(struct request *req, u64 now);
/*
--
2.26.2
From: Konstantin Khlebnikov <[email protected]>
The RCU lock is required only in disk_map_sector_rcu() to lookup the
partition. After that request holds reference to related hd_struct.
Replace get_cpu() with preempt_disable() - returned cpu index is unused.
Signed-off-by: Konstantin Khlebnikov <[email protected]>
[hch: rebased]
Signed-off-by: Christoph Hellwig <[email protected]>
---
block/genhd.c | 11 ++++++++---
include/linux/part_stat.h | 4 ++--
2 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/block/genhd.c b/block/genhd.c
index 3e7df0a3e6bb0..1a76593276644 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -321,11 +321,12 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
struct hd_struct *part;
int i;
+ rcu_read_lock();
ptbl = rcu_dereference(disk->part_tbl);
part = rcu_dereference(ptbl->last_lookup);
if (part && sector_in_part(part, sector) && hd_struct_try_get(part))
- return part;
+ goto out_unlock;
for (i = 1; i < ptbl->len; i++) {
part = rcu_dereference(ptbl->part[i]);
@@ -339,10 +340,14 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
if (!hd_struct_try_get(part))
break;
rcu_assign_pointer(ptbl->last_lookup, part);
- return part;
+ goto out_unlock;
}
}
- return &disk->part0;
+
+ part = &disk->part0;
+out_unlock:
+ rcu_read_unlock();
+ return part;
}
/**
diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
index 6644197980b92..a6b0938ce82e9 100644
--- a/include/linux/part_stat.h
+++ b/include/linux/part_stat.h
@@ -21,8 +21,8 @@ struct disk_stats {
*
* part_stat_read() can be called at any time.
*/
-#define part_stat_lock() ({ rcu_read_lock(); get_cpu(); })
-#define part_stat_unlock() do { put_cpu(); rcu_read_unlock(); } while (0)
+#define part_stat_lock() preempt_disable()
+#define part_stat_unlock() preempt_enable()
#define part_stat_get_cpu(part, field, cpu) \
(per_cpu_ptr((part)->dkstats, (cpu))->field)
--
2.26.2
We only need the stats lock (aka preempt_disable()) for updating the
states, not for looking up or dropping the hd_struct reference.
Signed-off-by: Christoph Hellwig <[email protected]>
---
block/blk-core.c | 5 +++--
block/blk-merge.c | 3 ++-
2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index bf2f7d4bc0c1c..a01fb2b508f0e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1437,9 +1437,9 @@ void blk_account_io_done(struct request *req, u64 now)
update_io_ticks(part, jiffies, true);
part_stat_inc(part, ios[sgrp]);
part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
+ part_stat_unlock();
hd_struct_put(part);
- part_stat_unlock();
}
}
@@ -1448,8 +1448,9 @@ void blk_account_io_start(struct request *rq)
if (!blk_do_io_stat(rq))
return;
- part_stat_lock();
rq->part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
+
+ part_stat_lock();
update_io_ticks(rq->part, jiffies, false);
part_stat_unlock();
}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index c3beae5c1be71..f0b0bae075a0c 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -674,8 +674,9 @@ static void blk_account_io_merge_request(struct request *req)
if (blk_do_io_stat(req)) {
part_stat_lock();
part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
- hd_struct_put(req->part);
part_stat_unlock();
+
+ hd_struct_put(req->part);
}
}
--
2.26.2
Switch zram to use the nicer bio accounting helpers, and as part of that
ensure each bio is counted as a single I/O request.
Signed-off-by: Christoph Hellwig <[email protected]>
---
drivers/block/zram/zram_drv.c | 24 ++++++++++--------------
1 file changed, 10 insertions(+), 14 deletions(-)
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index ebb234f36909c..6e2ad90b17a37 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1510,13 +1510,8 @@ static void zram_bio_discard(struct zram *zram, u32 index,
static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
int offset, unsigned int op, struct bio *bio)
{
- unsigned long start_time = jiffies;
- struct request_queue *q = zram->disk->queue;
int ret;
- generic_start_io_acct(q, op, bvec->bv_len >> SECTOR_SHIFT,
- &zram->disk->part0);
-
if (!op_is_write(op)) {
atomic64_inc(&zram->stats.num_reads);
ret = zram_bvec_read(zram, bvec, index, offset, bio);
@@ -1526,8 +1521,6 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
ret = zram_bvec_write(zram, bvec, index, offset, bio);
}
- generic_end_io_acct(q, op, &zram->disk->part0, start_time);
-
zram_slot_lock(zram, index);
zram_accessed(zram, index);
zram_slot_unlock(zram, index);
@@ -1548,6 +1541,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
u32 index;
struct bio_vec bvec;
struct bvec_iter iter;
+ unsigned long start_time;
index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
offset = (bio->bi_iter.bi_sector &
@@ -1563,6 +1557,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
break;
}
+ start_time = bio_start_io_acct(bio);
bio_for_each_segment(bvec, bio, iter) {
struct bio_vec bv = bvec;
unsigned int unwritten = bvec.bv_len;
@@ -1571,8 +1566,10 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset,
unwritten);
if (zram_bvec_rw(zram, &bv, index, offset,
- bio_op(bio), bio) < 0)
- goto out;
+ bio_op(bio), bio) < 0) {
+ bio->bi_status = BLK_STS_IOERR;
+ break;
+ }
bv.bv_offset += bv.bv_len;
unwritten -= bv.bv_len;
@@ -1580,12 +1577,8 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
update_position(&index, &offset, &bv);
} while (unwritten);
}
-
+ bio_end_io_acct(bio, start_time);
bio_endio(bio);
- return;
-
-out:
- bio_io_error(bio);
}
/*
@@ -1633,6 +1626,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
u32 index;
struct zram *zram;
struct bio_vec bv;
+ unsigned long start_time;
if (PageTransHuge(page))
return -ENOTSUPP;
@@ -1651,7 +1645,9 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
bv.bv_len = PAGE_SIZE;
bv.bv_offset = 0;
+ start_time = disk_start_io_acct(bdev->bd_disk, SECTORS_PER_PAGE, op);
ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL);
+ disk_end_io_acct(bdev->bd_disk, op, start_time);
out:
/*
* If I/O fails, just return error(ie, non-zero) without
--
2.26.2
percpu variables have a perfectly fine working stub implementation
for UP kernels, so use that.
Signed-off-by: Christoph Hellwig <[email protected]>
---
block/blk.h | 2 +-
block/genhd.c | 12 +++------
block/partitions/core.c | 5 ++--
include/linux/genhd.h | 13 ---------
include/linux/part_stat.h | 55 ++++++++-------------------------------
5 files changed, 18 insertions(+), 69 deletions(-)
diff --git a/block/blk.h b/block/blk.h
index bdf5e94467aa2..0ecba2ab383d6 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -378,7 +378,7 @@ static inline void hd_struct_put(struct hd_struct *part)
static inline void hd_free_part(struct hd_struct *part)
{
- free_part_stats(part);
+ free_percpu(part->dkstats);
kfree(part->info);
percpu_ref_exit(&part->ref);
}
diff --git a/block/genhd.c b/block/genhd.c
index 094ed90964964..3e7df0a3e6bb0 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -92,7 +92,6 @@ const char *bdevname(struct block_device *bdev, char *buf)
}
EXPORT_SYMBOL(bdevname);
-#ifdef CONFIG_SMP
static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat)
{
int cpu;
@@ -112,12 +111,6 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat)
stat->io_ticks += ptr->io_ticks;
}
}
-#else /* CONFIG_SMP */
-static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat)
-{
- memcpy(stat, &part->dkstats, sizeof(struct disk_stats));
-}
-#endif /* CONFIG_SMP */
static unsigned int part_in_flight(struct request_queue *q,
struct hd_struct *part)
@@ -1688,14 +1681,15 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
if (disk) {
- if (!init_part_stats(&disk->part0)) {
+ disk->part0.dkstats = alloc_percpu(struct disk_stats);
+ if (!disk->part0.dkstats) {
kfree(disk);
return NULL;
}
init_rwsem(&disk->lookup_sem);
disk->node_id = node_id;
if (disk_expand_part_tbl(disk, 0)) {
- free_part_stats(&disk->part0);
+ free_percpu(disk->part0.dkstats);
kfree(disk);
return NULL;
}
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 297004fd22648..78951e33b2d7c 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -387,7 +387,8 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno,
if (!p)
return ERR_PTR(-EBUSY);
- if (!init_part_stats(p)) {
+ p->dkstats = alloc_percpu(struct disk_stats);
+ if (!p->dkstats) {
err = -ENOMEM;
goto out_free;
}
@@ -468,7 +469,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno,
out_free_info:
kfree(p->info);
out_free_stats:
- free_part_stats(p);
+ free_percpu(p->dkstats);
out_free:
kfree(p);
return ERR_PTR(err);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index a9384449465a3..f0d6d77309a54 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -39,15 +39,6 @@ extern struct class block_class;
#include <linux/fs.h>
#include <linux/workqueue.h>
-struct disk_stats {
- u64 nsecs[NR_STAT_GROUPS];
- unsigned long sectors[NR_STAT_GROUPS];
- unsigned long ios[NR_STAT_GROUPS];
- unsigned long merges[NR_STAT_GROUPS];
- unsigned long io_ticks;
- local_t in_flight[2];
-};
-
#define PARTITION_META_INFO_VOLNAMELTH 64
/*
* Enough for the string representation of any kind of UUID plus NULL.
@@ -72,11 +63,7 @@ struct hd_struct {
seqcount_t nr_sects_seq;
#endif
unsigned long stamp;
-#ifdef CONFIG_SMP
struct disk_stats __percpu *dkstats;
-#else
- struct disk_stats dkstats;
-#endif
struct percpu_ref ref;
sector_t alignment_offset;
diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
index ece607607a864..6644197980b92 100644
--- a/include/linux/part_stat.h
+++ b/include/linux/part_stat.h
@@ -4,19 +4,23 @@
#include <linux/genhd.h>
+struct disk_stats {
+ u64 nsecs[NR_STAT_GROUPS];
+ unsigned long sectors[NR_STAT_GROUPS];
+ unsigned long ios[NR_STAT_GROUPS];
+ unsigned long merges[NR_STAT_GROUPS];
+ unsigned long io_ticks;
+ local_t in_flight[2];
+};
+
/*
* Macros to operate on percpu disk statistics:
*
- * {disk|part|all}_stat_{add|sub|inc|dec}() modify the stat counters
- * and should be called between disk_stat_lock() and
- * disk_stat_unlock().
+ * {disk|part|all}_stat_{add|sub|inc|dec}() modify the stat counters and should
+ * be called between disk_stat_lock() and disk_stat_unlock().
*
* part_stat_read() can be called at any time.
- *
- * part_stat_{add|set_all}() and {init|free}_part_stats are for
- * internal use only.
*/
-#ifdef CONFIG_SMP
#define part_stat_lock() ({ rcu_read_lock(); get_cpu(); })
#define part_stat_unlock() do { put_cpu(); rcu_read_unlock(); } while (0)
@@ -44,43 +48,6 @@ static inline void part_stat_set_all(struct hd_struct *part, int value)
sizeof(struct disk_stats));
}
-static inline int init_part_stats(struct hd_struct *part)
-{
- part->dkstats = alloc_percpu(struct disk_stats);
- if (!part->dkstats)
- return 0;
- return 1;
-}
-
-static inline void free_part_stats(struct hd_struct *part)
-{
- free_percpu(part->dkstats);
-}
-
-#else /* !CONFIG_SMP */
-#define part_stat_lock() ({ rcu_read_lock(); 0; })
-#define part_stat_unlock() rcu_read_unlock()
-
-#define part_stat_get(part, field) ((part)->dkstats.field)
-#define part_stat_get_cpu(part, field, cpu) part_stat_get(part, field)
-#define part_stat_read(part, field) part_stat_get(part, field)
-
-static inline void part_stat_set_all(struct hd_struct *part, int value)
-{
- memset(&part->dkstats, value, sizeof(struct disk_stats));
-}
-
-static inline int init_part_stats(struct hd_struct *part)
-{
- return 1;
-}
-
-static inline void free_part_stats(struct hd_struct *part)
-{
-}
-
-#endif /* CONFIG_SMP */
-
#define part_stat_read_accum(part, field) \
(part_stat_read(part, field[STAT_READ]) + \
part_stat_read(part, field[STAT_WRITE]) + \
--
2.26.2
Switch rsxx to use the nicer bio accounting helpers.
Signed-off-by: Christoph Hellwig <[email protected]>
---
drivers/block/rsxx/dev.c | 19 ++-----------------
1 file changed, 2 insertions(+), 17 deletions(-)
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index 8ffa8260dcafe..3ba07ab30c84f 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -96,20 +96,6 @@ static const struct block_device_operations rsxx_fops = {
.ioctl = rsxx_blkdev_ioctl,
};
-static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio)
-{
- generic_start_io_acct(card->queue, bio_op(bio), bio_sectors(bio),
- &card->gendisk->part0);
-}
-
-static void disk_stats_complete(struct rsxx_cardinfo *card,
- struct bio *bio,
- unsigned long start_time)
-{
- generic_end_io_acct(card->queue, bio_op(bio),
- &card->gendisk->part0, start_time);
-}
-
static void bio_dma_done_cb(struct rsxx_cardinfo *card,
void *cb_data,
unsigned int error)
@@ -121,7 +107,7 @@ static void bio_dma_done_cb(struct rsxx_cardinfo *card,
if (atomic_dec_and_test(&meta->pending_dmas)) {
if (!card->eeh_state && card->gendisk)
- disk_stats_complete(card, meta->bio, meta->start_time);
+ bio_end_io_acct(meta->bio, meta->start_time);
if (atomic_read(&meta->error))
bio_io_error(meta->bio);
@@ -167,10 +153,9 @@ static blk_qc_t rsxx_make_request(struct request_queue *q, struct bio *bio)
bio_meta->bio = bio;
atomic_set(&bio_meta->error, 0);
atomic_set(&bio_meta->pending_dmas, 0);
- bio_meta->start_time = jiffies;
if (!unlikely(card->halt))
- disk_stats_start(card, bio);
+ bio_meta->start_time = bio_start_io_acct(bio);
dev_dbg(CARD_TO_DEV(card), "BIO[%c]: meta: %p addr8: x%llx size: %d\n",
bio_data_dir(bio) ? 'W' : 'R', bio_meta,
--
2.26.2
On 25/05/2020 14.29, Christoph Hellwig wrote:
> Hi Jens,
>
> they series contains various improvement for block I/O accounting. The
> first bunch of patches switch the bio based drivers to better accounting
> helpers compared to the current mess. The end contains a fix and various
> performanc improvements. Most of this comes from a series Konstantin
> sent a few weeks ago, rebased on changes that landed in your tree since
> and my change to always use the percpu version of the disk stats.
>
Thanks for picking this up.
One note about possible further improvement in reply to first patch.
Reviewed-by: Konstantin Khlebnikov <[email protected]>
Remove these now unused functions.
Signed-off-by: Christoph Hellwig <[email protected]>
---
block/bio.c | 39 ---------------------------------------
include/linux/bio.h | 6 ------
2 files changed, 45 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index 9c101a0572ca2..3e89c7b37855a 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1392,45 +1392,6 @@ void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
}
}
-void generic_start_io_acct(struct request_queue *q, int op,
- unsigned long sectors, struct hd_struct *part)
-{
- const int sgrp = op_stat_group(op);
- int rw = op_is_write(op);
-
- part_stat_lock();
-
- update_io_ticks(part, jiffies, false);
- part_stat_inc(part, ios[sgrp]);
- part_stat_add(part, sectors[sgrp], sectors);
- part_stat_local_inc(part, in_flight[rw]);
- if (part->partno)
- part_stat_local_inc(&part_to_disk(part)->part0, in_flight[rw]);
-
- part_stat_unlock();
-}
-EXPORT_SYMBOL(generic_start_io_acct);
-
-void generic_end_io_acct(struct request_queue *q, int req_op,
- struct hd_struct *part, unsigned long start_time)
-{
- unsigned long now = jiffies;
- unsigned long duration = now - start_time;
- const int sgrp = op_stat_group(req_op);
- int rw = op_is_write(req_op);
-
- part_stat_lock();
-
- update_io_ticks(part, now, true);
- part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
- part_stat_local_dec(part, in_flight[rw]);
- if (part->partno)
- part_stat_local_dec(&part_to_disk(part)->part0, in_flight[rw]);
-
- part_stat_unlock();
-}
-EXPORT_SYMBOL(generic_end_io_acct);
-
static inline bool bio_remaining_done(struct bio *bio)
{
/*
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 950c9dc44c4f2..941378ec5b39f 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -444,12 +444,6 @@ void bio_release_pages(struct bio *bio, bool mark_dirty);
extern void bio_set_pages_dirty(struct bio *bio);
extern void bio_check_pages_dirty(struct bio *bio);
-void generic_start_io_acct(struct request_queue *q, int op,
- unsigned long sectors, struct hd_struct *part);
-void generic_end_io_acct(struct request_queue *q, int op,
- struct hd_struct *part,
- unsigned long start_time);
-
extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
struct bio *src, struct bvec_iter *src_iter);
extern void bio_copy_data(struct bio *dst, struct bio *src);
--
2.26.2
Switch rsxx to use the nicer bio accounting helpers.
Signed-off-by: Christoph Hellwig <[email protected]>
---
drivers/lightnvm/pblk-cache.c | 8 +++-----
drivers/lightnvm/pblk-read.c | 11 ++++-------
2 files changed, 7 insertions(+), 12 deletions(-)
diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c
index 5c1034c22197c..f185f1a000083 100644
--- a/drivers/lightnvm/pblk-cache.c
+++ b/drivers/lightnvm/pblk-cache.c
@@ -21,16 +21,14 @@
void pblk_write_to_cache(struct pblk *pblk, struct bio *bio,
unsigned long flags)
{
- struct request_queue *q = pblk->dev->q;
struct pblk_w_ctx w_ctx;
sector_t lba = pblk_get_lba(bio);
- unsigned long start_time = jiffies;
+ unsigned long start_time;
unsigned int bpos, pos;
int nr_entries = pblk_get_secs(bio);
int i, ret;
- generic_start_io_acct(q, REQ_OP_WRITE, bio_sectors(bio),
- &pblk->disk->part0);
+ start_time = bio_start_io_acct(bio);
/* Update the write buffer head (mem) with the entries that we can
* write. The write in itself cannot fail, so there is no need to
@@ -79,7 +77,7 @@ void pblk_write_to_cache(struct pblk *pblk, struct bio *bio,
pblk_rl_inserted(&pblk->rl, nr_entries);
out:
- generic_end_io_acct(q, REQ_OP_WRITE, &pblk->disk->part0, start_time);
+ bio_end_io_acct(bio, start_time);
pblk_write_should_kick(pblk);
if (ret == NVM_IO_DONE)
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index 8efd14e683dc4..140927ebf41e9 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -187,12 +187,11 @@ static void pblk_end_user_read(struct bio *bio, int error)
static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
bool put_line)
{
- struct nvm_tgt_dev *dev = pblk->dev;
struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
struct bio *int_bio = rqd->bio;
unsigned long start_time = r_ctx->start_time;
- generic_end_io_acct(dev->q, REQ_OP_READ, &pblk->disk->part0, start_time);
+ bio_end_io_acct(int_bio, start_time);
if (rqd->error)
pblk_log_read_err(pblk, rqd);
@@ -263,17 +262,15 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio,
void pblk_submit_read(struct pblk *pblk, struct bio *bio)
{
- struct nvm_tgt_dev *dev = pblk->dev;
- struct request_queue *q = dev->q;
sector_t blba = pblk_get_lba(bio);
unsigned int nr_secs = pblk_get_secs(bio);
bool from_cache;
struct pblk_g_ctx *r_ctx;
struct nvm_rq *rqd;
struct bio *int_bio, *split_bio;
+ unsigned long start_time;
- generic_start_io_acct(q, REQ_OP_READ, bio_sectors(bio),
- &pblk->disk->part0);
+ start_time = bio_start_io_acct(bio);
rqd = pblk_alloc_rqd(pblk, PBLK_READ);
@@ -283,7 +280,7 @@ void pblk_submit_read(struct pblk *pblk, struct bio *bio)
rqd->end_io = pblk_end_io_read;
r_ctx = nvm_rq_to_pdu(rqd);
- r_ctx->start_time = jiffies;
+ r_ctx->start_time = start_time;
r_ctx->lba = blba;
if (pblk_alloc_rqd_meta(pblk, rqd)) {
--
2.26.2
From: Konstantin Khlebnikov <[email protected]>
Also rename blk_account_io_merge() into blk_account_io_merge_request() to
distinguish it from merging request and bio.
Signed-off-by: Konstantin Khlebnikov <[email protected]>
[hch: rebased]
Signed-off-by: Christoph Hellwig <[email protected]>
---
block/blk-merge.c | 12 +++++-------
1 file changed, 5 insertions(+), 7 deletions(-)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 6a4538d39efd2..c3beae5c1be71 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -669,18 +669,16 @@ void blk_rq_set_mixed_merge(struct request *rq)
rq->rq_flags |= RQF_MIXED_MERGE;
}
-static void blk_account_io_merge(struct request *req)
+static void blk_account_io_merge_request(struct request *req)
{
if (blk_do_io_stat(req)) {
- struct hd_struct *part;
-
part_stat_lock();
- part = req->part;
-
- hd_struct_put(part);
+ part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
+ hd_struct_put(req->part);
part_stat_unlock();
}
}
+
/*
* Two cases of handling DISCARD merge:
* If max_discard_segments > 1, the driver takes every bio
@@ -792,7 +790,7 @@ static struct request *attempt_merge(struct request_queue *q,
/*
* 'next' is going away, so update stats accordingly
*/
- blk_account_io_merge(next);
+ blk_account_io_merge_request(next);
/*
* ownership of bio passed from next to req, return 'next' for
--
2.26.2
From: Konstantin Khlebnikov <[email protected]>
Most architectures have fast path to access percpu for current cpu.
The required preempt_disable() is provided by part_stat_lock().
Signed-off-by: Konstantin Khlebnikov <[email protected]>
[hch: rebased]
Signed-off-by: Christoph Hellwig <[email protected]>
---
include/linux/part_stat.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
index a6b0938ce82e9..24125778ef3ec 100644
--- a/include/linux/part_stat.h
+++ b/include/linux/part_stat.h
@@ -54,7 +54,7 @@ static inline void part_stat_set_all(struct hd_struct *part, int value)
part_stat_read(part, field[STAT_DISCARD]))
#define __part_stat_add(part, field, addnd) \
- (part_stat_get(part, field) += (addnd))
+ __this_cpu_add((part)->dkstats->field, addnd)
#define part_stat_add(part, field, addnd) do { \
__part_stat_add((part), field, addnd); \
--
2.26.2
On 2020/5/25 19:30, Christoph Hellwig wrote:
> Switch bcache to use the nicer bio accounting helpers, and call the
> routines where we also sample the start time to give coherent accounting
> results.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
Acked-by: Coly Li <[email protected]>
Coly Li
> ---
> drivers/md/bcache/request.c | 18 ++++--------------
> 1 file changed, 4 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
> index 77d1a26975174..22b483527176b 100644
> --- a/drivers/md/bcache/request.c
> +++ b/drivers/md/bcache/request.c
> @@ -668,9 +668,7 @@ static void backing_request_endio(struct bio *bio)
> static void bio_complete(struct search *s)
> {
> if (s->orig_bio) {
> - generic_end_io_acct(s->d->disk->queue, bio_op(s->orig_bio),
> - &s->d->disk->part0, s->start_time);
> -
> + bio_end_io_acct(s->orig_bio, s->start_time);
> trace_bcache_request_end(s->d, s->orig_bio);
> s->orig_bio->bi_status = s->iop.status;
> bio_endio(s->orig_bio);
> @@ -730,7 +728,7 @@ static inline struct search *search_alloc(struct bio *bio,
> s->recoverable = 1;
> s->write = op_is_write(bio_op(bio));
> s->read_dirty_data = 0;
> - s->start_time = jiffies;
> + s->start_time = bio_start_io_acct(bio);
>
> s->iop.c = d->c;
> s->iop.bio = NULL;
> @@ -1082,8 +1080,7 @@ static void detached_dev_end_io(struct bio *bio)
> bio->bi_end_io = ddip->bi_end_io;
> bio->bi_private = ddip->bi_private;
>
> - generic_end_io_acct(ddip->d->disk->queue, bio_op(bio),
> - &ddip->d->disk->part0, ddip->start_time);
> + bio_end_io_acct(bio, ddip->start_time);
>
> if (bio->bi_status) {
> struct cached_dev *dc = container_of(ddip->d,
> @@ -1108,7 +1105,7 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
> */
> ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO);
> ddip->d = d;
> - ddip->start_time = jiffies;
> + ddip->start_time = bio_start_io_acct(bio);
> ddip->bi_end_io = bio->bi_end_io;
> ddip->bi_private = bio->bi_private;
> bio->bi_end_io = detached_dev_end_io;
> @@ -1190,11 +1187,6 @@ blk_qc_t cached_dev_make_request(struct request_queue *q, struct bio *bio)
> }
> }
>
> - generic_start_io_acct(q,
> - bio_op(bio),
> - bio_sectors(bio),
> - &d->disk->part0);
> -
> bio_set_dev(bio, dc->bdev);
> bio->bi_iter.bi_sector += dc->sb.data_offset;
>
> @@ -1311,8 +1303,6 @@ blk_qc_t flash_dev_make_request(struct request_queue *q, struct bio *bio)
> return BLK_QC_T_NONE;
> }
>
> - generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0);
> -
> s = search_alloc(bio, d);
> cl = &s->cl;
> bio = &s->bio.bio;
>