Bio size can grow up to 4GB when multi-page bvec is enabled, but
sometimes this leads to inefficient behavior.
In the case of large chunk direct I/O - a 32MB chunk read in user
space, for example - all pages for the 32MB are merged into a single
bio structure if the pages' physical addresses are contiguous. This
delays submission until the merge completes, so the bio max size
should be limited to a proper size.
When a 32MB chunk read with the direct I/O option comes from userspace,
the current kernel behavior is as follows (timeline):
| bio merge for 32MB. 8,192 pages are merged in total.
| total elapsed time is over 2ms.
|------------------ ... ----------------------->|
                                                | 8,192 pages are merged into a bio.
                                                | at this point, the first bio submit is done.
                                                | the bio is split into 32 read requests and issued.
                                                |--------------->
                                                |--------------->
                                                |--------------->
                                                ......
                                                |--------------->
                                                |--------------->|
19ms in total elapsed to complete the 32MB read from the device. |
If the bio max size is limited to 1MB, the behavior changes as below.
| bio merge for 1MB. 256 pages are merged for each bio.
| 32 bios are made in total.
| total elapsed time is over 2ms. it's the same.
| but the first bio is submitted quickly, after about 100us.
|--->|--->|--->|---> ... -->|--->|--->|--->|--->|
    | 256 pages are merged into a bio.
    | at this point, the first bio submit is done.
    | and 1 read request is issued for that bio.
    |--------------->
    |--------------->
    |--------------->
    ......
    |--------------->
    |--------------->|
17ms in total elapsed to complete the 32MB read from the device. |
As a result, the read request issue timing is earlier when the bio max
size is limited. With the current multipage-bvec behavior, a super
large bio can be created, and that delays the first I/O request issue.
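For reference, this behavior can be reproduced with a single large direct
read from user space, e.g. "dd if=/dev/<dev> of=/dev/null bs=32M count=1
iflag=direct" (the device path is a placeholder).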
Signed-off-by: Changheun Lee <[email protected]>
---
block/bio.c | 17 ++++++++++++++++-
include/linux/bio.h | 4 +++-
include/linux/blkdev.h | 3 +++
3 files changed, 22 insertions(+), 2 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index 1f2cc1fbe283..ec0281889045 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -287,6 +287,21 @@ void bio_init(struct bio *bio, struct bio_vec *table,
}
EXPORT_SYMBOL(bio_init);
+unsigned int bio_max_size(struct bio *bio)
+{
+ struct request_queue *q;
+
+ if (!bio->bi_disk)
+ return UINT_MAX;
+
+ q = bio->bi_disk->queue;
+ if (!blk_queue_limit_bio_size(q))
+ return UINT_MAX;
+
+ return blk_queue_get_max_sectors(q, bio_op(bio)) << SECTOR_SHIFT;
+}
+EXPORT_SYMBOL(bio_max_size);
+
/**
* bio_reset - reinitialize a bio
* @bio: bio to reset
@@ -877,7 +892,7 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page,
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
if (page_is_mergeable(bv, page, len, off, same_page)) {
- if (bio->bi_iter.bi_size > UINT_MAX - len) {
+ if (bio->bi_iter.bi_size > bio_max_size(bio) - len) {
*same_page = false;
return false;
}
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 1edda614f7ce..cdb134ca7bf5 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -100,6 +100,8 @@ static inline void *bio_data(struct bio *bio)
return NULL;
}
+extern unsigned int bio_max_size(struct bio *);
+
/**
* bio_full - check if the bio is full
* @bio: bio to check
@@ -113,7 +115,7 @@ static inline bool bio_full(struct bio *bio, unsigned len)
if (bio->bi_vcnt >= bio->bi_max_vecs)
return true;
- if (bio->bi_iter.bi_size > UINT_MAX - len)
+ if (bio->bi_iter.bi_size > bio_max_size(bio) - len)
return true;
return false;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f94ee3089e01..3aeab9e7e97b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -621,6 +621,7 @@ struct request_queue {
#define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */
#define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */
#define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */
+#define QUEUE_FLAG_LIMIT_BIO_SIZE 30 /* limit bio size */
#define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
(1 << QUEUE_FLAG_SAME_COMP) | \
@@ -667,6 +668,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
#define blk_queue_fua(q) test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags)
#define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags)
#define blk_queue_nowait(q) test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags)
+#define blk_queue_limit_bio_size(q) \
+ test_bit(QUEUE_FLAG_LIMIT_BIO_SIZE, &(q)->queue_flags)
extern void blk_set_pm_only(struct request_queue *q);
extern void blk_clear_pm_only(struct request_queue *q);
--
2.28.0
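For reference, a driver that wants this behavior would opt its queue into
the limit by setting the new flag. A minimal sketch (only
blk_queue_flag_set() and the flag added above are from the kernel and this
patch; the setup function is an illustrative placeholder):

#include <linux/blkdev.h>

/* sketch: opt this device's request queue into the bio size limit */
static void example_setup_queue(struct request_queue *q)
{
	blk_queue_flag_set(QUEUE_FLAG_LIMIT_BIO_SIZE, q);
}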
Add new sysfs node to limit bio size.
Signed-off-by: Changheun Lee <[email protected]>
---
block/blk-sysfs.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b513f1683af0..840d97f427e6 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -288,6 +288,7 @@ QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1);
QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0);
QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0);
QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0);
+QUEUE_SYSFS_BIT_FNS(limit_bio_size, LIMIT_BIO_SIZE, 0);
#undef QUEUE_SYSFS_BIT_FNS
static ssize_t queue_zoned_show(struct request_queue *q, char *page)
@@ -615,6 +616,7 @@ QUEUE_RW_ENTRY(queue_nonrot, "rotational");
QUEUE_RW_ENTRY(queue_iostats, "iostats");
QUEUE_RW_ENTRY(queue_random, "add_random");
QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes");
+QUEUE_RW_ENTRY(queue_limit_bio_size, "limit_bio_size");
static struct attribute *queue_attrs[] = {
&queue_requests_entry.attr,
@@ -648,6 +650,7 @@ static struct attribute *queue_attrs[] = {
&queue_rq_affinity_entry.attr,
&queue_iostats_entry.attr,
&queue_stable_writes_entry.attr,
+ &queue_limit_bio_size_entry.attr,
&queue_random_entry.attr,
&queue_poll_entry.attr,
&queue_wc_entry.attr,
--
2.28.0
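With both patches applied, the limit can be toggled per device from user
space: writing 1 to /sys/block/<dev>/queue/limit_bio_size enables the
limit, and writing 0 restores the UINT_MAX cap (the device name is a
placeholder).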
On 2021/01/26 12:58, Ming Lei wrote:
> On Tue, Jan 26, 2021 at 10:32:34AM +0900, Changheun Lee wrote:
>> [...]
>> @@ -287,6 +287,21 @@ void bio_init(struct bio *bio, struct bio_vec *table,
>> }
>> EXPORT_SYMBOL(bio_init);
>>
>> +unsigned int bio_max_size(struct bio *bio)
>> +{
>> + struct request_queue *q;
>> +
>> + if (!bio->bi_disk)
>> + return UINT_MAX;
>> +
>> + q = bio->bi_disk->queue;
>> + if (!blk_queue_limit_bio_size(q))
>> + return UINT_MAX;
>> +
>> + return blk_queue_get_max_sectors(q, bio_op(bio)) << SECTOR_SHIFT;
>> +}
>> +EXPORT_SYMBOL(bio_max_size);
>> +
>> /**
>> * bio_reset - reinitialize a bio
>> * @bio: bio to reset
>> @@ -877,7 +892,7 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page,
>> struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
>>
>> if (page_is_mergeable(bv, page, len, off, same_page)) {
>> - if (bio->bi_iter.bi_size > UINT_MAX - len) {
>> + if (bio->bi_iter.bi_size > bio_max_size(bio) - len) {
>> *same_page = false;
>> return false;
>> }
>
> So far we don't need bio->bi_disk or bio->bi_bdev (will be changed in
> Christoph's patch) when adding a page to a bio, so there is a null
> pointer dereference risk.
>
>> [...]
>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>> index f94ee3089e01..3aeab9e7e97b 100644
>> --- a/include/linux/blkdev.h
>> +++ b/include/linux/blkdev.h
>> @@ -621,6 +621,7 @@ struct request_queue {
>> #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */
>> #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */
>> #define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */
>> +#define QUEUE_FLAG_LIMIT_BIO_SIZE 30 /* limit bio size */
>>
>> #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
>> (1 << QUEUE_FLAG_SAME_COMP) | \
>> @@ -667,6 +668,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
>> #define blk_queue_fua(q) test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags)
>> #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags)
>> #define blk_queue_nowait(q) test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags)
>> +#define blk_queue_limit_bio_size(q) \
>> + test_bit(QUEUE_FLAG_LIMIT_BIO_SIZE, &(q)->queue_flags)
>
> I don't think it is a good idea to add a queue flag for this purpose,
> since this case just needs to limit the bio size so as not to delay bio
> submission too much, which is a logical thing, nothing to do with the
> request queue.
>
> Just wondering why you don't take the following way:
>
>
> diff --git a/block/bio.c b/block/bio.c
> index 99040a7e6656..35852f7f47d4 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1081,7 +1081,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
> * It's intended for direct IO, so doesn't do PSI tracking, the caller is
> * responsible for setting BIO_WORKINGSET if necessary.
> */
> -int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
> +int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, bool sync)
> {
> int ret = 0;
>
> @@ -1092,12 +1092,20 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
> bio_set_flag(bio, BIO_NO_PAGE_REF);
> return 0;
> } else {
> + /*
> + * Don't add too many pages in case of sync dio for
> + * avoiding delay bio submission too much especially
> + * pinning user pages in memory isn't cheap.
> + */
> + const unsigned int max_size = sync ? (1U << 12) : UINT_MAX;
4KB max bio size ? That is a little small :)
In any case, I am not a fan of using an arbitrary value not related to the
actual device characteristics. Wouldn't it be better to use the device
max_sectors limit ? And that limit would need to be zone_append_max_sectors for
zone append writes. So some helper like Changheun's bio_max_size() may be useful.
Apart from this point, I like your approach.
> +
> do {
> if (bio_op(bio) == REQ_OP_ZONE_APPEND)
> ret = __bio_iov_append_get_pages(bio, iter);
> else
> ret = __bio_iov_iter_get_pages(bio, iter);
> - } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
> + } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0) &&
> + bio->bi_iter.bi_size < max_size);
> }
>
> /* don't account direct I/O as memory stall */
> [...]
>
> Thanks,
> Ming
>
>
--
Damien Le Moal
Western Digital Research
On Tue, Jan 26, 2021 at 10:32:34AM +0900, Changheun Lee wrote:
> [...]
So far we don't need bio->bi_disk or bio->bi_bdev (will be changed in
Christoph's patch) when adding a page to a bio, so there is a null
pointer dereference risk.
> [...]
I don't think it is a good idea to add a queue flag for this purpose,
since this case just needs to limit the bio size so as not to delay bio
submission too much, which is a logical thing, nothing to do with the
request queue.

Just wondering why you don't take the following way:
diff --git a/block/bio.c b/block/bio.c
index 99040a7e6656..35852f7f47d4 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1081,7 +1081,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
* It's intended for direct IO, so doesn't do PSI tracking, the caller is
* responsible for setting BIO_WORKINGSET if necessary.
*/
-int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
+int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, bool sync)
{
int ret = 0;
@@ -1092,12 +1092,20 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
bio_set_flag(bio, BIO_NO_PAGE_REF);
return 0;
} else {
+ /*
+ * Don't add too many pages in case of sync dio for
+ * avoiding delay bio submission too much especially
+ * pinning user pages in memory isn't cheap.
+ */
+ const unsigned int max_size = sync ? (1U << 12) : UINT_MAX;
+
do {
if (bio_op(bio) == REQ_OP_ZONE_APPEND)
ret = __bio_iov_append_get_pages(bio, iter);
else
ret = __bio_iov_iter_get_pages(bio, iter);
- } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
+ } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0) &&
+ bio->bi_iter.bi_size < max_size);
}
/* don't account direct I/O as memory stall */
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6f5bd9950baf..0d1d436aca17 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -246,7 +246,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
bio.bi_end_io = blkdev_bio_end_io_simple;
bio.bi_ioprio = iocb->ki_ioprio;
- ret = bio_iov_iter_get_pages(&bio, iter);
+ ret = bio_iov_iter_get_pages(&bio, iter, true);
if (unlikely(ret))
goto out;
ret = bio.bi_iter.bi_size;
@@ -397,7 +397,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
bio->bi_end_io = blkdev_bio_end_io;
bio->bi_ioprio = iocb->ki_ioprio;
- ret = bio_iov_iter_get_pages(bio, iter);
+ ret = bio_iov_iter_get_pages(bio, iter, is_sync);
if (unlikely(ret)) {
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index ea1e8f696076..5105982a9bf8 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -277,7 +277,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
- ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
+ ret = bio_iov_iter_get_pages(bio, dio->submit.iter,
+ is_sync_kiocb(dio->iocb));
if (unlikely(ret)) {
/*
* We have to stop part way through an IO. We must fall
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index bec47f2d074b..c95ac37f9305 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -690,7 +690,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
if (iocb->ki_flags & IOCB_DSYNC)
bio->bi_opf |= REQ_FUA;
- ret = bio_iov_iter_get_pages(bio, from);
+ ret = bio_iov_iter_get_pages(bio, from, is_sync_kiocb(iocb));
if (unlikely(ret))
goto out_release;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 676870b2c88d..fa3a503b955c 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -472,7 +472,7 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off, bool *same_page);
void __bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off);
-int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
+int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, bool sync);
void bio_release_pages(struct bio *bio, bool mark_dirty);
extern void bio_set_pages_dirty(struct bio *bio);
extern void bio_check_pages_dirty(struct bio *bio);
Thanks,
Ming
On Tue, Jan 26, 2021 at 04:06:06AM +0000, Damien Le Moal wrote:
> On 2021/01/26 12:58, Ming Lei wrote:
> > On Tue, Jan 26, 2021 at 10:32:34AM +0900, Changheun Lee wrote:
> >> [...]
> > } else {
> > + /*
> > + * Don't add too many pages in case of sync dio for
> > + * avoiding delay bio submission too much especially
> > + * pinning user pages in memory isn't cheap.
> > + */
> > + const unsigned int max_size = sync ? (1U << 12) : UINT_MAX;
>
> 4KB max bio size ? That is a little small :)
It should have been (1U << 20), :-(
> In any case, I am not a fan of using an arbitrary value not related to the
> actual device characteristics. Wouldn't it be better to use the device
> max_sectors limit ? And that limit would need to be zone_append_max_sectors for
> zone append writes. So some helper like Changheun's bio_max_size() may be useful.
Firstly, bio->bi_disk may not be initialized when adding a page to the bio;
secondly, this limit isn't really related to the device, is it? Also, if it
is a queue limit, it has to be stacked.
Thanks,
Ming
On 2021/01/26 15:07, Ming Lei wrote:
> On Tue, Jan 26, 2021 at 04:06:06AM +0000, Damien Le Moal wrote:
>> On 2021/01/26 12:58, Ming Lei wrote:
>>> On Tue, Jan 26, 2021 at 10:32:34AM +0900, Changheun Lee wrote:
>>>> [...]
>>> } else {
>>> + /*
>>> + * Don't add too many pages in case of sync dio for
>>> + * avoiding delay bio submission too much especially
>>> + * pinning user pages in memory isn't cheap.
>>> + */
>>> + const unsigned int max_size = sync ? (1U << 12) : UINT_MAX;
>>
>> 4KB max bio size ? That is a little small :)
>
> It should have been (1U << 20), :-(
Sounds better !
>
>> In any case, I am not a fan of using an arbitrary value not related to the
>> actual device characteristics. Wouldn't it be better to use the device
>> max_sectors limit ? And that limit would need to be zone_append_max_sectors for
>> zone append writes. So some helper like Changheun's bio_max_size() may be useful.
>
> Firstly, bio->bi_disk may not be initialized when adding a page to the bio;
> secondly, this limit isn't really related to the device, is it? Also, if it
> is a queue limit, it has to be stacked.
1MB can be used as a fallback default if the gendisk is not yet set. If it is
set, using a queue limit that does not cause bio splitting after submit makes
the most sense, as that avoids useless overhead. I agree, it is not critical,
but it would be nice to have some number that causes less splitting than the
arbitrary 1MB. E.g., most HDDs will likely have that 1MB BIO split... And
max_sectors is a stacked queue limit, no? We could use max_hw_sectors too, I
think.
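A minimal sketch of that combination, assuming a fixed 1MB fallback while
the gendisk is not set and the stacked max_sectors queue limit otherwise
(this only illustrates the suggestion, it is not the posted patch):

#include <linux/blkdev.h>
#include <linux/sizes.h>

/* sketch: cap the bio size by the queue's (stacked) max_sectors limit,
 * falling back to a fixed 1MB while no gendisk is attached yet; zone
 * append writes would need queue_max_zone_append_sectors() instead */
static unsigned int bio_max_size(struct bio *bio)
{
	if (!bio->bi_disk)
		return SZ_1M;

	return queue_max_sectors(bio->bi_disk->queue) << SECTOR_SHIFT;
}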
--
Damien Le Moal
Western Digital Research
On 2021/01/26 18:37, Changheun Lee wrote:
> [...]
> +++ b/block/bio.c
> @@ -287,6 +287,21 @@ void bio_init(struct bio *bio, struct bio_vec *table,
> }
> EXPORT_SYMBOL(bio_init);
>
> +unsigned int bio_max_size(struct bio *bio)
> +{
> + struct request_queue *q;
> +
> + if (!bio->bi_disk)
> + return UINT_MAX;
> +
> + q = bio->bi_disk->queue;
> + if (!blk_queue_limit_bio_size(q))
> + return UINT_MAX;
> +
> + return blk_queue_get_max_sectors(q, bio_op(bio)) << SECTOR_SHIFT;
> +}
> +EXPORT_SYMBOL(bio_max_size);
Why export this ? This is only used in this file and in bio.h in bio_full().
> [...]
> +extern unsigned int bio_max_size(struct bio *);
No need for extern.
> [...]
--
Damien Le Moal
Western Digital Research
On Tue, Jan 26, 2021 at 06:26:02AM +0000, Damien Le Moal wrote:
> On 2021/01/26 15:07, Ming Lei wrote:
> > On Tue, Jan 26, 2021 at 04:06:06AM +0000, Damien Le Moal wrote:
> >> On 2021/01/26 12:58, Ming Lei wrote:
> >>> On Tue, Jan 26, 2021 at 10:32:34AM +0900, Changheun Lee wrote:
> [...]
> > Firstly, bio->bi_disk may not be initialized when adding a page to the bio;
> > secondly, this limit isn't really related to the device, is it? Also, if it
> > is a queue limit, it has to be stacked.
>
> 1MB can be used as a fallback default if the gendisk is not yet set. If it is
IMO, only sync dio on a slow machine needs such a limit, because pinning
userspace pages in memory may take a while; so far I haven't seen other
workloads that need this limit. Even today I get queries from clients about
why a 4MB user space IO isn't converted into a 4MB bio; some workloads need
big IOs.
> set, using a queue limit that does not cause bio splitting after submit makes
> the most sense, as that avoids useless overhead. I agree, it is not critical,
> but it would be nice to have some number that causes less splitting than the
> arbitrary 1MB.
That is another story. Each fs bio needs two allocations (one fixed-length
bio allocation, and a variable-length bvec table), while bio splitting just
needs a single fixed-length bio allocation. So if the source bio (the fs bio
holding the data) becomes smaller, splitting may happen less often, but more
fs bio and bvec table allocations may be involved; I am not sure this way
always gets better performance.

Also, in theory, bio splitting may not need to allocate a whole new bio;
what matters is just the actual position/size info of the to-be-split bio.
> E.g., most HDDs will likely have that 1MB BIO split... And max_sectors is a
> stacked queue limit, no? We could use max_hw_sectors too, I think.
bio_add_page() is really a fast path, and checking the queue limit there may
hurt performance, because a queue_limits reference is added to the fast path.
Thanks,
Ming
On Tue, Jan 26, 2021 at 11:57:48AM +0800, Ming Lei wrote:
> > *same_page = false;
> > return false;
> > }
>
> So far we don't need bio->bi_disk or bio->bi_bdev (will be changed in
> Christoph's patch) when adding a page to a bio, so there is a null
> pointer dereference risk.
I've started resurrecting my old plan to always have a valid device
in the bio from allocation time on. It is pretty invasive, but
probably worth it. I'm sending out the first prep series right now;
the actual changes should be ready for an RFC later this week.
> On Tue, Jan 26, 2021 at 06:26:02AM +0000, Damien Le Moal wrote:
> > On 2021/01/26 15:07, Ming Lei wrote:
> > > On Tue, Jan 26, 2021 at 04:06:06AM +0000, Damien Le Moal wrote:
> > >> On 2021/01/26 12:58, Ming Lei wrote:
> > >>> On Tue, Jan 26, 2021 at 10:32:34AM +0900, Changheun Lee wrote:
> > >>>> bio size can grow up to 4GB when muli-page bvec is enabled.
> > >>>> but sometimes it would lead to inefficient behaviors.
> > >>>> in case of large chunk direct I/O, - 32MB chunk read in user space -
> > >>>> all pages for 32MB would be merged to a bio structure if the pages
> > >>>> physical addresses are contiguous. it makes some delay to submit
> > >>>> until merge complete. bio max size should be limited to a proper size.
> > >>>>
> > >>>> When 32MB chunk read with direct I/O option is coming from userspace,
> > >>>> kernel behavior is below now. it's timeline.
> > >>>>
> > >>>> | bio merge for 32MB. total 8,192 pages are merged.
> > >>>> | total elapsed time is over 2ms.
> > >>>> |------------------ ... ----------------------->|
> > >>>> | 8,192 pages merged a bio.
> > >>>> | at this time, first bio submit is done.
> > >>>> | 1 bio is split to 32 read request and issue.
> > >>>> |--------------->
> > >>>> |--------------->
> > >>>> |--------------->
> > >>>> ......
> > >>>> |--------------->
> > >>>> |--------------->|
> > >>>> total 19ms elapsed to complete 32MB read done from device. |
> > >>>>
> > >>>> If bio max size is limited with 1MB, behavior is changed below.
> > >>>>
> > >>>> | bio merge for 1MB. 256 pages are merged for each bio.
> > >>>> | total 32 bio will be made.
> > >>>> | total elapsed time is over 2ms. it's same.
> > >>>> | but, first bio submit timing is fast. about 100us.
> > >>>> |--->|--->|--->|---> ... -->|--->|--->|--->|--->|
> > >>>> | 256 pages merged a bio.
> > >>>> | at this time, first bio submit is done.
> > >>>> | and 1 read request is issued for 1 bio.
> > >>>> |--------------->
> > >>>> |--------------->
> > >>>> |--------------->
> > >>>> ......
> > >>>> |--------------->
> > >>>> |--------------->|
> > >>>> total 17ms elapsed to complete 32MB read done from device. |
> > >>>>
> > >>>> As a result, read request issue timing is faster if bio max size is limited.
> > >>>> Current kernel behavior with multipage bvec, super large bio can be created.
> > >>>> And it lead to delay first I/O request issue.
> > >>>>
> > >>>> Signed-off-by: Changheun Lee <[email protected]>
> > >>>> ---
> > >>>> block/bio.c | 17 ++++++++++++++++-
> > >>>> include/linux/bio.h | 4 +++-
> > >>>> include/linux/blkdev.h | 3 +++
> > >>>> 3 files changed, 22 insertions(+), 2 deletions(-)
> > >>>>
> > >>>> diff --git a/block/bio.c b/block/bio.c
> > >>>> index 1f2cc1fbe283..ec0281889045 100644
> > >>>> --- a/block/bio.c
> > >>>> +++ b/block/bio.c
> > >>>> @@ -287,6 +287,21 @@ void bio_init(struct bio *bio, struct bio_vec *table,
> > >>>> }
> > >>>> EXPORT_SYMBOL(bio_init);
> > >>>>
> > >>>> +unsigned int bio_max_size(struct bio *bio)
> > >>>> +{
> > >>>> + struct request_queue *q;
> > >>>> +
> > >>>> + if (!bio->bi_disk)
> > >>>> + return UINT_MAX;
> > >>>> +
> > >>>> + q = bio->bi_disk->queue;
> > >>>> + if (!blk_queue_limit_bio_size(q))
> > >>>> + return UINT_MAX;
> > >>>> +
> > >>>> + return blk_queue_get_max_sectors(q, bio_op(bio)) << SECTOR_SHIFT;
> > >>>> +}
> > >>>> +EXPORT_SYMBOL(bio_max_size);
> > >>>> +
> > >>>> /**
> > >>>> * bio_reset - reinitialize a bio
> > >>>> * @bio: bio to reset
> > >>>> @@ -877,7 +892,7 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page,
> > >>>> struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
> > >>>>
> > >>>> if (page_is_mergeable(bv, page, len, off, same_page)) {
> > >>>> - if (bio->bi_iter.bi_size > UINT_MAX - len) {
> > >>>> + if (bio->bi_iter.bi_size > bio_max_size(bio) - len) {
> > >>>> *same_page = false;
> > >>>> return false;
> > >>>> }
> > >>>
> > >>> So far we don't need bio->bi_disk or bio->bi_bdev (which will be changed in
> > >>> Christoph's patch) while adding pages to a bio, so there is a null pointer
> > >>> dereference risk.
> > >>>
> > >>>> [ ... bio.h hunks trimmed; unchanged from the original posting ... ]
> > >>>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> > >>>> index f94ee3089e01..3aeab9e7e97b 100644
> > >>>> --- a/include/linux/blkdev.h
> > >>>> +++ b/include/linux/blkdev.h
> > >>>> @@ -621,6 +621,7 @@ struct request_queue {
> > >>>> #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */
> > >>>> #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */
> > >>>> #define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */
> > >>>> +#define QUEUE_FLAG_LIMIT_BIO_SIZE 30 /* limit bio size */
> > >>>>
> > >>>> #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
> > >>>> (1 << QUEUE_FLAG_SAME_COMP) | \
> > >>>> @@ -667,6 +668,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
> > >>>> #define blk_queue_fua(q) test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags)
> > >>>> #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags)
> > >>>> #define blk_queue_nowait(q) test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags)
> > >>>> +#define blk_queue_limit_bio_size(q) \
> > >>>> + test_bit(QUEUE_FLAG_LIMIT_BIO_SIZE, &(q)->queue_flags)
> > >>>
> > >>> I don't think adding a queue flag for this purpose is a good idea,
> > >>> since this case just needs to limit bio size so that bio submission isn't
> > >>> delayed too much. That is a logical concern; it has nothing to do with the request queue.
> > >>>
> > >>> Just wondering why you don't do it the following way:
> > >>>
> > >>>
> > >>> diff --git a/block/bio.c b/block/bio.c
> > >>> index 99040a7e6656..35852f7f47d4 100644
> > >>> --- a/block/bio.c
> > >>> +++ b/block/bio.c
> > >>> @@ -1081,7 +1081,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
> > >>> * It's intended for direct IO, so doesn't do PSI tracking, the caller is
> > >>> * responsible for setting BIO_WORKINGSET if necessary.
> > >>> */
> > >>> -int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
> > >>> +int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, bool sync)
> > >>> {
> > >>> int ret = 0;
> > >>>
> > >>> @@ -1092,12 +1092,20 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
> > >>> bio_set_flag(bio, BIO_NO_PAGE_REF);
> > >>> return 0;
> > >>> } else {
> > >>> + /*
> > >>> + * Don't add too many pages for sync dio, to avoid
> > >>> + * delaying bio submission too much, especially since
> > >>> + * pinning user pages in memory isn't cheap.
> > >>> + */
> > >>> + const unsigned int max_size = sync ? (1U << 12) : UINT_MAX;
> > >>
> > >> A 4KB max bio size? That is a little small :)
> > >
> > > It should have been (1U << 20), :-(
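
(With that correction, the limit line in the sketch above would read:)

	/* corrected cap: 1 MiB for sync dio, not the typo'd 4 KiB */
	const unsigned int max_size = sync ? (1U << 20) : UINT_MAX;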
> >
> > Sounds better!
> >
> > >
> > >> In any case, I am not a fan of using an arbitrary value not related to the
> > >> actual device characteristics. Wouldn't it be better to use the device's
> > >> max_sectors limit? And that limit would need to be zone_append_max_sectors for
> > >> zone append writes. So some helper like Changheun's bio_max_size() may be useful.
> > >
> > > Firstly, bio->bi_disk may not be initialized when adding a page to a bio; secondly, this
> > > limit isn't really related to the device, is it? Also, if it is a queue limit, it has
> > > to be stacked.
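
(For illustration, if the cap were made a real queue limit, stacking drivers
would have to combine it the way blk_stack_limits() combines existing limits.
A hypothetical sketch, assuming a max_bio_size field that does not exist today:)

/* Hypothetical: stacking a max_bio_size queue limit in the style of
 * blk_stack_limits(); no such field exists in this patch. */
static void stack_max_bio_size(struct queue_limits *t,
			       struct queue_limits *b)
{
	t->max_bio_size = min(t->max_bio_size, b->max_bio_size);
}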
> >
> > 1MB can be used as a fallback default if the gendisk is not yet set. If it is,
>
> IMO, only sync dio on a slow machine needs such a limit, because pinning userspace
> pages in memory may take a while; so far I haven't seen other workloads that need this limit.
>
> Even today I get queries from clients about why a 4MB user space IO isn't converted to
> a 4MB bio; some workloads need big IOs.
>
IMO, your solution is good, but I think it's a scenario-specific solution.
The current approach may be better at removing the bio submission delay in general.
And it is an option that can be enabled at runtime via the queue flag; the default is
no limit on bio size (see the sketch below). I think this solution affects your work very little.
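
A minimal sketch of that runtime opt-in, using the flag from this patch (the
driver hook name is made up for illustration):

/* Sketch: a driver opts its queue into the bio size cap at setup time;
 * queues that never set the flag keep the UINT_MAX behavior. */
static void example_driver_init_queue(struct request_queue *q)
{
	blk_queue_flag_set(QUEUE_FLAG_LIMIT_BIO_SIZE, q);
}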
> > using a queue limit that does not cause bio splitting after submit makes the most
> > sense, as that avoids useless overhead. I agree it is not critical, but it would
> > be nice to have some number that causes less splitting than the arbitrary 1MB.
>
> That is another story. Each fs bio needs two allocations (one fixed-length
> bio allocation, plus a variable-length bvec table), whereas bio splitting just
> needs a single fixed-length bio allocation. So if the source bio (the fs bio)
> holding the data becomes smaller, there may be less splitting, but more fs bio and
> bvec table allocations may be involved; I'm not sure this way always performs better.
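
To make the two-allocation point concrete, a simplified sketch of what building
an fs bio with an external bvec table involves (loosely modeled on the bioset
allocators, with the mempool details elided):

/* Simplified sketch: an fs bio carrying many pages needs both the
 * fixed-size struct bio and a variable-size bvec table, while a split
 * only needs the fixed-size part. */
static struct bio *alloc_fs_bio_sketch(unsigned short nr_vecs, gfp_t gfp)
{
	struct bio *bio = kmalloc(sizeof(*bio), gfp);		/* allocation 1 */
	struct bio_vec *bvl;

	if (!bio)
		return NULL;
	bvl = kmalloc_array(nr_vecs, sizeof(*bvl), gfp);	/* allocation 2 */
	if (!bvl) {
		kfree(bio);
		return NULL;
	}
	bio_init(bio, bvl, nr_vecs);
	return bio;
}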
>
> Also, in theory, bio splitting may not need to allocate a whole new bio at all;
> what matters is just the actual position/size info of the
> to-be-split bio.
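
Purely to illustrate that idea, a split could in principle be described by a
small record instead of a second bio (hypothetical structure, not kernel code):

/* Hypothetical: just the position/size bookkeeping a split needs. */
struct bio_split_desc {
	struct bio	*parent;	/* bio holding the real bvecs */
	sector_t	sector;		/* start sector of the fragment */
	unsigned int	size;		/* fragment length in bytes */
};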
>
> > E.g., most HDDs will likely have that 1MB BIO split... And max_sectors is a
> > stacked queue limit, no? We could use max_hw_sectors too, I think.
>
> bio_add_page() is really a fast path, and checking a queue limit there may
> hurt performance, because it adds a queue_limits reference to the fast path.
>
Your concern is a good point; I don't like it either. But I think it'll be better
than adding a new variable to the bio structure.
Actually, adding a new check condition is not good in itself, but the cost of a queue
flag test is smaller than the many condition checks in page_is_mergeable(). :)
>
>
> Thanks,
> Ming
---
Changheun Lee
Samsung Electronics
> On 2021/01/26 18:37, Changheun Lee wrote:
> > [ ... commit message and diffstat trimmed; identical to the original posting ... ]
> >
> > diff --git a/block/bio.c b/block/bio.c
> > index 1f2cc1fbe283..ec0281889045 100644
> > --- a/block/bio.c
> > +++ b/block/bio.c
> > @@ -287,6 +287,21 @@ void bio_init(struct bio *bio, struct bio_vec *table,
> > }
> > EXPORT_SYMBOL(bio_init);
> >
> > +unsigned int bio_max_size(struct bio *bio)
> > +{
> > + struct request_queue *q;
> > +
> > + if (!bio->bi_disk)
> > + return UINT_MAX;
> > +
> > + q = bio->bi_disk->queue;
> > + if (!blk_queue_limit_bio_size(q))
> > + return UINT_MAX;
> > +
> > + return blk_queue_get_max_sectors(q, bio_op(bio)) << SECTOR_SHIFT;
> > +}
> > +EXPORT_SYMBOL(bio_max_size);
>
> Why export this? This is only used in this file and in bio.h in bio_full().
OK. I'll remove it.
>
> > +
> > /**
> > * bio_reset - reinitialize a bio
> > * @bio: bio to reset
> > @@ -877,7 +892,7 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page,
> > struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
> >
> > if (page_is_mergeable(bv, page, len, off, same_page)) {
> > - if (bio->bi_iter.bi_size > UINT_MAX - len) {
> > + if (bio->bi_iter.bi_size > bio_max_size(bio) - len) {
> > *same_page = false;
> > return false;
> > }
> > diff --git a/include/linux/bio.h b/include/linux/bio.h
> > index 1edda614f7ce..cdb134ca7bf5 100644
> > --- a/include/linux/bio.h
> > +++ b/include/linux/bio.h
> > @@ -100,6 +100,8 @@ static inline void *bio_data(struct bio *bio)
> > return NULL;
> > }
> >
> > +extern unsigned int bio_max_size(struct bio *);
>
> No need for extern.
It's just to avoid a compile warning in my test environment.
I'll remove it too. But I think a compile warning could then appear in other
.c files which include bio.h. Is that OK?
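
(For reference, the extern keyword is implicit for function declarations, so the
bare prototype is enough; every .c file that includes bio.h will see the
declaration and no warning should be emitted. A minimal sketch:)

/* In include/linux/bio.h: a plain prototype is sufficient; "extern"
 * adds nothing for function declarations. */
unsigned int bio_max_size(struct bio *bio);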
>
> > [ ... remaining quoted patch trimmed ... ]
>
> --
> Damien Le Moal
> Western Digital Research
--
Changheun Lee
Samsung Electronics