2022-12-18 11:41:04

by hanjinke

Subject: [PATCH] blk-throtl: Introduce sync and async queues for blk-throtl

From: Jinke Han <[email protected]>

Now we don't distinguish sync write ios from normal buffered write ios
in blk-throtl. A bio tagged with REQ_SYNC always means it will be waited
on until write completion soon after it is submitted, so it's reasonable
for sync io to complete as soon as possible.

In our test, fio writes a 100g file sequentially with a 4k block size
in a container with a low bps limit configured (wbps=10M). More than
1200 ios were throttled in the blk-throtl queue and the average throttle
time of each io was 140s. At the same time, the operation of saving a
small file with vim is blocked for almost 140s. As vim sends an fsync,
the sync ios of the fsync are blocked behind a huge amount of buffered
write ios queued ahead of them. This is also a priority inversion
problem within one cgroup. In database scenarios, things get really bad
with blk-throttle enabled, as fsync is called very often.
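
For concreteness, the blocked path is essentially a tiny buffered write
followed by an fsync. A minimal stand-in for the vim save, with a
hypothetical mount point (not part of the patch, only to illustrate what
is being measured):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	struct timespec t0, t1;
	/* hypothetical file on the wbps-limited filesystem */
	int fd = open("/mnt/test/smallfile", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd < 0)
		return 1;
	memset(buf, 'x', sizeof(buf));

	clock_gettime(CLOCK_MONOTONIC, &t0);
	if (write(fd, buf, sizeof(buf)) != (ssize_t)sizeof(buf))
		return 1;
	fsync(fd);	/* the sync writeback that queues behind the buffered flood */
	clock_gettime(CLOCK_MONOTONIC, &t1);

	printf("write+fsync took %.3f s\n",
	       (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9);
	close(fd);
	return 0;
}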

This patch splits the bio queue into sync and async queues for
blk-throtl and gives sync write ios much higher priority. The sync queue
only makes sense for write ios, as we treat all read ios as sync ios; I
think this respects the semantics of REQ_SYNC nicely. Bios with REQ_META
and REQ_PRIO gain the same priority, as they are important to the fs.
This may avoid some potential priority inversion problems.

Signed-off-by: Jinke Han <[email protected]>
Signed-off-by: Xin Yin <[email protected]>
---
block/blk-throttle.c | 108 +++++++++++++++++++++++++++++++++++++++----
block/blk-throttle.h | 12 ++++-
2 files changed, 111 insertions(+), 9 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 847721dc2b2b..57acfae9f820 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -21,6 +21,13 @@
/* Total max dispatch from all groups in one round */
#define THROTL_QUANTUM 32

+/* For write ios, dispatch 4 sync ios and 1 normal io in one loop */
+#define THROTL_SYNC_FACTOR 4
+
+/* Only make sense for write ios, all read ios are treated as SYNC */
+#define SYNC 0
+#define ASYNC 1
+
/* Throttling is performed over a slice and after that slice is renewed */
#define DFL_THROTL_SLICE_HD (HZ / 10)
#define DFL_THROTL_SLICE_SSD (HZ / 50)
@@ -241,11 +248,27 @@ static inline unsigned int throtl_bio_data_size(struct bio *bio)
return bio->bi_iter.bi_size;
}

-static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
+static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg,
+ bool rw)
{
INIT_LIST_HEAD(&qn->node);
- bio_list_init(&qn->bios);
+ bio_list_init(&qn->bios[SYNC]);
+ bio_list_init(&qn->bios[ASYNC]);
qn->tg = tg;
+ qn->dispatch_sync_cnt = (rw == READ) ? UINT_MAX : 0;
+}
+
+#define BLK_THROTL_SYNC(bio) (bio->bi_opf & (REQ_SYNC | REQ_META | REQ_PRIO))
+
+static inline void throtl_qnode_add_bio_list(struct throtl_qnode *qn,
+ struct bio *bio)
+{
+ bool rw = bio_data_dir(bio);
+
+ if ((rw == READ) || BLK_THROTL_SYNC(bio))
+ bio_list_add(&qn->bios[SYNC], bio);
+ else
+ bio_list_add(&qn->bios[ASYNC], bio);
}

/**
@@ -261,13 +284,34 @@ static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
struct list_head *queued)
{
- bio_list_add(&qn->bios, bio);
+ throtl_qnode_add_bio_list(qn, bio);
if (list_empty(&qn->node)) {
list_add_tail(&qn->node, queued);
blkg_get(tg_to_blkg(qn->tg));
}
}

+static inline struct bio *throtl_qnode_bio_peek(struct throtl_qnode *qn)
+{
+ struct bio *bio1, *bio2;
+
+ /* qn for read ios */
+ if (qn->dispatch_sync_cnt == UINT_MAX)
+ return bio_list_peek(&qn->bios[SYNC]);
+
+ /* qn for write ios */
+ bio1 = bio_list_peek(&qn->bios[SYNC]);
+ bio2 = bio_list_peek(&qn->bios[ASYNC]);
+
+ if (bio1 && bio2) {
+ if (qn->dispatch_sync_cnt == THROTL_SYNC_FACTOR)
+ return bio2;
+ return bio1;
+ }
+
+ return bio1 ?: bio2;
+}
+
/**
* throtl_peek_queued - peek the first bio on a qnode list
* @queued: the qnode list to peek
@@ -281,11 +325,59 @@ static struct bio *throtl_peek_queued(struct list_head *queued)
return NULL;

qn = list_first_entry(queued, struct throtl_qnode, node);
- bio = bio_list_peek(&qn->bios);
+ bio = throtl_qnode_bio_peek(qn);
WARN_ON_ONCE(!bio);
return bio;
}

+/**
+ * throtl_qnode_bio_pop: pop a bio from a qnode
+ * @qn: the qnode to pop a bio from
+ *
+ * For a read io qn, just pop a bio from the sync queue and return it.
+ * For a write io qn, the target queue to pop from is determined by dispatch_sync_cnt.
+ * Try to pop a bio from the target queue and return it when that queue is not empty.
+ * If the target queue is empty, pop a bio from the other queue instead.
+ */
+static inline struct bio *throtl_qnode_bio_pop(struct throtl_qnode *qn)
+{
+ struct bio *bio;
+
+ /* qn for read ios */
+ if (qn->dispatch_sync_cnt == UINT_MAX)
+ return bio_list_pop(&qn->bios[SYNC]);
+
+ /* try to dispatch sync io */
+ if (qn->dispatch_sync_cnt < THROTL_SYNC_FACTOR) {
+ bio = bio_list_pop(&qn->bios[SYNC]);
+ if (bio) {
+ qn->dispatch_sync_cnt++;
+ return bio;
+ }
+ bio = bio_list_pop(&qn->bios[ASYNC]);
+ qn->dispatch_sync_cnt = 0;
+ return bio;
+ }
+
+ /* try to dispatch async io */
+ bio = bio_list_pop(&qn->bios[ASYNC]);
+ if (bio) {
+ qn->dispatch_sync_cnt = 0;
+ return bio;
+ }
+ bio = bio_list_pop(&qn->bios[SYNC]);
+
+ return bio;
+}
+
+static inline bool throtl_qnode_empty(struct throtl_qnode *qn)
+{
+ if (bio_list_empty(&qn->bios[SYNC]) &&
+ bio_list_empty(&qn->bios[ASYNC]))
+ return true;
+ return false;
+}
+
/**
* throtl_pop_queued - pop the first bio form a qnode list
* @queued: the qnode list to pop a bio from
@@ -310,10 +402,10 @@ static struct bio *throtl_pop_queued(struct list_head *queued,
return NULL;

qn = list_first_entry(queued, struct throtl_qnode, node);
- bio = bio_list_pop(&qn->bios);
+ bio = throtl_qnode_bio_pop(qn);
WARN_ON_ONCE(!bio);

- if (bio_list_empty(&qn->bios)) {
+ if (throtl_qnode_empty(qn)) {
list_del_init(&qn->node);
if (tg_to_put)
*tg_to_put = qn->tg;
@@ -355,8 +447,8 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp,
throtl_service_queue_init(&tg->service_queue);

for (rw = READ; rw <= WRITE; rw++) {
- throtl_qnode_init(&tg->qnode_on_self[rw], tg);
- throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
+ throtl_qnode_init(&tg->qnode_on_self[rw], tg, rw);
+ throtl_qnode_init(&tg->qnode_on_parent[rw], tg, rw);
}

RB_CLEAR_NODE(&tg->rb_node);
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index ef4b7a4de987..55f3a9594e0d 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -28,8 +28,18 @@
*/
struct throtl_qnode {
struct list_head node; /* service_queue->queued[] */
- struct bio_list bios; /* queued bios */
+ struct bio_list bios[2]; /* queued bios */
struct throtl_grp *tg; /* tg this qnode belongs to */
+
+ /*
+ * 1) for write throtl_qnode:
+ * [0, THROTL_SYNC_FACTOR-1]: dispatch sync io
+ * [THROTL_SYNC_FACTOR]: dispatch async io
+ *
+ * 2) for read throtl_qnode:
+ * UINT_MAX
+ */
+ unsigned int dispatch_sync_cnt; /* sync io dispatch counter */
};

struct throtl_service_queue {
--
2.20.1


2022-12-19 22:04:02

by Tejun Heo

Subject: Re: [PATCH] blk-throtl: Introduce sync and async queues for blk-throtl

Hello,

This looks generally fine to me. Some nits below.

> +static inline struct bio *throtl_qnode_bio_peek(struct throtl_qnode *qn)
> +{
> + struct bio *bio1, *bio2;
> +
> + /* qn for read ios */
> + if (qn->dispatch_sync_cnt == UINT_MAX)
> + return bio_list_peek(&qn->bios[SYNC]);
> +
> + /* qn for write ios */
> + bio1 = bio_list_peek(&qn->bios[SYNC]);
> + bio2 = bio_list_peek(&qn->bios[ASYNC]);
> +
> + if (bio1 && bio2) {
> + if (qn->dispatch_sync_cnt == THROTL_SYNC_FACTOR)
> + return bio2;
> + return bio1;
> + }
> +
> + return bio1 ?: bio2;
> +}

Wouldn't it be simpler to write:

if (qn->dispatch_sync_cnt < THROTL_SYNC_FACTOR)
return bio1 ?: bio2;
else
return bio2 ?: bio1;
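
Folding that in, the whole helper would read roughly like this (a sketch
of the suggestion only, modulo naming):

static inline struct bio *throtl_qnode_bio_peek(struct throtl_qnode *qn)
{
	struct bio *bio1, *bio2;

	/* qn for read ios */
	if (qn->dispatch_sync_cnt == UINT_MAX)
		return bio_list_peek(&qn->bios[SYNC]);

	/* qn for write ios: prefer SYNC until the factor is reached */
	bio1 = bio_list_peek(&qn->bios[SYNC]);
	bio2 = bio_list_peek(&qn->bios[ASYNC]);

	if (qn->dispatch_sync_cnt < THROTL_SYNC_FACTOR)
		return bio1 ?: bio2;
	else
		return bio2 ?: bio1;
}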

> +/**
> + * throtl_qnode_bio_pop: pop a bio from a qnode
> + * @qn: the qnode to pop a bio from
> + *
> + * For a read io qn, just pop a bio from the sync queue and return it.
> + * For a write io qn, the target queue to pop from is determined by dispatch_sync_cnt.
> + * Try to pop a bio from the target queue and return it when that queue is not empty.
> + * If the target queue is empty, pop a bio from the other queue instead.
> + */
> +static inline struct bio *throtl_qnode_bio_pop(struct throtl_qnode *qn)
> +{
> + struct bio *bio;
> +
> + /* qn for read ios */
> + if (qn->dispatch_sync_cnt == UINT_MAX)
> + return bio_list_pop(&qn->bios[SYNC]);
> +
> + /* try to dispatch sync io */
> + if (qn->dispatch_sync_cnt < THROTL_SYNC_FACTOR) {
> + bio = bio_list_pop(&qn->bios[SYNC]);
> + if (bio) {
> + qn->dispatch_sync_cnt++;
> + return bio;
> + }
> + bio = bio_list_pop(&qn->bios[ASYNC]);
> + qn->dispatch_sync_cnt = 0;
> + return bio;
> + }
> +
> + /* try to dispatch async io */
> + bio = bio_list_pop(&qn->bios[ASYNC]);
> + if (bio) {
> + qn->dispatch_sync_cnt = 0;
> + return bio;
> + }
> + bio = bio_list_pop(&qn->bios[SYNC]);
> +
> + return bio;
> +}

This also seems like it can be simplified a bit.

Thanks.

--
tejun

2022-12-20 02:48:11

by hanjinke

Subject: Re: [External] Re: [PATCH] blk-throtl: Introduce sync and async queues for blk-throtl



On 2022/12/20 5:22 AM, Tejun Heo wrote:
> Hello,
>
> This looks generally fine to me. Some nits below.
>
>> +static inline struct bio *throtl_qnode_bio_peek(struct throtl_qnode *qn)
>> +{
>> + struct bio *bio1, *bio2;
>> +
>> + /* qn for read ios */
>> + if (qn->dispatch_sync_cnt == UINT_MAX)
>> + return bio_list_peek(&qn->bios[SYNC]);
>> +
>> + /* qn for write ios */
>> + bio1 = bio_list_peek(&qn->bios[SYNC]);
>> + bio2 = bio_list_peek(&qn->bios[ASYNC]);
>> +
>> + if (bio1 && bio2) {
>> + if (qn->dispatch_sync_cnt == THROTL_SYNC_FACTOR)
>> + return bio2;
>> + return bio1;
>> + }
>> +
>> + return bio1 ?: bio2;
>> +}
>
> Wouldn't it be simpler to write:
>
> if (qn->dispatch_sync_cnt < THROTL_SYNC_FACTOR)
> return bio1 ?: bio2;
> else
> return bio2 ?: bio1;
>
>> +/**
>> + * throtl_qnode_bio_pop: pop a bio from a qnode
>> + * @qn: the qnode to pop a bio from
>> + *
>> + * For a read io qn, just pop a bio from the sync queue and return it.
>> + * For a write io qn, the target queue to pop from is determined by dispatch_sync_cnt.
>> + * Try to pop a bio from the target queue and return it when that queue is not empty.
>> + * If the target queue is empty, pop a bio from the other queue instead.
>> + */
>> +static inline struct bio *throtl_qnode_bio_pop(struct throtl_qnode *qn)
>> +{
>> + struct bio *bio;
>> +
>> + /* qn for read ios */
>> + if (qn->dispatch_sync_cnt == UINT_MAX)
>> + return bio_list_pop(&qn->bios[SYNC]);
>> +
>> + /* try to dispatch sync io */
>> + if (qn->dispatch_sync_cnt < THROTL_SYNC_FACTOR) {
>> + bio = bio_list_pop(&qn->bios[SYNC]);
>> + if (bio) {
>> + qn->dispatch_sync_cnt++;
>> + return bio;
>> + }
>> + bio = bio_list_pop(&qn->bios[ASYNC]);
>> + qn->dispatch_sync_cnt = 0;
>> + return bio;
>> + }
>> +
>> + /* try to dispatch async io */
>> + bio = bio_list_pop(&qn->bios[ASYNC]);
>> + if (bio) {
>> + qn->dispatch_sync_cnt = 0;
>> + return bio;
>> + }
>> + bio = bio_list_pop(&qn->bios[SYNC]);
>> +
>> + return bio;
>> +}
>
> This also seems like it can be simplified a bit.
>
> Thanks.
>

Indeed, I will make it clearer and send the v2.

Thanks.

2022-12-21 11:16:51

by hanjinke

Subject: [PATCH v2] blk-throtl: Introduce sync and async queues for blk-throtl

From: Jinke Han <[email protected]>

Now we don't distinguish sync write ios from normal buffered write ios
in blk-throtl. A bio tagged with REQ_SYNC always means it will be waited
on until write completion soon after it is submitted, so it's reasonable
for sync io to complete as soon as possible.

In our test, fio writes a 100g file sequentially with a 4k block size
in a container with a low bps limit configured (wbps=10M). More than
1200 ios were throttled in the blk-throtl queue and the average throttle
time of each io was 140s. At the same time, the operation of saving a
small file with vim is blocked for almost 140s. As vim sends an fsync,
the sync ios of the fsync are blocked behind a huge amount of buffered
write ios queued ahead of them. This is also a priority inversion
problem within one cgroup. In database scenarios, things get really bad
with blk-throttle enabled, as fsync is called very often.

This patch splits the bio queue into sync and async queues for
blk-throtl and gives sync write ios much higher priority. The sync queue
only makes sense for write ios, as we treat all read ios as sync ios; I
think this respects the semantics of REQ_SYNC nicely. Bios with REQ_META
and REQ_PRIO gain the same priority, as they are important to the fs.
This may avoid some potential priority inversion problems.

With this patch, running the same test as above, the duration of the
fsync sent by vim drops to several hundred milliseconds.

Signed-off-by: Jinke Han <[email protected]>
Signed-off-by: Xin Yin <[email protected]>
---

Changes in v2:
- Make the code more simple

block/blk-throttle.c | 105 +++++++++++++++++++++++++++++++++++++++----
block/blk-throttle.h | 12 ++++-
2 files changed, 108 insertions(+), 9 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 847721dc2b2b..f3285c7da9f7 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -21,6 +21,13 @@
/* Total max dispatch from all groups in one round */
#define THROTL_QUANTUM 32

+/* For write ios, dispatch 4 sync ios and 1 normal io in one loop */
+#define THROTL_SYNC_FACTOR 4
+
+/* Only make sense for write ios, all read ios are treated as SYNC */
+#define SYNC 0
+#define ASYNC 1
+
/* Throttling is performed over a slice and after that slice is renewed */
#define DFL_THROTL_SLICE_HD (HZ / 10)
#define DFL_THROTL_SLICE_SSD (HZ / 50)
@@ -241,11 +248,27 @@ static inline unsigned int throtl_bio_data_size(struct bio *bio)
return bio->bi_iter.bi_size;
}

-static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
+static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg,
+ bool rw)
{
INIT_LIST_HEAD(&qn->node);
- bio_list_init(&qn->bios);
+ bio_list_init(&qn->bios[SYNC]);
+ bio_list_init(&qn->bios[ASYNC]);
qn->tg = tg;
+ qn->dispatch_sync_cnt = (rw == READ) ? UINT_MAX : 0;
+}
+
+#define BLK_THROTL_SYNC(bio) (bio->bi_opf & (REQ_SYNC | REQ_META | REQ_PRIO))
+
+static inline void throtl_qnode_add_bio_list(struct throtl_qnode *qn,
+ struct bio *bio)
+{
+ bool rw = bio_data_dir(bio);
+
+ if ((rw == READ) || BLK_THROTL_SYNC(bio))
+ bio_list_add(&qn->bios[SYNC], bio);
+ else
+ bio_list_add(&qn->bios[ASYNC], bio);
}

/**
@@ -261,13 +284,34 @@ static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
struct list_head *queued)
{
- bio_list_add(&qn->bios, bio);
+ throtl_qnode_add_bio_list(qn, bio);
if (list_empty(&qn->node)) {
list_add_tail(&qn->node, queued);
blkg_get(tg_to_blkg(qn->tg));
}
}

+static inline struct bio *throtl_qnode_bio_peek(struct throtl_qnode *qn)
+{
+ struct bio *bio;
+ int from = SYNC;
+
+ /* qn for read ios */
+ if (qn->dispatch_sync_cnt == UINT_MAX)
+ return bio_list_peek(&qn->bios[from]);
+
+ /* qn for write ios */
+ if (qn->dispatch_sync_cnt == THROTL_SYNC_FACTOR)
+ from = ASYNC;
+ bio = bio_list_peek(&qn->bios[from]);
+ if (!bio) {
+ from = 1 - from;
+ bio = bio_list_peek(&qn->bios[from]);
+ }
+
+ return bio;
+}
+
/**
* throtl_peek_queued - peek the first bio on a qnode list
* @queued: the qnode list to peek
@@ -281,11 +325,56 @@ static struct bio *throtl_peek_queued(struct list_head *queued)
return NULL;

qn = list_first_entry(queued, struct throtl_qnode, node);
- bio = bio_list_peek(&qn->bios);
+ bio = throtl_qnode_bio_peek(qn);
WARN_ON_ONCE(!bio);
return bio;
}

+/**
+ * throtl_qnode_bio_pop: pop a bio from a qnode
+ * @qn: the qnode to pop a bio from
+ *
+ * For a read io qn, just pop a bio from the sync queue and return it.
+ * For a write io qn, the target queue to pop from is determined by dispatch_sync_cnt.
+ * Try to pop a bio from the target queue and return it when that queue is not empty.
+ * If the target queue is empty, pop a bio from the other queue instead.
+ */
+static inline struct bio *throtl_qnode_bio_pop(struct throtl_qnode *qn)
+{
+ struct bio *bio;
+ int from = SYNC;
+
+ /* qn for read ios */
+ if (qn->dispatch_sync_cnt == UINT_MAX)
+ return bio_list_pop(&qn->bios[from]);
+
+ /* qn for write ios */
+ if (qn->dispatch_sync_cnt == THROTL_SYNC_FACTOR)
+ from = ASYNC;
+
+ bio = bio_list_pop(&qn->bios[from]);
+ if (!bio) {
+ from = 1 - from;
+ bio = bio_list_pop(&qn->bios[from]);
+ }
+
+ if ((qn->dispatch_sync_cnt < THROTL_SYNC_FACTOR) &&
+ (from == SYNC))
+ qn->dispatch_sync_cnt++;
+ else if (from == ASYNC)
+ qn->dispatch_sync_cnt = 0;
+
+ return bio;
+}
+
+static inline bool throtl_qnode_empty(struct throtl_qnode *qn)
+{
+ if (bio_list_empty(&qn->bios[SYNC]) &&
+ bio_list_empty(&qn->bios[ASYNC]))
+ return true;
+ return false;
+}
+
/**
* throtl_pop_queued - pop the first bio form a qnode list
* @queued: the qnode list to pop a bio from
@@ -310,10 +399,10 @@ static struct bio *throtl_pop_queued(struct list_head *queued,
return NULL;

qn = list_first_entry(queued, struct throtl_qnode, node);
- bio = bio_list_pop(&qn->bios);
+ bio = throtl_qnode_bio_pop(qn);
WARN_ON_ONCE(!bio);

- if (bio_list_empty(&qn->bios)) {
+ if (throtl_qnode_empty(qn)) {
list_del_init(&qn->node);
if (tg_to_put)
*tg_to_put = qn->tg;
@@ -355,8 +444,8 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp,
throtl_service_queue_init(&tg->service_queue);

for (rw = READ; rw <= WRITE; rw++) {
- throtl_qnode_init(&tg->qnode_on_self[rw], tg);
- throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
+ throtl_qnode_init(&tg->qnode_on_self[rw], tg, rw);
+ throtl_qnode_init(&tg->qnode_on_parent[rw], tg, rw);
}

RB_CLEAR_NODE(&tg->rb_node);
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index ef4b7a4de987..55f3a9594e0d 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -28,8 +28,18 @@
*/
struct throtl_qnode {
struct list_head node; /* service_queue->queued[] */
- struct bio_list bios; /* queued bios */
+ struct bio_list bios[2]; /* queued bios */
struct throtl_grp *tg; /* tg this qnode belongs to */
+
+ /*
+ * 1) for write throtl_qnode:
+ * [0, THROTL_SYNC_FACTOR-1]: dispatch sync io
+ * [THROTL_SYNC_FACTOR]: dispatch async io
+ *
+ * 2) for read throtl_qnode:
+ * UINT_MAX
+ */
+ unsigned int dispatch_sync_cnt; /* sync io dispatch counter */
};

struct throtl_service_queue {
--
2.20.1

2022-12-22 14:02:20

by Michal Koutný

Subject: Re: [PATCH v2] blk-throtl: Introduce sync and async queues for blk-throtl

Hello Jinke.

On Wed, Dec 21, 2022 at 06:42:46PM +0800, Jinke Han <[email protected]> wrote:
> In our test, fio writes a 100g file sequentially with a 4k block size
> in a container with a low bps limit configured (wbps=10M). More than
> 1200 ios were throttled in the blk-throtl queue and the average throttle
> time of each io was 140s. At the same time, the operation of saving a
> small file with vim is blocked for almost 140s. As vim sends an fsync,
> the sync ios of the fsync are blocked behind a huge amount of buffered
> write ios queued ahead of them. This is also a priority inversion
> problem within one cgroup. In database scenarios, things get really bad
> with blk-throttle enabled, as fsync is called very often.

I'm trying to make sense of the numbers:
- at 10 MB/s, it's 0.4 ms per 4k block
- there are 1.2k throttled bios, which gives a waiting time of roughly 0.5s
  ~ 0.4ms * 1200
- you say that you observe a 280 times longer throttling time,
- that'd mean there should be 340k queued bios
- or a cumulative dispatch of ~1400 MB of data

So what are the queued quantities? Are there more than 1200 bios or are
they bigger than the 4k you mention?

Thanks for clarification.

(I acknowledge the possible problem with a large population of async
writes delaying scarce sync writes.)

Michal



2022-12-22 15:54:32

by hanjinke

Subject: Re: [External] Re: [PATCH v2] blk-throtl: Introduce sync and async queues for blk-throtl



On 2022/12/22 9:39 PM, Michal Koutný wrote:
> Hello Jinke.
>
> On Wed, Dec 21, 2022 at 06:42:46PM +0800, Jinke Han <[email protected]> wrote:
>> In our test, fio writes a 100g file sequentially with a 4k block size
>> in a container with a low bps limit configured (wbps=10M). More than
>> 1200 ios were throttled in the blk-throtl queue and the average throttle
>> time of each io was 140s. At the same time, the operation of saving a
>> small file with vim is blocked for almost 140s. As vim sends an fsync,
>> the sync ios of the fsync are blocked behind a huge amount of buffered
>> write ios queued ahead of them. This is also a priority inversion
>> problem within one cgroup. In database scenarios, things get really bad
>> with blk-throttle enabled, as fsync is called very often.
>
> I'm trying to make sense of the numbers:
> - at 10 MB/s, it's 0.4 ms per 4k block
> - there are 1.2k throttled bios, which gives a waiting time of roughly 0.5s
>   ~ 0.4ms * 1200
> - you say that you observe a 280 times longer throttling time,
> - that'd mean there should be 340k queued bios
> - or a cumulative dispatch of ~1400 MB of data
>
Hi
Device   r/s    w/s  rMB/s  wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz  svctm %util
sdb     0.00  11.00   0.00   8.01   0.00   0.00  0.00  0.00    0.00    7.18   0.08     0.00   745.45   3.27  3.60
sdb     0.00   8.00   0.00   9.14   0.00   0.00  0.00  0.00    0.00    7.38   0.06     0.00  1170.00   2.62  2.10
sdb     0.00  16.00   0.00  12.02   0.00  12.00  0.00 42.86    0.00    7.25   0.12     0.00   769.25   2.06  3.30
sdb     0.00  11.00   0.00  10.91   0.00   1.00  0.00  8.33    0.00    6.82   0.07     0.00  1015.64   2.36  2.60
sdb     0.00  11.00   0.00   9.14   0.00   1.00  0.00  8.33    0.00    6.27   0.07     0.00   850.91   2.55  2.80

I used bcc to trace the time of a bio from submit_bio to
blk_mq_submit_bio and found the average time was nearly 140s (tracing
the fsync duration with bcc gives the same result).
The iostat output above shows the average size of each io is nearly 1MB,
so a rough estimate of the number of queued bios is 140s * 10MB/s / 1MB.


> So what are the queued quantities? Are there more than 1200 bios or are
> they bigger than the 4k you mention?
>
"fio writes a 100g file in sequential 4k blocksize"
Bios may be more than 1M as ext4 will merged continuously logic blocks
when physical block also continuously.
> Thanks for clarification.
>
> (I acknowledge the possible problem with a large population of async
> writes delaying scarce sync writes.)
>
> Michal

If the 0.4ms is what iostat observes, then estimating the throttle time
of a bio as 0.4ms * 1200 may not work, as that 0.4ms is the duration of
the request from allocation to completion.

If the average size of a bio is 1MB, dispatching one bio should cost
1MB / 10MB/s = 100ms. The queue is FIFO, so the average throttle time is
about 100ms * 1400 = 140s.
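
As a quick back-of-envelope check of both estimates (a standalone
sketch, only using numbers already mentioned in this thread):

#include <stdio.h>

int main(void)
{
	double wbps = 10.0 * 1024 * 1024;		/* wbps=10M limit */
	double per_4k = 4096.0 / wbps;			/* ~0.4 ms per 4k bio */
	double per_1m = 1024.0 * 1024.0 / wbps;		/* ~100 ms per ~1MB merged bio */

	printf("1200 x 4k  bios: %.1f s\n", 1200 * per_4k);	/* ~0.5 s, the 4k assumption */
	printf("1400 x 1MB bios: %.1f s\n", 1400 * per_1m);	/* ~140 s, matches the traced latency */
	return 0;
}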

Thanks.

2022-12-23 10:01:34

by hanjinke

Subject: Re: [PATCH v2] blk-throtl: Introduce sync and async queues for blk-throtl

Hi
In the process of testing v2, there is one issue that needs to be solved.

If a bio can't be dispatched because it is over the limit, a wait time
will be calculated and the dispatch will exit. The spin lock of the
request queue is then released and new io may be queued into the throtl
queue. When it's time to dispatch the waiting bio, another bio (e.g.
from the sync queue) may be selected for dispatch instead of the waiting
bio. This may cause the waiting bio to wait more than once. If the
dispatched bio is smaller than the waiting bio, the slice will be
trimmed after each dispatch, so the bandwidth cannot be filled in this
case.

I will fix this issue and send the v3.

Thanks.
Jinke.


On 2022/12/21 6:42 PM, Jinke Han wrote:
> From: Jinke Han <[email protected]>
>
> Now we don't distinguish sync write ios from normal buffered write ios
> in blk-throtl. A bio tagged with REQ_SYNC always means it will be waited
> on until write completion soon after it is submitted, so it's reasonable
> for sync io to complete as soon as possible.
>
> In our test, fio writes a 100g file sequentially with a 4k block size
> in a container with a low bps limit configured (wbps=10M). More than
> 1200 ios were throttled in the blk-throtl queue and the average throttle
> time of each io was 140s. At the same time, the operation of saving a
> small file with vim is blocked for almost 140s. As vim sends an fsync,
> the sync ios of the fsync are blocked behind a huge amount of buffered
> write ios queued ahead of them. This is also a priority inversion
> problem within one cgroup. In database scenarios, things get really bad
> with blk-throttle enabled, as fsync is called very often.
>
> This patch splits the bio queue into sync and async queues for
> blk-throtl and gives sync write ios much higher priority. The sync queue
> only makes sense for write ios, as we treat all read ios as sync ios; I
> think this respects the semantics of REQ_SYNC nicely. Bios with REQ_META
> and REQ_PRIO gain the same priority, as they are important to the fs.
> This may avoid some potential priority inversion problems.
>
> With this patch, running the same test as above, the duration of the
> fsync sent by vim drops to several hundred milliseconds.
>
> Signed-off-by: Jinke Han <[email protected]>
> Signed-off-by: Xin Yin <[email protected]>
> ---
>
> Changes in v2:
> - Make the code more simple
>
> block/blk-throttle.c | 105 +++++++++++++++++++++++++++++++++++++++----
> block/blk-throttle.h | 12 ++++-
> 2 files changed, 108 insertions(+), 9 deletions(-)
>
> diff --git a/block/blk-throttle.c b/block/blk-throttle.c
> index 847721dc2b2b..f3285c7da9f7 100644
> --- a/block/blk-throttle.c
> +++ b/block/blk-throttle.c
> @@ -21,6 +21,13 @@
> /* Total max dispatch from all groups in one round */
> #define THROTL_QUANTUM 32
>
> +/* For write ios, dispatch 4 sync ios and 1 normal io in one loop */
> +#define THROTL_SYNC_FACTOR 4
> +
> +/* Only make sense for write ios, all read ios are treated as SYNC */
> +#define SYNC 0
> +#define ASYNC 1
> +
> /* Throttling is performed over a slice and after that slice is renewed */
> #define DFL_THROTL_SLICE_HD (HZ / 10)
> #define DFL_THROTL_SLICE_SSD (HZ / 50)
> @@ -241,11 +248,27 @@ static inline unsigned int throtl_bio_data_size(struct bio *bio)
> return bio->bi_iter.bi_size;
> }
>
> -static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
> +static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg,
> + bool rw)
> {
> INIT_LIST_HEAD(&qn->node);
> - bio_list_init(&qn->bios);
> + bio_list_init(&qn->bios[SYNC]);
> + bio_list_init(&qn->bios[ASYNC]);
> qn->tg = tg;
> + qn->dispatch_sync_cnt = (rw == READ) ? UINT_MAX : 0;
> +}
> +
> +#define BLK_THROTL_SYNC(bio) (bio->bi_opf & (REQ_SYNC | REQ_META | REQ_PRIO))
> +
> +static inline void throtl_qnode_add_bio_list(struct throtl_qnode *qn,
> + struct bio *bio)
> +{
> + bool rw = bio_data_dir(bio);
> +
> + if ((rw == READ) || BLK_THROTL_SYNC(bio))
> + bio_list_add(&qn->bios[SYNC], bio);
> + else
> + bio_list_add(&qn->bios[ASYNC], bio);
> }
>
> /**
> @@ -261,13 +284,34 @@ static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
> static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
> struct list_head *queued)
> {
> - bio_list_add(&qn->bios, bio);
> + throtl_qnode_add_bio_list(qn, bio);
> if (list_empty(&qn->node)) {
> list_add_tail(&qn->node, queued);
> blkg_get(tg_to_blkg(qn->tg));
> }
> }
>
> +static inline struct bio *throtl_qnode_bio_peek(struct throtl_qnode *qn)
> +{
> + struct bio *bio;
> + int from = SYNC;
> +
> + /* qn for read ios */
> + if (qn->dispatch_sync_cnt == UINT_MAX)
> + return bio_list_peek(&qn->bios[from]);
> +
> + /* qn for write ios */
> + if (qn->dispatch_sync_cnt == THROTL_SYNC_FACTOR)
> + from = ASYNC;
> + bio = bio_list_peek(&qn->bios[from]);
> + if (!bio) {
> + from = 1 - from;
> + bio = bio_list_peek(&qn->bios[from]);
> + }
> +
> + return bio;
> +}
> +
> /**
> * throtl_peek_queued - peek the first bio on a qnode list
> * @queued: the qnode list to peek
> @@ -281,11 +325,56 @@ static struct bio *throtl_peek_queued(struct list_head *queued)
> return NULL;
>
> qn = list_first_entry(queued, struct throtl_qnode, node);
> - bio = bio_list_peek(&qn->bios);
> + bio = throtl_qnode_bio_peek(qn);
> WARN_ON_ONCE(!bio);
> return bio;
> }
>
> +/**
> + * throtl_qnode_bio_pop: pop a bio from a qnode
> + * @qn: the qnode to pop a bio from
> + *
> + * For a read io qn, just pop a bio from the sync queue and return it.
> + * For a write io qn, the target queue to pop from is determined by dispatch_sync_cnt.
> + * Try to pop a bio from the target queue and return it when that queue is not empty.
> + * If the target queue is empty, pop a bio from the other queue instead.
> + */
> +static inline struct bio *throtl_qnode_bio_pop(struct throtl_qnode *qn)
> +{
> + struct bio *bio;
> + int from = SYNC;
> +
> + /* qn for read ios */
> + if (qn->dispatch_sync_cnt == UINT_MAX)
> + return bio_list_pop(&qn->bios[from]);
> +
> + /* qn for write ios */
> + if (qn->dispatch_sync_cnt == THROTL_SYNC_FACTOR)
> + from = ASYNC;
> +
> + bio = bio_list_pop(&qn->bios[from]);
> + if (!bio) {
> + from = 1 - from;
> + bio = bio_list_pop(&qn->bios[from]);
> + }
> +
> + if ((qn->dispatch_sync_cnt < THROTL_SYNC_FACTOR) &&
> + (from == SYNC))
> + qn->dispatch_sync_cnt++;
> + else if (from == ASYNC)
> + qn->dispatch_sync_cnt = 0;
> +
> + return bio;
> +}
> +
> +static inline bool throtl_qnode_empty(struct throtl_qnode *qn)
> +{
> + if (bio_list_empty(&qn->bios[SYNC]) &&
> + bio_list_empty(&qn->bios[ASYNC]))
> + return true;
> + return false;
> +}
> +
> /**
> * throtl_pop_queued - pop the first bio form a qnode list
> * @queued: the qnode list to pop a bio from
> @@ -310,10 +399,10 @@ static struct bio *throtl_pop_queued(struct list_head *queued,
> return NULL;
>
> qn = list_first_entry(queued, struct throtl_qnode, node);
> - bio = bio_list_pop(&qn->bios);
> + bio = throtl_qnode_bio_pop(qn);
> WARN_ON_ONCE(!bio);
>
> - if (bio_list_empty(&qn->bios)) {
> + if (throtl_qnode_empty(qn)) {
> list_del_init(&qn->node);
> if (tg_to_put)
> *tg_to_put = qn->tg;
> @@ -355,8 +444,8 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp,
> throtl_service_queue_init(&tg->service_queue);
>
> for (rw = READ; rw <= WRITE; rw++) {
> - throtl_qnode_init(&tg->qnode_on_self[rw], tg);
> - throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
> + throtl_qnode_init(&tg->qnode_on_self[rw], tg, rw);
> + throtl_qnode_init(&tg->qnode_on_parent[rw], tg, rw);
> }
>
> RB_CLEAR_NODE(&tg->rb_node);
> diff --git a/block/blk-throttle.h b/block/blk-throttle.h
> index ef4b7a4de987..55f3a9594e0d 100644
> --- a/block/blk-throttle.h
> +++ b/block/blk-throttle.h
> @@ -28,8 +28,18 @@
> */
> struct throtl_qnode {
> struct list_head node; /* service_queue->queued[] */
> - struct bio_list bios; /* queued bios */
> + struct bio_list bios[2]; /* queued bios */
> struct throtl_grp *tg; /* tg this qnode belongs to */
> +
> + /*
> + * 1) for write throtl_qnode:
> + * [0, THROTL_SYNC_FACTOR-1]: dispatch sync io
> + * [THROTL_SYNC_FACTOR]: dispatch async io
> + *
> + * 2) for read throtl_qnode:
> + * UINT_MAX
> + */
> + unsigned int dispatch_sync_cnt; /* sync io dispatch counter */
> };
>
> struct throtl_service_queue {