2023-10-21 07:52:48

by Yu Kuai

[permalink] [raw]
Subject: [PATCH RFC v2 0/8] blk-mq: improve tag fair sharing

From: Yu Kuai <[email protected]>

Current implementation:
- a counter active_queues record how many queue/hctx is sharing tags,
and it's updated while issue new IO, and cleared in
blk_mq_timeout_work().
- if active_queues is more than 1, then tags is fair shared to each
node;

New implementation:
- a new field 'available_tags' is added to each node, and it's
calculate in slow path, hence fast path won't be affected, patch 5;
- a new counter 'busy_queues' is added to blk_mq_tags, and it's updated
while fail to get driver tag, and it's also cleared in
blk_mq_timeout_work(), and tag sharing will based on 'busy_queues'
instead of 'active_queues', patch 6,7;
- a new counter 'busy_count' is added to each node to record how many
times a node failed to get driver tag, and it's used to judge if a node
is busy and need more tags, patch 8;
- a new timer is added to blk_mq_tags, it will start if any node failed
to get driver tag, and timer function will be used to borrow tags and
return borrowed tags, patch 8;

A simple test, 32 tags with two shared node:
[global]
ioengine=libaio
iodepth=2
bs=4k
direct=1
rw=randrw
group_reporting

[sda]
numjobs=32
filename=/dev/sda

[sdb]
numjobs=1
filename=/dev/sdb

Test result(monitor new debugfs entry):

time active available
sda sdb sda sdb
0 0 0 32 32
1 16 2 16 16 -> start fair sharing
2 19 2 20 16
3 24 2 24 16
4 26 2 28 16 -> borrow 32/8=4 tags each round
5 28 2 28 16 -> save at lease 4 tags for sdb

Yu Kuai (8):
blk-mq: factor out a structure from blk_mq_tags
blk-mq: factor out a structure to store information for tag sharing
blk-mq: add a helper to initialize shared_tag_info
blk-mq: support to track active queues from blk_mq_tags
blk-mq: precalculate available tags for hctx_may_queue()
blk-mq: add new helpers blk_mq_driver_tag_busy/idle()
blk-mq-tag: delay tag sharing until fail to get driver tag
blk-mq-tag: allow shared queue/hctx to get more driver tags

block/blk-core.c | 2 -
block/blk-mq-debugfs.c | 30 +++++-
block/blk-mq-tag.c | 226 +++++++++++++++++++++++++++++++++++++++--
block/blk-mq.c | 12 ++-
block/blk-mq.h | 64 +++++++-----
include/linux/blk-mq.h | 36 +++++--
include/linux/blkdev.h | 11 +-
7 files changed, 328 insertions(+), 53 deletions(-)

--
2.39.2


2023-10-21 07:52:55

by Yu Kuai

[permalink] [raw]
Subject: [PATCH RFC v2 6/8] blk-mq: add new helpers blk_mq_driver_tag_busy/idle()

From: Yu Kuai <[email protected]>

Refer to the implementation of blk_mq_tag_busy/idle():

- blk_mq_driver_tag_busy() will be used the first time when get driver
tag failed;
- blk_mq_driver_tag_idle() will be used when driver tag is no longer
exhausted.
- A new counter 'busy_queues' is added to indicate how many shared
queues/hctxs are busy(drivers tags is exhausted);

Tag sharing will be delayed until fail to get driver tag based on these
new helpers.

Signed-off-by: Yu Kuai <[email protected]>
---
block/blk-mq-debugfs.c | 2 ++
block/blk-mq-tag.c | 53 +++++++++++++++++++++++++++++++++++++++++-
block/blk-mq.c | 9 +++++--
block/blk-mq.h | 25 ++++++++++++++++----
include/linux/blk-mq.h | 7 ++++--
include/linux/blkdev.h | 1 +
6 files changed, 88 insertions(+), 9 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 1d460119f5b3..170bc2236e81 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -417,6 +417,8 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m,
seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags);
seq_printf(m, "active_queues=%d\n",
READ_ONCE(tags->ctl.active_queues));
+ seq_printf(m, "busy_queues=%d\n",
+ READ_ONCE(tags->ctl.busy_queues));

seq_puts(m, "\nbitmap_tags:\n");
sbitmap_queue_show(&tags->bitmap_tags, m);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 261769251282..cd13d8e512f7 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -165,6 +165,51 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
blk_mq_tag_wakeup_all(tags, false);
}

+void __blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx)
+{
+ unsigned int users;
+ struct blk_mq_tags *tags = hctx->tags;
+
+ if (blk_mq_is_shared_tags(hctx->flags)) {
+ struct request_queue *q = hctx->queue;
+
+ if (test_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags) ||
+ test_and_set_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags))
+ return;
+ } else {
+ if (test_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state) ||
+ test_and_set_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state))
+ return;
+ }
+
+ spin_lock_irq(&tags->lock);
+ users = tags->ctl.busy_queues + 1;
+ WRITE_ONCE(tags->ctl.busy_queues, users);
+ spin_unlock_irq(&tags->lock);
+}
+
+void __blk_mq_driver_tag_idle(struct blk_mq_hw_ctx *hctx)
+{
+ unsigned int users;
+ struct blk_mq_tags *tags = hctx->tags;
+
+ if (blk_mq_is_shared_tags(hctx->flags)) {
+ struct request_queue *q = hctx->queue;
+
+ if (!test_and_clear_bit(QUEUE_FLAG_HCTX_BUSY,
+ &q->queue_flags))
+ return;
+ } else {
+ if (!test_and_clear_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state))
+ return;
+ }
+
+ spin_lock_irq(&tags->lock);
+ users = tags->ctl.busy_queues - 1;
+ WRITE_ONCE(tags->ctl.busy_queues, users);
+ spin_unlock_irq(&tags->lock);
+}
+
static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
struct sbitmap_queue *bt)
{
@@ -218,8 +263,11 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
if (tag != BLK_MQ_NO_TAG)
goto found_tag;

- if (data->flags & BLK_MQ_REQ_NOWAIT)
+ if (data->flags & BLK_MQ_REQ_NOWAIT) {
+ if (!(data->rq_flags & RQF_SCHED_TAGS))
+ blk_mq_driver_tag_busy(data->hctx);
return BLK_MQ_NO_TAG;
+ }

ws = bt_wait_ptr(bt, data->hctx);
do {
@@ -246,6 +294,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
if (tag != BLK_MQ_NO_TAG)
break;

+ if (!(data->rq_flags & RQF_SCHED_TAGS))
+ blk_mq_driver_tag_busy(data->hctx);
+
bt_prev = bt;
io_schedule();

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8775616bc85c..a106533f063f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1668,8 +1668,10 @@ static void blk_mq_timeout_work(struct work_struct *work)
*/
queue_for_each_hw_ctx(q, hctx, i) {
/* the hctx may be unmapped, so check it here */
- if (blk_mq_hw_queue_mapped(hctx))
+ if (blk_mq_hw_queue_mapped(hctx)) {
blk_mq_tag_idle(hctx);
+ blk_mq_driver_tag_idle(hctx);
+ }
}
}
blk_queue_exit(q);
@@ -3594,8 +3596,10 @@ static void blk_mq_exit_hctx(struct request_queue *q,
{
struct request *flush_rq = hctx->fq->flush_rq;

- if (blk_mq_hw_queue_mapped(hctx))
+ if (blk_mq_hw_queue_mapped(hctx)) {
blk_mq_tag_idle(hctx);
+ blk_mq_driver_tag_idle(hctx);
+ }

if (blk_queue_init_done(q))
blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
@@ -3931,6 +3935,7 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared)
hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
} else {
blk_mq_tag_idle(hctx);
+ blk_mq_driver_tag_idle(hctx);
hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
}
}
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 5c0d19562848..3e555af1de49 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -195,8 +195,10 @@ static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
return sbq_wait_ptr(bt, &hctx->wait_index);
}

-void __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
-void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
+void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx);
+void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx);
+void __blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx);
+void __blk_mq_driver_tag_idle(struct blk_mq_hw_ctx *hctx);

static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
@@ -210,6 +212,18 @@ static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
__blk_mq_tag_idle(hctx);
}

+static inline void blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx)
+{
+ if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+ __blk_mq_driver_tag_busy(hctx);
+}
+
+static inline void blk_mq_driver_tag_idle(struct blk_mq_hw_ctx *hctx)
+{
+ if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+ __blk_mq_driver_tag_idle(hctx);
+}
+
static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
unsigned int tag)
{
@@ -293,7 +307,8 @@ static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx,
struct shared_tag_info *info = blk_mq_is_shared_tags(hctx->flags) ?
&hctx->queue->shared_tag_info : &hctx->shared_tag_info;

- atomic_sub(val, &info->active_tags);
+ if (!atomic_sub_return(val, &info->active_tags))
+ blk_mq_driver_tag_idle(hctx);
}

static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
@@ -354,8 +369,10 @@ bool __blk_mq_alloc_driver_tag(struct request *rq);

static inline bool blk_mq_get_driver_tag(struct request *rq)
{
- if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq))
+ if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq)) {
+ blk_mq_driver_tag_busy(rq->mq_hctx);
return false;
+ }

return true;
}
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index c93955f5f28f..9182ceca8c7a 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -666,10 +666,11 @@ enum {

BLK_MQ_S_STOPPED = 0,
BLK_MQ_S_TAG_ACTIVE = 1,
- BLK_MQ_S_SCHED_RESTART = 2,
+ BLK_MQ_S_DTAG_BUSY = 2,
+ BLK_MQ_S_SCHED_RESTART = 3,

/* hw queue is inactive after all its CPUs become offline */
- BLK_MQ_S_INACTIVE = 3,
+ BLK_MQ_S_INACTIVE = 4,

BLK_MQ_MAX_DEPTH = 10240,

@@ -728,6 +729,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,

struct tag_sharing_ctl {
unsigned int active_queues;
+ /* The number of shared queues/hctxs with exhausted driver tags. */
+ unsigned int busy_queues;
/*
* If driver tags is shared for multiple queue/hctx, this is the head of
* a list with request_queue/hctx->shared_tag_info.node entries.
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b364d65fe4e5..8fd6a0a92233 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -552,6 +552,7 @@ struct request_queue {
#define QUEUE_FLAG_DAX 19 /* device supports DAX */
#define QUEUE_FLAG_STATS 20 /* track IO start and completion times */
#define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */
+#define QUEUE_FLAG_HCTX_BUSY 23 /* driver tag is exhausted for at least one blk-mq hctx */
#define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */
#define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */
#define QUEUE_FLAG_ZONE_RESETALL 26 /* supports Zone Reset All */
--
2.39.2

2023-10-21 07:53:14

by Yu Kuai

[permalink] [raw]
Subject: [PATCH RFC v2 5/8] blk-mq: precalculate available tags for hctx_may_queue()

From: Yu Kuai <[email protected]>

Currently, hctx_mq_queue() only need to get how many queues is sharing
tags, then calculate how many tags is available for each queue by fair
sharing.

Add a new field 'available_tags' for struct shared_tag_info to store
how many tags is available directly from slow path, so that
hctx_mq_queue() doesn't need to do calculation.

Currently tags are still fair shared, and now that calculation is in the
slow path, it's okay to refactor tag sharing with more complicated
algorithm, which is implemented in following patches.

Signed-off-by: Yu Kuai <[email protected]>
---
block/blk-mq-debugfs.c | 3 ++-
block/blk-mq-tag.c | 35 ++++++++++++++++++++++++++++++++++-
block/blk-mq.c | 4 ++--
block/blk-mq.h | 19 ++++++++-----------
include/linux/blkdev.h | 1 +
5 files changed, 47 insertions(+), 15 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index d6ebd8d9d3bb..1d460119f5b3 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -158,7 +158,8 @@ static ssize_t queue_state_write(void *data, const char __user *buf,
static void shared_tag_info_show(struct shared_tag_info *info,
struct seq_file *m)
{
- seq_printf(m, "%d\n", atomic_read(&info->active_tags));
+ seq_printf(m, "active tags %d\n", atomic_read(&info->active_tags));
+ seq_printf(m, "available tags %u\n", READ_ONCE(info->available_tags));
}

static int queue_shared_tag_info_show(void *data, struct seq_file *m)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 07d9b513990b..261769251282 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -14,6 +14,8 @@
#include "blk-mq.h"
#include "blk-mq-sched.h"

+#define shared_tags(tags, users) max((tags->nr_tags + users - 1) / users, 4U)
+
/*
* Recalculate wakeup batch when tag is shared by hctx.
*/
@@ -29,10 +31,39 @@ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
users);
}

-void blk_mq_init_shared_tag_info(struct shared_tag_info *info)
+void blk_mq_init_shared_tag_info(struct shared_tag_info *info,
+ unsigned int nr_tags)
{
atomic_set(&info->active_tags, 0);
INIT_LIST_HEAD(&info->node);
+ info->available_tags = nr_tags;
+}
+
+static void blk_mq_update_available_driver_tags(struct blk_mq_tags *tags,
+ struct shared_tag_info *info,
+ unsigned int users)
+{
+ unsigned int old = tags->ctl.active_queues;
+ int nr_tags;
+ struct shared_tag_info *iter;
+
+ if (!old || !users)
+ return;
+
+ nr_tags = (int)shared_tags(tags, users);
+ if (old < users)
+ WRITE_ONCE(info->available_tags, nr_tags);
+ else
+ WRITE_ONCE(info->available_tags, tags->nr_tags);
+
+ nr_tags -= (int)shared_tags(tags, old);
+ list_for_each_entry(iter, &tags->ctl.head, node) {
+ if (iter == info)
+ continue;
+
+ WRITE_ONCE(iter->available_tags,
+ (unsigned int)((int)iter->available_tags + nr_tags));
+ }
}

/*
@@ -70,6 +101,7 @@ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
spin_lock_irq(&tags->lock);
list_add(&info->node, &tags->ctl.head);
users = tags->ctl.active_queues + 1;
+ blk_mq_update_available_driver_tags(tags, info, users);
WRITE_ONCE(tags->ctl.active_queues, users);
blk_mq_update_wake_batch(tags, users);
spin_unlock_irq(&tags->lock);
@@ -121,6 +153,7 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)

list_del_init(&info->node);
users = tags->ctl.active_queues - 1;
+ blk_mq_update_available_driver_tags(tags, info, users);
WRITE_ONCE(tags->ctl.active_queues, users);
blk_mq_update_wake_batch(tags, users);

diff --git a/block/blk-mq.c b/block/blk-mq.c
index de5859dd9f52..8775616bc85c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3652,7 +3652,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL))
goto exit_flush_rq;

- blk_mq_init_shared_tag_info(&hctx->shared_tag_info);
+ blk_mq_init_shared_tag_info(&hctx->shared_tag_info, set->queue_depth);
return 0;

exit_flush_rq:
@@ -4227,7 +4227,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
if (blk_mq_alloc_ctxs(q))
goto err_exit;

- blk_mq_init_shared_tag_info(&q->shared_tag_info);
+ blk_mq_init_shared_tag_info(&q->shared_tag_info, set->queue_depth);
/* init q->mq_kobj and sw queues' kobjects */
blk_mq_sysfs_init(q);

diff --git a/block/blk-mq.h b/block/blk-mq.h
index ac58f2e22f20..5c0d19562848 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -63,7 +63,8 @@ struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
struct blk_mq_tags *tags,
unsigned int hctx_idx);
-void blk_mq_init_shared_tag_info(struct shared_tag_info *info);
+void blk_mq_init_shared_tag_info(struct shared_tag_info *info,
+ unsigned int nr_tags);

/*
* CPU -> queue mappings
@@ -416,7 +417,7 @@ static inline void blk_mq_free_requests(struct list_head *list)
static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
struct sbitmap_queue *bt)
{
- unsigned int depth, users;
+ struct shared_tag_info *info;

if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
return true;
@@ -432,20 +433,16 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,

if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
return true;
+
+ info = &q->shared_tag_info;
} else {
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return true;
- }

- users = READ_ONCE(hctx->tags->ctl.active_queues);
- if (!users)
- return true;
+ info = &hctx->shared_tag_info;
+ }

- /*
- * Allow at least some tags
- */
- depth = max((bt->sb.depth + users - 1) / users, 4U);
- return __blk_mq_active_requests(hctx) < depth;
+ return atomic_read(&info->active_tags) < READ_ONCE(info->available_tags);
}

/* run the code block in @dispatch_ops with rcu/srcu read lock held */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f97bc2c7acc9..b364d65fe4e5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -377,6 +377,7 @@ struct blk_independent_access_ranges {

struct shared_tag_info {
atomic_t active_tags;
+ unsigned int available_tags;
struct list_head node;
};

--
2.39.2

2023-10-21 07:53:19

by Yu Kuai

[permalink] [raw]
Subject: [PATCH RFC v2 7/8] blk-mq-tag: delay tag sharing until fail to get driver tag

From: Yu Kuai <[email protected]>

Before this patch, tags will be shared when shared node start to handle
IO, however, this will waste tags if some node doen't need all the fair
shared tags and such tags can't be used for other node, even if other
node might want more than fair shared tags.

Prevent such problem by delaying tag sharing from issue io until fail
to get driver tag. Note that such problem still exist if all the tags
are exhausted, and the next patch will implement a algorithm to allow
busy node to borrow tags from idle node.

Signed-off-by: Yu Kuai <[email protected]>
---
block/blk-mq-tag.c | 67 ++++++++++++++++++++++++++--------------------
1 file changed, 38 insertions(+), 29 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index cd13d8e512f7..a98b25c8d594 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -43,7 +43,7 @@ static void blk_mq_update_available_driver_tags(struct blk_mq_tags *tags,
struct shared_tag_info *info,
unsigned int users)
{
- unsigned int old = tags->ctl.active_queues;
+ unsigned int old = tags->ctl.busy_queues;
int nr_tags;
struct shared_tag_info *iter;

@@ -74,9 +74,7 @@ static void blk_mq_update_available_driver_tags(struct blk_mq_tags *tags,
*/
void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
- unsigned int users;
struct blk_mq_tags *tags = hctx->tags;
- struct shared_tag_info *info;

/*
* calling test_bit() prior to test_and_set_bit() is intentional,
@@ -88,22 +86,14 @@ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) ||
test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
return;
-
- info = &q->shared_tag_info;
} else {
if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) ||
test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return;
-
- info = &hctx->shared_tag_info;
}

spin_lock_irq(&tags->lock);
- list_add(&info->node, &tags->ctl.head);
- users = tags->ctl.active_queues + 1;
- blk_mq_update_available_driver_tags(tags, info, users);
- WRITE_ONCE(tags->ctl.active_queues, users);
- blk_mq_update_wake_batch(tags, users);
+ WRITE_ONCE(tags->ctl.active_queues, tags->ctl.active_queues + 1);
spin_unlock_irq(&tags->lock);
}

@@ -123,9 +113,7 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
*/
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
- unsigned int users;
struct blk_mq_tags *tags = hctx->tags;
- struct shared_tag_info *info;

if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
@@ -137,8 +125,6 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
spin_unlock_irq(&tags->lock);
return;
}
-
- info = &q->shared_tag_info;
} else {
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return;
@@ -147,28 +133,21 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
spin_unlock_irq(&tags->lock);
return;
}
-
- info = &hctx->shared_tag_info;
}

- list_del_init(&info->node);
- users = tags->ctl.active_queues - 1;
- blk_mq_update_available_driver_tags(tags, info, users);
- WRITE_ONCE(tags->ctl.active_queues, users);
- blk_mq_update_wake_batch(tags, users);
-
+ WRITE_ONCE(tags->ctl.active_queues, tags->ctl.active_queues - 1);
if (blk_mq_is_shared_tags(hctx->flags))
clear_bit(QUEUE_FLAG_HCTX_ACTIVE, &hctx->queue->queue_flags);
else
clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state);
spin_unlock_irq(&tags->lock);
- blk_mq_tag_wakeup_all(tags, false);
}

void __blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx)
{
unsigned int users;
struct blk_mq_tags *tags = hctx->tags;
+ struct shared_tag_info *info;

if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
@@ -176,14 +155,21 @@ void __blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx)
if (test_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags) ||
test_and_set_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags))
return;
+
+ info = &q->shared_tag_info;
} else {
if (test_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state) ||
test_and_set_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state))
return;
+
+ info = &hctx->shared_tag_info;
}

spin_lock_irq(&tags->lock);
+ list_add(&info->node, &tags->ctl.head);
users = tags->ctl.busy_queues + 1;
+ blk_mq_update_available_driver_tags(tags, info, users);
+ blk_mq_update_wake_batch(tags, users);
WRITE_ONCE(tags->ctl.busy_queues, users);
spin_unlock_irq(&tags->lock);
}
@@ -192,22 +178,45 @@ void __blk_mq_driver_tag_idle(struct blk_mq_hw_ctx *hctx)
{
unsigned int users;
struct blk_mq_tags *tags = hctx->tags;
+ struct shared_tag_info *info;

if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;

- if (!test_and_clear_bit(QUEUE_FLAG_HCTX_BUSY,
- &q->queue_flags))
+ if (!test_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags))
return;
+
+ spin_lock_irq(&tags->lock);
+ if (!test_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags)) {
+ spin_unlock_irq(&tags->lock);
+ return;
+ }
+ info = &q->shared_tag_info;
} else {
- if (!test_and_clear_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state))
+ if (!test_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state))
return;
+
+ spin_lock_irq(&tags->lock);
+ if (!test_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state)) {
+ spin_unlock_irq(&tags->lock);
+ return;
+ }
+ info = &hctx->shared_tag_info;
}

- spin_lock_irq(&tags->lock);
+ list_del_init(&info->node);
users = tags->ctl.busy_queues - 1;
+ blk_mq_update_available_driver_tags(tags, info, users);
+ blk_mq_update_wake_batch(tags, users);
WRITE_ONCE(tags->ctl.busy_queues, users);
+
+ if (blk_mq_is_shared_tags(hctx->flags))
+ clear_bit(QUEUE_FLAG_HCTX_BUSY, &hctx->queue->queue_flags);
+ else
+ clear_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state);
+
spin_unlock_irq(&tags->lock);
+ blk_mq_tag_wakeup_all(tags, false);
}

static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
--
2.39.2

2023-10-21 07:53:20

by Yu Kuai

[permalink] [raw]
Subject: [PATCH RFC v2 8/8] blk-mq-tag: allow shared queue/hctx to get more driver tags

From: Yu Kuai <[email protected]>

Currently tags are fail shared to each node, and if some node only
issue litter IO, the shared tags will be wasted and can't be used for
other nodes that is under high IO pressure.

This patch implement a new way to allow one node to borrow tags from
other node:

1) multiple nodes start issue io, and all tags is exhausted, start
sharing tags by fair share;
2) while sharing tag, detect if there are any node need more tags, if so
start a timer;
3) in the timer function
- If there are node with borrowed tags and doesn't busy anymore,
return borrowed tags;
- Find the busy node with highest pressure, and calculate available
free tags from node that is not busy, then borrow tags;

A simple test, 32 tags with two shared node:
[global]
ioengine=libaio
iodepth=2
bs=4k
direct=1
rw=randrw
group_reporting

[sda]
numjobs=32
filename=/dev/sda

[sdb]
numjobs=1
filename=/dev/sdb

Test result(monitor new debugfs entry):

time active available
sda sdb sda sdb
0 0 0 32 32
1 16 2 16 16 -> start fair sharing
2 19 2 20 16
3 24 2 24 16
4 26 2 28 16 -> borrow 32/8=4 tags each round
5 28 2 28 16 -> save at lease 4 tags for sdb

Signed-off-by: Yu Kuai <[email protected]>
---
block/blk-mq-debugfs.c | 1 +
block/blk-mq-tag.c | 92 +++++++++++++++++++++++++++++++++++++++---
include/linux/blk-mq.h | 3 ++
include/linux/blkdev.h | 1 +
4 files changed, 91 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 170bc2236e81..a987f4228b67 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -160,6 +160,7 @@ static void shared_tag_info_show(struct shared_tag_info *info,
{
seq_printf(m, "active tags %d\n", atomic_read(&info->active_tags));
seq_printf(m, "available tags %u\n", READ_ONCE(info->available_tags));
+ seq_printf(m, "busy count %u\n", atomic_read(&info->busy_count));
}

static int queue_shared_tag_info_show(void *data, struct seq_file *m)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index a98b25c8d594..798b90d3f11a 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -35,6 +35,7 @@ void blk_mq_init_shared_tag_info(struct shared_tag_info *info,
unsigned int nr_tags)
{
atomic_set(&info->active_tags, 0);
+ atomic_set(&info->busy_count, 0);
INIT_LIST_HEAD(&info->node);
info->available_tags = nr_tags;
}
@@ -143,26 +144,93 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
spin_unlock_irq(&tags->lock);
}

+static void tag_sharing_ctl_timer_fn(struct timer_list *t)
+{
+ struct tag_sharing_ctl *ctl = from_timer(ctl, t, timer);
+ struct blk_mq_tags *tags = container_of(ctl, struct blk_mq_tags, ctl);
+ struct shared_tag_info *busy = NULL;
+ struct shared_tag_info *info;
+ unsigned int nr_tags;
+ unsigned int step;
+ unsigned int free_tags = 0;
+ unsigned int borrowed_tags = 0;
+ unsigned int max_busy_count = 0;
+
+ spin_lock_irq(&tags->lock);
+
+ if (tags->ctl.busy_queues <= 1)
+ goto out;
+
+ /* First round, decrease busy_count by half. */
+ list_for_each_entry(info, &tags->ctl.head, node) {
+ int count = atomic_read(&info->busy_count);
+
+ if (count > tags->nr_tags)
+ count = count >> 1;
+ atomic_sub(count, &info->busy_count);
+ }
+
+ /* Second round, find borrowed tags that can be returned. */
+ nr_tags = shared_tags(tags, tags->ctl.busy_queues);
+ step = clamp_t(unsigned int, tags->nr_tags / SBQ_WAIT_QUEUES, 1,
+ SBQ_WAKE_BATCH);
+ list_for_each_entry(info, &tags->ctl.head, node) {
+ if (info->available_tags > nr_tags &&
+ atomic_read(&info->active_tags) <= nr_tags &&
+ atomic_read(&info->busy_count) <= tags->nr_tags)
+ info->available_tags = nr_tags;
+ }
+
+ /* Last round, find available free tags, and which node need more tags. */
+ list_for_each_entry(info, &tags->ctl.head, node) {
+ unsigned int busy_count;
+
+ if (info->available_tags > nr_tags)
+ borrowed_tags += info->available_tags - nr_tags;
+ else
+ free_tags += info->available_tags - max(step,
+ (unsigned int)atomic_read(&info->active_tags));
+
+ busy_count = atomic_read(&info->busy_count);
+ if (busy_count > tags->nr_tags && busy_count > max_busy_count) {
+ busy = info;
+ max_busy_count = busy_count;
+ }
+ }
+
+ /* Borrow tags. */
+ if (busy && borrowed_tags < free_tags)
+ busy->available_tags += min(free_tags - borrowed_tags, step);
+
+out:
+ if (!busy) {
+ ctl->timer_running = false;
+ } else {
+ ctl->timer.expires = jiffies + HZ;
+ add_timer(&ctl->timer);
+ }
+ spin_unlock_irq(&tags->lock);
+}
+
void __blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx)
{
unsigned int users;
struct blk_mq_tags *tags = hctx->tags;
struct shared_tag_info *info;
+ bool timer_running = false;

if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;

+ info = &q->shared_tag_info;
if (test_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags) ||
test_and_set_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags))
- return;
-
- info = &q->shared_tag_info;
+ goto inc_busy;
} else {
+ info = &hctx->shared_tag_info;
if (test_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state) ||
test_and_set_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state))
- return;
-
- info = &hctx->shared_tag_info;
+ goto inc_busy;
}

spin_lock_irq(&tags->lock);
@@ -172,6 +240,15 @@ void __blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx)
blk_mq_update_wake_batch(tags, users);
WRITE_ONCE(tags->ctl.busy_queues, users);
spin_unlock_irq(&tags->lock);
+ return;
+
+inc_busy:
+ atomic_inc(&info->busy_count);
+ if (!tags->ctl.timer_running &&
+ try_cmpxchg_relaxed(&tags->ctl.timer_running, &timer_running, true)) {
+ tags->ctl.timer.expires = jiffies + HZ;
+ add_timer(&tags->ctl.timer);
+ }
}

void __blk_mq_driver_tag_idle(struct blk_mq_hw_ctx *hctx)
@@ -204,6 +281,7 @@ void __blk_mq_driver_tag_idle(struct blk_mq_hw_ctx *hctx)
info = &hctx->shared_tag_info;
}

+ atomic_set(&info->busy_count, 0);
list_del_init(&info->node);
users = tags->ctl.busy_queues - 1;
blk_mq_update_available_driver_tags(tags, info, users);
@@ -705,6 +783,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
tags->nr_reserved_tags = reserved_tags;
spin_lock_init(&tags->lock);
INIT_LIST_HEAD(&tags->ctl.head);
+ timer_setup(&tags->ctl.timer, tag_sharing_ctl_timer_fn, 0);

if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags,
total_tags, reserved_tags, node,
@@ -717,6 +796,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,

void blk_mq_free_tags(struct blk_mq_tags *tags)
{
+ del_timer_sync(&tags->ctl.timer);
sbitmap_queue_free(&tags->bitmap_tags);
sbitmap_queue_free(&tags->breserved_tags);
kfree(tags);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9182ceca8c7a..1e4aa733e1ab 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -736,6 +736,9 @@ struct tag_sharing_ctl {
* a list with request_queue/hctx->shared_tag_info.node entries.
*/
struct list_head head;
+ bool timer_running;
+ /* Start when any queue/hctx failed to get driver tag. */
+ struct timer_list timer;
};

/*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8fd6a0a92233..8bfae877dd27 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -378,6 +378,7 @@ struct blk_independent_access_ranges {
struct shared_tag_info {
atomic_t active_tags;
unsigned int available_tags;
+ atomic_t busy_count;
struct list_head node;
};

--
2.39.2

2023-10-21 07:53:25

by Yu Kuai

[permalink] [raw]
Subject: [PATCH RFC v2 1/8] blk-mq: factor out a structure from blk_mq_tags

From: Yu Kuai <[email protected]>

Currently tags are fairly shared. The new structure contains only one
field active_queues for now.

There are no functional changes and this patch prepares for refactoring
tag sharing.

Signed-off-by: Yu Kuai <[email protected]>
---
block/blk-mq-debugfs.c | 2 +-
block/blk-mq-tag.c | 8 ++++----
block/blk-mq.h | 2 +-
include/linux/blk-mq.h | 10 ++++++++--
4 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 5cbeb9344f2f..f6b77289bc1f 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -400,7 +400,7 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m,
seq_printf(m, "nr_tags=%u\n", tags->nr_tags);
seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags);
seq_printf(m, "active_queues=%d\n",
- READ_ONCE(tags->active_queues));
+ READ_ONCE(tags->ctl.active_queues));

seq_puts(m, "\nbitmap_tags:\n");
sbitmap_queue_show(&tags->bitmap_tags, m);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index cc57e2dd9a0b..fe41a0d34fc0 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -57,8 +57,8 @@ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
}

spin_lock_irq(&tags->lock);
- users = tags->active_queues + 1;
- WRITE_ONCE(tags->active_queues, users);
+ users = tags->ctl.active_queues + 1;
+ WRITE_ONCE(tags->ctl.active_queues, users);
blk_mq_update_wake_batch(tags, users);
spin_unlock_irq(&tags->lock);
}
@@ -94,8 +94,8 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
}

spin_lock_irq(&tags->lock);
- users = tags->active_queues - 1;
- WRITE_ONCE(tags->active_queues, users);
+ users = tags->ctl.active_queues - 1;
+ WRITE_ONCE(tags->ctl.active_queues, users);
blk_mq_update_wake_batch(tags, users);
spin_unlock_irq(&tags->lock);

diff --git a/block/blk-mq.h b/block/blk-mq.h
index f75a9ecfebde..363c8f6e13dd 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -435,7 +435,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
return true;
}

- users = READ_ONCE(hctx->tags->active_queues);
+ users = READ_ONCE(hctx->tags->ctl.active_queues);
if (!users)
return true;

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 1ab3081c82ed..5a08527c53b9 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -727,13 +727,16 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
blk_opf_t opf, blk_mq_req_flags_t flags,
unsigned int hctx_idx);

+struct tag_sharing_ctl {
+ unsigned int active_queues;
+};
+
/*
* Tag address space map.
*/
struct blk_mq_tags {
unsigned int nr_tags;
unsigned int nr_reserved_tags;
- unsigned int active_queues;

struct sbitmap_queue bitmap_tags;
struct sbitmap_queue breserved_tags;
@@ -744,9 +747,12 @@ struct blk_mq_tags {

/*
* used to clear request reference in rqs[] before freeing one
- * request pool
+ * request pool, and to protect tag_sharing_ctl.
*/
spinlock_t lock;
+
+ /* used when tags is shared for multiple request_queue or hctx. */
+ struct tag_sharing_ctl ctl;
};

static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags,
--
2.39.2

2023-10-23 04:39:16

by Ming Lei

[permalink] [raw]
Subject: Re: [PATCH RFC v2 0/8] blk-mq: improve tag fair sharing

Hello Yu Kuai,

On Sat, Oct 21, 2023 at 11:47:58PM +0800, Yu Kuai wrote:
> From: Yu Kuai <[email protected]>
>
> Current implementation:
> - a counter active_queues record how many queue/hctx is sharing tags,
> and it's updated while issue new IO, and cleared in
> blk_mq_timeout_work().
> - if active_queues is more than 1, then tags is fair shared to each
> node;

Can you explain a bit what the problem is in current tag sharing?
And what is your basic approach for this problem?

Just mentioning the implementation is not too helpful for initial
review, cause the problem and approach(correctness) need to be
understood first.

Thanks,
Ming

2023-10-23 07:27:09

by Yu Kuai

[permalink] [raw]
Subject: Re: [PATCH RFC v2 0/8] blk-mq: improve tag fair sharing

Hi,

?? 2023/10/23 12:38, Ming Lei д??:
> Hello Yu Kuai,
>
> On Sat, Oct 21, 2023 at 11:47:58PM +0800, Yu Kuai wrote:
>> From: Yu Kuai <[email protected]>
>>
>> Current implementation:
>> - a counter active_queues record how many queue/hctx is sharing tags,
>> and it's updated while issue new IO, and cleared in
>> blk_mq_timeout_work().
>> - if active_queues is more than 1, then tags is fair shared to each
>> node;
>
> Can you explain a bit what the problem is in current tag sharing?
> And what is your basic approach for this problem?
>
> Just mentioning the implementation is not too helpful for initial
> review, cause the problem and approach(correctness) need to be
> understood first.

Of course, I'll add following if there will be a v3;

Current problems:

If there are multiple active_queues, then tag is fair shared to each
queue, and if one queue is not busy(for example, only issue one IO once
for a while), then shared tags for this queue is wasted and can't be
used for other queues.

Depends on the hardware, this might casue performance problems in some
user case. For example, as reported by [1], UFS devices
have multiple logical units. One of these logical units (WLUN) is used
to submit control commands, e.g. START STOP UNIT. If any request is
submitted to the WLUN, the queue depth is reduced from 31 to 15 or
lower for data LUNs.

This patchset first delay tag sharing from issue IO to failed to get
driver tag; then add a counter to record how many times shared queue
failed to get driver tag to indicate if the queue is busy; finially,
allow busy queue to borrow more tags from idle queue.

Thanks,
Kuai

>
> Thanks,
> Ming
>
> .
>

2023-10-23 20:47:19

by Bart Van Assche

[permalink] [raw]
Subject: Re: [PATCH RFC v2 8/8] blk-mq-tag: allow shared queue/hctx to get more driver tags

On 10/21/23 08:48, Yu Kuai wrote:
> + if (!busy) {
> + ctl->timer_running = false;
> + } else {
> + ctl->timer.expires = jiffies + HZ;
> + add_timer(&ctl->timer);
> + }
> + spin_unlock_irq(&tags->lock);

[ ... ]

> +inc_busy:
> + atomic_inc(&info->busy_count);
> + if (!tags->ctl.timer_running &&
> + try_cmpxchg_relaxed(&tags->ctl.timer_running, &timer_running, true)) {
> + tags->ctl.timer.expires = jiffies + HZ;
> + add_timer(&tags->ctl.timer);
> + }
> }

So the choice has been made to let the timer expire after one second? I
think the choice of the timer value is important enough to mention it in
the patch description.

Thanks,

Bart.

2023-10-24 01:08:01

by Yu Kuai

[permalink] [raw]
Subject: Re: [PATCH RFC v2 8/8] blk-mq-tag: allow shared queue/hctx to get more driver tags

Hi,

在 2023/10/24 4:46, Bart Van Assche 写道:
> On 10/21/23 08:48, Yu Kuai wrote:
>> +    if (!busy) {
>> +        ctl->timer_running = false;
>> +    } else {
>> +        ctl->timer.expires = jiffies + HZ;
>> +        add_timer(&ctl->timer);
>> +    }
>> +    spin_unlock_irq(&tags->lock);
>
> [ ... ]
>
>> +inc_busy:
>> +    atomic_inc(&info->busy_count);
>> +    if (!tags->ctl.timer_running &&
>> +        try_cmpxchg_relaxed(&tags->ctl.timer_running, &timer_running,
>> true)) {
>> +        tags->ctl.timer.expires = jiffies + HZ;
>> +        add_timer(&tags->ctl.timer);
>> +    }
>>   }
>
> So the choice has been made to let the timer expire after one second? I
> think the choice of the timer value is important enough to mention it in
> the patch description.

Yes, I agree, I admit that I was not that cautious while choose 1 HZ in
this version, I'm not quite sure is this approch will be accepted yet,
while this is the best(simple and workable) way that I can think of.

Thanks,
Kuai

>
> Thanks,
>
> Bart.
>
>
> .
>