2023-06-18 08:10:35

by Yu Kuai

Subject: [PATCH RFC 0/7] blk-mq: improve tag fair sharing

From: Yu Kuai <[email protected]>

This is not a formal version and is not fully tested. I am sending this
RFC because I want to find out whether people think doing this is
meaningful before I spend too much time on it.

This patchset tries to refactor how tags are shared:
- patch 2 delays tag sharing from I/O issue time to the point where
getting a driver tag fails;
- patch 3 adds support for tracking which queues/hctxs are sharing tags
through blk_mq_tags;
- patch 4 moves the calculation of how many tags are available from
hctx_may_queue() to the points where a queue/hctx starts or stops
sharing tags;
- patch 5 records how many times getting a driver tag has failed
recently, and patch 7 uses this information to adjust the available tags
for each shared queue.

Yu Kuai (7):
blk-mq: factor out a structure from blk_mq_tags to control tag sharing
blk-mq: delay tag fair sharing until fail to get driver tag
blk-mq: support to track active queues from blk_mq_tags
blk-mq: precalculate available tags for hctx_may_queue()
blk-mq: record the number of times fail to get driver tag while
sharing tags
blk-mq: move active request counter to struct tag_sharing
blk-mq: allow shared queue to get more driver tags

block/blk-core.c | 2 -
block/blk-mq-debugfs.c | 6 +-
block/blk-mq-tag.c | 154 ++++++++++++++++++++++++++++++++++++++---
block/blk-mq.c | 10 ++-
block/blk-mq.h | 39 ++++++-----
include/linux/blk-mq.h | 24 ++++---
include/linux/blkdev.h | 13 +++-
7 files changed, 201 insertions(+), 47 deletions(-)

--
2.39.2



2023-06-18 08:10:35

by Yu Kuai

Subject: [PATCH RFC 2/7] blk-mq: delay tag fair sharing until fail to get driver tag

From: Yu Kuai <[email protected]>

Start tag fair sharing when a device start to issue io will waste
resources, same number of tags will be assigned to each disk/hctx,
and such tags can't be used for other disk/hctx, which means a disk/hctx
can't use more than assinged tags even if there are still lots of tags
that is assinged to other disks are unused.

Add a new API, blk_mq_driver_tag_busy(), which is called when getting a
driver tag fails, and move tag sharing from blk_mq_tag_busy() to
blk_mq_driver_tag_busy().

This approach works well as long as the total tags are not exhausted;
follow-up patches will refactor how tags are shared to handle that case.

Signed-off-by: Yu Kuai <[email protected]>
---
block/blk-mq-debugfs.c | 4 ++-
block/blk-mq-tag.c | 60 ++++++++++++++++++++++++++++++++++--------
block/blk-mq.c | 4 ++-
block/blk-mq.h | 13 ++++++---
include/linux/blk-mq.h | 6 +++--
include/linux/blkdev.h | 1 +
6 files changed, 70 insertions(+), 18 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 431aaa3eb181..de5a911b07c2 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -400,8 +400,10 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m,
{
seq_printf(m, "nr_tags=%u\n", tags->nr_tags);
seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags);
- seq_printf(m, "active_queues=%d\n",
+ seq_printf(m, "active_queues=%u\n",
READ_ONCE(tags->ctl.active_queues));
+ seq_printf(m, "share_queues=%u\n",
+ READ_ONCE(tags->ctl.share_queues));

seq_puts(m, "\nbitmap_tags:\n");
sbitmap_queue_show(&tags->bitmap_tags, m);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index fe41a0d34fc0..1c2bde917195 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -29,6 +29,32 @@ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
users);
}

+void __blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx)
+{
+ struct blk_mq_tags *tags = hctx->tags;
+
+ /*
+ * calling test_bit() prior to test_and_set_bit() is intentional,
+ * it avoids dirtying the cacheline if the queue is already active.
+ */
+ if (blk_mq_is_shared_tags(hctx->flags)) {
+ struct request_queue *q = hctx->queue;
+
+ if (test_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags) ||
+ test_and_set_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags))
+ return;
+ } else {
+ if (test_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state) ||
+ test_and_set_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state))
+ return;
+ }
+
+ spin_lock_irq(&tags->lock);
+ WRITE_ONCE(tags->ctl.share_queues, tags->ctl.active_queues);
+ blk_mq_update_wake_batch(tags, tags->ctl.share_queues);
+ spin_unlock_irq(&tags->lock);
+}
+
/*
* If a previously inactive queue goes active, bump the active user count.
* We need to do this before try to allocate driver tag, then even if fail
@@ -37,7 +63,6 @@ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
*/
void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
- unsigned int users;
struct blk_mq_tags *tags = hctx->tags;

/*
@@ -57,9 +82,7 @@ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
}

spin_lock_irq(&tags->lock);
- users = tags->ctl.active_queues + 1;
- WRITE_ONCE(tags->ctl.active_queues, users);
- blk_mq_update_wake_batch(tags, users);
+ WRITE_ONCE(tags->ctl.active_queues, tags->ctl.active_queues + 1);
spin_unlock_irq(&tags->lock);
}

@@ -73,6 +96,14 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
sbitmap_queue_wake_all(&tags->breserved_tags);
}

+static void __blk_mq_driver_tag_idle(struct blk_mq_hw_ctx *hctx)
+{
+ if (blk_mq_is_shared_tags(hctx->flags))
+ clear_bit(QUEUE_FLAG_HCTX_BUSY, &hctx->queue->queue_flags);
+ else
+ clear_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state);
+}
+
/*
* If a previously busy queue goes inactive, potential waiters could now
* be allowed to queue. Wake them up and check.
@@ -80,7 +111,6 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
struct blk_mq_tags *tags = hctx->tags;
- unsigned int users;

if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
@@ -94,9 +124,10 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
}

spin_lock_irq(&tags->lock);
- users = tags->ctl.active_queues - 1;
- WRITE_ONCE(tags->ctl.active_queues, users);
- blk_mq_update_wake_batch(tags, users);
+ __blk_mq_driver_tag_idle(hctx);
+ WRITE_ONCE(tags->ctl.active_queues, tags->ctl.active_queues - 1);
+ WRITE_ONCE(tags->ctl.share_queues, tags->ctl.active_queues);
+ blk_mq_update_wake_batch(tags, tags->ctl.share_queues);
spin_unlock_irq(&tags->lock);

blk_mq_tag_wakeup_all(tags, false);
@@ -105,14 +136,21 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
struct sbitmap_queue *bt)
{
+ int ret = BLK_MQ_NO_TAG;
+
if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) &&
!hctx_may_queue(data->hctx, bt))
- return BLK_MQ_NO_TAG;
+ goto out;

+ /* shallow_depth is only used for elevator */
if (data->shallow_depth)
return sbitmap_queue_get_shallow(bt, data->shallow_depth);
- else
- return __sbitmap_queue_get(bt);
+
+ ret = __sbitmap_queue_get(bt);
+out:
+ if (ret == BLK_MQ_NO_TAG && !(data->rq_flags & RQF_SCHED_TAGS))
+ blk_mq_driver_tag_busy(data->hctx);
+ return ret;
}

unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index da650a2c4ca1..171ee4ac97ef 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1753,8 +1753,10 @@ static bool __blk_mq_alloc_driver_tag(struct request *rq)

bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
- if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq))
+ if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq)) {
+ blk_mq_driver_tag_busy(hctx);
return false;
+ }

if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
!(rq->rq_flags & RQF_MQ_INFLIGHT)) {
diff --git a/block/blk-mq.h b/block/blk-mq.h
index ca1c13127868..01441a5e9910 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -193,8 +193,9 @@ static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
return sbq_wait_ptr(bt, &hctx->wait_index);
}

-void __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
-void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
+void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx);
+void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx);
+void __blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx);

static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
@@ -208,6 +209,12 @@ static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
__blk_mq_tag_idle(hctx);
}

+static inline void blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx)
+{
+ if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+ __blk_mq_driver_tag_busy(hctx);
+}
+
static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
unsigned int tag)
{
@@ -412,7 +419,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
return true;
}

- users = READ_ONCE(hctx->tags->ctl.active_queues);
+ users = READ_ONCE(hctx->tags->ctl.share_queues);
if (!users)
return true;

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 8d2cd6b9d305..bc3ac22edb07 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -675,10 +675,11 @@ enum {

BLK_MQ_S_STOPPED = 0,
BLK_MQ_S_TAG_ACTIVE = 1,
- BLK_MQ_S_SCHED_RESTART = 2,
+ BLK_MQ_S_DTAG_BUSY = 2,
+ BLK_MQ_S_SCHED_RESTART = 3,

/* hw queue is inactive after all its CPUs become offline */
- BLK_MQ_S_INACTIVE = 3,
+ BLK_MQ_S_INACTIVE = 4,

BLK_MQ_MAX_DEPTH = 10240,

@@ -735,6 +736,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,

struct tag_sharing_ctl {
unsigned int active_queues;
+ unsigned int share_queues;
};

/*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ed44a997f629..0994707f6a68 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -546,6 +546,7 @@ struct request_queue {
#define QUEUE_FLAG_DAX 19 /* device supports DAX */
#define QUEUE_FLAG_STATS 20 /* track IO start and completion times */
#define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */
+#define QUEUE_FLAG_HCTX_BUSY 23 /* at least one blk-mq hctx failed to get driver tag */
#define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */
#define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */
#define QUEUE_FLAG_ZONE_RESETALL 26 /* supports Zone Reset All */
--
2.39.2


2023-06-18 08:10:41

by Yu Kuai

Subject: [PATCH RFC 3/7] blk-mq: support to track active queues from blk_mq_tags

From: Yu Kuai <[email protected]>

In order to refactor how tags are shared, it's necessary to track some
information for each disk/hctx, so that more tags can be assigned to the
ones under higher pressure.

This prepares for refactoring tag sharing.

Signed-off-by: Yu Kuai <[email protected]>
---
block/blk-mq-tag.c | 13 +++++++++++++
include/linux/blk-mq.h | 2 ++
include/linux/blkdev.h | 5 +++++
3 files changed, 20 insertions(+)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 1c2bde917195..8c527e68d4e4 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -64,6 +64,7 @@ void __blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx)
void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
struct blk_mq_tags *tags = hctx->tags;
+ struct tag_sharing *tag_sharing;

/*
* calling test_bit() prior to test_and_set_bit() is intentional,
@@ -75,13 +76,18 @@ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) ||
test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
return;
+
+ tag_sharing = &q->tag_sharing;
} else {
if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) ||
test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return;
+
+ tag_sharing = &hctx->tag_sharing;
}

spin_lock_irq(&tags->lock);
+ list_add(&tag_sharing->node, &tags->ctl.head);
WRITE_ONCE(tags->ctl.active_queues, tags->ctl.active_queues + 1);
spin_unlock_irq(&tags->lock);
}
@@ -111,6 +117,7 @@ static void __blk_mq_driver_tag_idle(struct blk_mq_hw_ctx *hctx)
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
struct blk_mq_tags *tags = hctx->tags;
+ struct tag_sharing *tag_sharing;

if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
@@ -118,12 +125,17 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,
&q->queue_flags))
return;
+
+ tag_sharing = &q->tag_sharing;
} else {
if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return;
+
+ tag_sharing = &hctx->tag_sharing;
}

spin_lock_irq(&tags->lock);
+ list_del_init(&tag_sharing->node);
__blk_mq_driver_tag_idle(hctx);
WRITE_ONCE(tags->ctl.active_queues, tags->ctl.active_queues - 1);
WRITE_ONCE(tags->ctl.share_queues, tags->ctl.active_queues);
@@ -619,6 +631,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
tags->nr_tags = total_tags;
tags->nr_reserved_tags = reserved_tags;
spin_lock_init(&tags->lock);
+ INIT_LIST_HEAD(&tags->ctl.head);

if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags,
total_tags, reserved_tags, node,
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index bc3ac22edb07..639d618e6ca8 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -390,6 +390,7 @@ struct blk_mq_hw_ctx {
* assigned when a request is dispatched from a hardware queue.
*/
struct blk_mq_tags *tags;
+ struct tag_sharing tag_sharing;
/**
* @sched_tags: Tags owned by I/O scheduler. If there is an I/O
* scheduler associated with a request queue, a tag is assigned when
@@ -737,6 +738,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
struct tag_sharing_ctl {
unsigned int active_queues;
unsigned int share_queues;
+ struct list_head head;
};

/*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0994707f6a68..62f8fcc20c30 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -375,6 +375,10 @@ struct blk_independent_access_ranges {
struct blk_independent_access_range ia_range[];
};

+struct tag_sharing {
+ struct list_head node;
+};
+
struct request_queue {
struct request *last_merge;
struct elevator_queue *elevator;
@@ -513,6 +517,7 @@ struct request_queue {

struct blk_mq_tag_set *tag_set;
struct list_head tag_set_list;
+ struct tag_sharing tag_sharing;

struct dentry *debugfs_dir;
struct dentry *sched_debugfs_dir;
--
2.39.2


2023-06-18 08:14:35

by Yu Kuai

Subject: [PATCH RFC 6/7] blk-mq: move active request counter to struct tag_sharing

From: Yu Kuai <[email protected]>

Now that there is a separate structure to control tag sharing, it makes
sense to move the active request counter into that structure. There are
no functional changes.

Signed-off-by: Yu Kuai <[email protected]>
---
block/blk-core.c | 2 --
block/blk-mq.c | 3 ++-
block/blk-mq.h | 22 +++++++++++-----------
include/linux/blk-mq.h | 6 ------
include/linux/blkdev.h | 3 +--
5 files changed, 14 insertions(+), 22 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 99d8b9812b18..f2077ee32a99 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -413,8 +413,6 @@ struct request_queue *blk_alloc_queue(int node_id)

q->node = node_id;

- atomic_set(&q->nr_active_requests_shared_tags, 0);
-
timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
INIT_WORK(&q->timeout_work, blk_timeout_work);
INIT_LIST_HEAD(&q->icq_list);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 771802ff1d45..91020cd2f6bf 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3661,7 +3661,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
goto free_hctx;

- atomic_set(&hctx->nr_active, 0);
+ atomic_set(&hctx->tag_sharing.active_tags, 0);
if (node == NUMA_NO_NODE)
node = set->numa_node;
hctx->numa_node = node;
@@ -4237,6 +4237,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,

q->nr_requests = set->queue_depth;
q->tag_sharing.available_tags = set->queue_depth;
+ atomic_set(&q->tag_sharing.active_tags, 0);

blk_mq_init_cpu_queues(q, set->nr_hw_queues);
blk_mq_add_queue_tag_set(set, q);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index fcfb040efbbd..c8923a8565b5 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -281,18 +281,18 @@ static inline int blk_mq_get_rq_budget_token(struct request *rq)
static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
{
if (blk_mq_is_shared_tags(hctx->flags))
- atomic_inc(&hctx->queue->nr_active_requests_shared_tags);
+ atomic_inc(&hctx->queue->tag_sharing.active_tags);
else
- atomic_inc(&hctx->nr_active);
+ atomic_inc(&hctx->tag_sharing.active_tags);
}

static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx,
int val)
{
if (blk_mq_is_shared_tags(hctx->flags))
- atomic_sub(val, &hctx->queue->nr_active_requests_shared_tags);
+ atomic_sub(val, &hctx->queue->tag_sharing.active_tags);
else
- atomic_sub(val, &hctx->nr_active);
+ atomic_sub(val, &hctx->tag_sharing.active_tags);
}

static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
@@ -303,8 +303,8 @@ static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx)
{
if (blk_mq_is_shared_tags(hctx->flags))
- return atomic_read(&hctx->queue->nr_active_requests_shared_tags);
- return atomic_read(&hctx->nr_active);
+ return atomic_read(&hctx->queue->tag_sharing.active_tags);
+ return atomic_read(&hctx->tag_sharing.active_tags);
}
static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
struct request *rq)
@@ -398,7 +398,7 @@ static inline void blk_mq_free_requests(struct list_head *list)
static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
struct sbitmap_queue *bt)
{
- unsigned int depth;
+ struct tag_sharing *tag_sharing;

if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
return true;
@@ -415,15 +415,15 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
return true;

- depth = READ_ONCE(q->tag_sharing.available_tags);
+ tag_sharing = &q->tag_sharing;
} else {
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return true;
-
- depth = READ_ONCE(hctx->tag_sharing.available_tags);
+ tag_sharing = &hctx->tag_sharing;
}

- return __blk_mq_active_requests(hctx) < depth;
+ return atomic_read(&tag_sharing->active_tags) <
+ READ_ONCE(tag_sharing->available_tags);
}

/* run the code block in @dispatch_ops with rcu/srcu read lock held */
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 639d618e6ca8..fdfa63b76136 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -408,12 +408,6 @@ struct blk_mq_hw_ctx {
/** @queue_num: Index of this hardware queue. */
unsigned int queue_num;

- /**
- * @nr_active: Number of active requests. Only used when a tag set is
- * shared across request queues.
- */
- atomic_t nr_active;
-
/** @cpuhp_online: List to store request if CPU is going to die */
struct hlist_node cpuhp_online;
/** @cpuhp_dead: List to store request if some CPU die. */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f3faaf5f6504..0d25e7d2a94c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -378,6 +378,7 @@ struct blk_independent_access_ranges {
struct tag_sharing {
struct list_head node;
unsigned int available_tags;
+ atomic_t active_tags;
atomic_t fail_count;
unsigned long period;
};
@@ -462,8 +463,6 @@ struct request_queue {
struct timer_list timeout;
struct work_struct timeout_work;

- atomic_t nr_active_requests_shared_tags;
-
struct blk_mq_tags *sched_shared_tags;

struct list_head icq_list;
--
2.39.2


2023-06-18 08:39:51

by Yu Kuai

Subject: [PATCH RFC 1/7] blk-mq: factor out a structure from blk_mq_tags to control tag sharing

From: Yu Kuai <[email protected]>

Currently tags is fair shared, and the new structure contains only one
field active_queues. There are no functional changes and prepare to
refactor tag sharing.

Signed-off-by: Yu Kuai <[email protected]>
---
block/blk-mq-debugfs.c | 2 +-
block/blk-mq-tag.c | 8 ++++----
block/blk-mq.h | 2 +-
include/linux/blk-mq.h | 10 ++++++++--
4 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index c3b5930106b2..431aaa3eb181 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -401,7 +401,7 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m,
seq_printf(m, "nr_tags=%u\n", tags->nr_tags);
seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags);
seq_printf(m, "active_queues=%d\n",
- READ_ONCE(tags->active_queues));
+ READ_ONCE(tags->ctl.active_queues));

seq_puts(m, "\nbitmap_tags:\n");
sbitmap_queue_show(&tags->bitmap_tags, m);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index cc57e2dd9a0b..fe41a0d34fc0 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -57,8 +57,8 @@ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
}

spin_lock_irq(&tags->lock);
- users = tags->active_queues + 1;
- WRITE_ONCE(tags->active_queues, users);
+ users = tags->ctl.active_queues + 1;
+ WRITE_ONCE(tags->ctl.active_queues, users);
blk_mq_update_wake_batch(tags, users);
spin_unlock_irq(&tags->lock);
}
@@ -94,8 +94,8 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
}

spin_lock_irq(&tags->lock);
- users = tags->active_queues - 1;
- WRITE_ONCE(tags->active_queues, users);
+ users = tags->ctl.active_queues - 1;
+ WRITE_ONCE(tags->ctl.active_queues, users);
blk_mq_update_wake_batch(tags, users);
spin_unlock_irq(&tags->lock);

diff --git a/block/blk-mq.h b/block/blk-mq.h
index 1743857e0b01..ca1c13127868 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -412,7 +412,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
return true;
}

- users = READ_ONCE(hctx->tags->active_queues);
+ users = READ_ONCE(hctx->tags->ctl.active_queues);
if (!users)
return true;

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index f401067ac03a..8d2cd6b9d305 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -733,13 +733,16 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
blk_opf_t opf, blk_mq_req_flags_t flags,
unsigned int hctx_idx);

+struct tag_sharing_ctl {
+ unsigned int active_queues;
+};
+
/*
* Tag address space map.
*/
struct blk_mq_tags {
unsigned int nr_tags;
unsigned int nr_reserved_tags;
- unsigned int active_queues;

struct sbitmap_queue bitmap_tags;
struct sbitmap_queue breserved_tags;
@@ -750,9 +753,12 @@ struct blk_mq_tags {

/*
* used to clear request reference in rqs[] before freeing one
- * request pool
+ * request pool, and to protect tag_sharing_ctl.
*/
spinlock_t lock;
+
+ /* used when tags is shared for multiple request_queue or hctx. */
+ struct tag_sharing_ctl ctl;
};

static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags,
--
2.39.2


2023-06-18 08:55:54

by Yu Kuai

Subject: [PATCH RFC 5/7] blk-mq: record the number of times fail to get driver tag while sharing tags

From: Yu Kuai <[email protected]>

Add an atomic counter to record the number of such failures. This
counter will be used to adjust the number of tags assigned to active
queues, and it decays every second so that it only represents recent
I/O pressure.

Signed-off-by: Yu Kuai <[email protected]>
---
block/blk-mq-tag.c | 22 ++++++++++++++++++++--
include/linux/blkdev.h | 2 ++
2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index e0137206c02b..5e5742c7277a 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -45,6 +45,17 @@ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
users);
}

+static void update_tag_sharing_busy(struct tag_sharing *tag_sharing)
+{
+ unsigned int count = atomic_inc_return(&tag_sharing->fail_count);
+ unsigned long last_period = READ_ONCE(tag_sharing->period);
+
+ if (time_after(jiffies, last_period + HZ) &&
+ cmpxchg_relaxed(&tag_sharing->period, last_period, jiffies) ==
+ last_period)
+ atomic_sub(count / 2, &tag_sharing->fail_count);
+}
+
void __blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx)
{
struct blk_mq_tags *tags = hctx->tags;
@@ -57,12 +68,16 @@ void __blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx)
struct request_queue *q = hctx->queue;

if (test_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags) ||
- test_and_set_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags))
+ test_and_set_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags)) {
+ update_tag_sharing_busy(&q->tag_sharing);
return;
+ }
} else {
if (test_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state) ||
- test_and_set_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state))
+ test_and_set_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state)) {
+ update_tag_sharing_busy(&hctx->tag_sharing);
return;
+ }
}

spin_lock_irq(&tags->lock);
@@ -152,8 +167,11 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
}

spin_lock_irq(&tags->lock);
+
list_del_init(&tag_sharing->node);
tag_sharing->available_tags = tags->nr_tags;
+ atomic_set(&tag_sharing->fail_count, 0);
+
__blk_mq_driver_tag_idle(hctx);
WRITE_ONCE(tags->ctl.active_queues, tags->ctl.active_queues - 1);
WRITE_ONCE(tags->ctl.share_queues, tags->ctl.active_queues);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e5111bedfd8d..f3faaf5f6504 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -378,6 +378,8 @@ struct blk_independent_access_ranges {
struct tag_sharing {
struct list_head node;
unsigned int available_tags;
+ atomic_t fail_count;
+ unsigned long period;
};

struct request_queue {
--
2.39.2


2023-06-19 06:19:52

by Hannes Reinecke

Subject: Re: [PATCH RFC 2/7] blk-mq: delay tag fair sharing until fail to get driver tag

On 6/18/23 18:07, Yu Kuai wrote:
> From: Yu Kuai <[email protected]>
>
> Start tag fair sharing when a device start to issue io will waste
> resources, same number of tags will be assigned to each disk/hctx,
> and such tags can't be used for other disk/hctx, which means a disk/hctx
> can't use more than assinged tags even if there are still lots of tags
> that is assinged to other disks are unused.
>
> Add a new API, blk_mq_driver_tag_busy(), which is called when getting a
> driver tag fails, and move tag sharing from blk_mq_tag_busy() to
> blk_mq_driver_tag_busy().
>
> This approach works well as long as the total tags are not exhausted;
> follow-up patches will refactor how tags are shared to handle that case.
>
> Signed-off-by: Yu Kuai <[email protected]>
> ---
> block/blk-mq-debugfs.c | 4 ++-
> block/blk-mq-tag.c | 60 ++++++++++++++++++++++++++++++++++--------
> block/blk-mq.c | 4 ++-
> block/blk-mq.h | 13 ++++++---
> include/linux/blk-mq.h | 6 +++--
> include/linux/blkdev.h | 1 +
> 6 files changed, 70 insertions(+), 18 deletions(-)
>
> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> index 431aaa3eb181..de5a911b07c2 100644
> --- a/block/blk-mq-debugfs.c
> +++ b/block/blk-mq-debugfs.c
> @@ -400,8 +400,10 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m,
> {
> seq_printf(m, "nr_tags=%u\n", tags->nr_tags);
> seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags);
> - seq_printf(m, "active_queues=%d\n",
> + seq_printf(m, "active_queues=%u\n",
> READ_ONCE(tags->ctl.active_queues));
> + seq_printf(m, "share_queues=%u\n",
> + READ_ONCE(tags->ctl.share_queues));
>
> seq_puts(m, "\nbitmap_tags:\n");
> sbitmap_queue_show(&tags->bitmap_tags, m);
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index fe41a0d34fc0..1c2bde917195 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -29,6 +29,32 @@ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
> users);
> }
>
> +void __blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx)
> +{
> + struct blk_mq_tags *tags = hctx->tags;
> +
> + /*
> + * calling test_bit() prior to test_and_set_bit() is intentional,
> + * it avoids dirtying the cacheline if the queue is already active.
> + */
> + if (blk_mq_is_shared_tags(hctx->flags)) {
> + struct request_queue *q = hctx->queue;
> +
> + if (test_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags) ||
> + test_and_set_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags))
> + return;
> + } else {
> + if (test_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state) ||
> + test_and_set_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state))
> + return;
> + }
> +
> + spin_lock_irq(&tags->lock);
> + WRITE_ONCE(tags->ctl.share_queues, tags->ctl.active_queues);
> + blk_mq_update_wake_batch(tags, tags->ctl.share_queues);
> + spin_unlock_irq(&tags->lock);
> +}
> +
> /*
> * If a previously inactive queue goes active, bump the active user count.
> * We need to do this before try to allocate driver tag, then even if fail
> @@ -37,7 +63,6 @@ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
> */
> void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
> {
> - unsigned int users;
> struct blk_mq_tags *tags = hctx->tags;
>
> /*
> @@ -57,9 +82,7 @@ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
> }
>
> spin_lock_irq(&tags->lock);
> - users = tags->ctl.active_queues + 1;
> - WRITE_ONCE(tags->ctl.active_queues, users);
> - blk_mq_update_wake_batch(tags, users);
> + WRITE_ONCE(tags->ctl.active_queues, tags->ctl.active_queues + 1);

Why did you remove the call to blk_mq_update_wake_batch() here?

Cheers,

Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
[email protected] +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Ivo Totev, Andrew
Myers, Andrew McDonald, Martje Boudien Moerman


2023-06-19 06:54:03

by Yu Kuai

Subject: Re: [PATCH RFC 2/7] blk-mq: delay tag fair sharing until fail to get driver tag

Hi,

On 2023/06/19 13:55, Hannes Reinecke wrote:
> On 6/18/23 18:07, Yu Kuai wrote:
>> From: Yu Kuai <[email protected]>
>>
>> Start tag fair sharing when a device start to issue io will waste
>> resources, same number of tags will be assigned to each disk/hctx,
>> and such tags can't be used for other disk/hctx, which means a disk/hctx
>> can't use more than assinged tags even if there are still lots of tags
>> that is assinged to other disks are unused.
>>
>> Add a new API, blk_mq_driver_tag_busy(), which is called when getting a
>> driver tag fails, and move tag sharing from blk_mq_tag_busy() to
>> blk_mq_driver_tag_busy().
>>
>> This approach works well as long as the total tags are not exhausted;
>> follow-up patches will refactor how tags are shared to handle that case.
>>
>> Signed-off-by: Yu Kuai <[email protected]>
>> ---
>>   block/blk-mq-debugfs.c |  4 ++-
>>   block/blk-mq-tag.c     | 60 ++++++++++++++++++++++++++++++++++--------
>>   block/blk-mq.c         |  4 ++-
>>   block/blk-mq.h         | 13 ++++++---
>>   include/linux/blk-mq.h |  6 +++--
>>   include/linux/blkdev.h |  1 +
>>   6 files changed, 70 insertions(+), 18 deletions(-)
>>
>> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
>> index 431aaa3eb181..de5a911b07c2 100644
>> --- a/block/blk-mq-debugfs.c
>> +++ b/block/blk-mq-debugfs.c
>> @@ -400,8 +400,10 @@ static void blk_mq_debugfs_tags_show(struct
>> seq_file *m,
>>   {
>>       seq_printf(m, "nr_tags=%u\n", tags->nr_tags);
>>       seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags);
>> -    seq_printf(m, "active_queues=%d\n",
>> +    seq_printf(m, "active_queues=%u\n",
>>              READ_ONCE(tags->ctl.active_queues));
>> +    seq_printf(m, "share_queues=%u\n",
>> +           READ_ONCE(tags->ctl.share_queues));
>>       seq_puts(m, "\nbitmap_tags:\n");
>>       sbitmap_queue_show(&tags->bitmap_tags, m);
>> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
>> index fe41a0d34fc0..1c2bde917195 100644
>> --- a/block/blk-mq-tag.c
>> +++ b/block/blk-mq-tag.c
>> @@ -29,6 +29,32 @@ static void blk_mq_update_wake_batch(struct
>> blk_mq_tags *tags,
>>               users);
>>   }
>> +void __blk_mq_driver_tag_busy(struct blk_mq_hw_ctx *hctx)
>> +{
>> +    struct blk_mq_tags *tags = hctx->tags;
>> +
>> +    /*
>> +     * calling test_bit() prior to test_and_set_bit() is intentional,
>> +     * it avoids dirtying the cacheline if the queue is already active.
>> +     */
>> +    if (blk_mq_is_shared_tags(hctx->flags)) {
>> +        struct request_queue *q = hctx->queue;
>> +
>> +        if (test_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags) ||
>> +            test_and_set_bit(QUEUE_FLAG_HCTX_BUSY, &q->queue_flags))
>> +            return;
>> +    } else {
>> +        if (test_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state) ||
>> +            test_and_set_bit(BLK_MQ_S_DTAG_BUSY, &hctx->state))
>> +            return;
>> +    }
>> +
>> +    spin_lock_irq(&tags->lock);
>> +    WRITE_ONCE(tags->ctl.share_queues, tags->ctl.active_queues);
>> +    blk_mq_update_wake_batch(tags, tags->ctl.share_queues);
>> +    spin_unlock_irq(&tags->lock);
>> +}
>> +
>>   /*
>>    * If a previously inactive queue goes active, bump the active user
>> count.
>>    * We need to do this before try to allocate driver tag, then even
>> if fail
>> @@ -37,7 +63,6 @@ static void blk_mq_update_wake_batch(struct
>> blk_mq_tags *tags,
>>    */
>>   void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
>>   {
>> -    unsigned int users;
>>       struct blk_mq_tags *tags = hctx->tags;
>>       /*
>> @@ -57,9 +82,7 @@ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
>>       }
>>       spin_lock_irq(&tags->lock);
>> -    users = tags->ctl.active_queues + 1;
>> -    WRITE_ONCE(tags->ctl.active_queues, users);
>> -    blk_mq_update_wake_batch(tags, users);
>> +    WRITE_ONCE(tags->ctl.active_queues, tags->ctl.active_queues + 1);
>
> Why did you remove the call to blk_mq_update_wake_batch() here?

blk_mq_update_wake_batch() should be called when the number of available
tags changes. However, active_queues is no longer used by
hctx_may_queue() to calculate the available tags; share_queues is used
instead, and it is updated in the new helper blk_mq_driver_tag_busy().
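
In other words, after this patch hctx_may_queue() bases its limit on
share_queues:

	users = READ_ONCE(hctx->tags->ctl.share_queues);

so the wake batch only needs to be recomputed where share_queues
changes, i.e. in __blk_mq_driver_tag_busy() and __blk_mq_tag_idle().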

Thanks,
Kuai
>
> Cheers,
>
> Hannes


2023-06-20 15:58:45

by Bart Van Assche

Subject: Re: [PATCH RFC 0/7] blk-mq: improve tag fair sharing

On 6/18/23 09:07, Yu Kuai wrote:
> This is not a formal version and is not fully tested. I am sending this
> RFC because I want to find out whether people think doing this is
> meaningful before I spend too much time on it.
The approach looks good to me but I'd like to hear from Jens and
Christoph what their opinion is about the approach of this patch series
before doing an in-depth review.

Thanks,

Bart.

2023-07-03 13:54:10

by Yu Kuai

Subject: Re: [PATCH RFC 0/7] blk-mq: improve tag fair sharing

Hi Christoph, Jens, and anyone who's interested,

On 2023/06/20 23:20, Bart Van Assche wrote:
> On 6/18/23 09:07, Yu Kuai wrote:
>> This is not a formal version and is not fully tested. I am sending this
>> RFC because I want to find out whether people think doing this is
>> meaningful before I spend too much time on it.
> The approach looks good to me but I'd like to hear from Jens and
> Christoph what their opinion is about the approach of this patch series
> before doing an in-depth review.
>
Any suggestions on this topic? It would be great to hear whether anyone
thinks it's meaningful to refactor tag fair sharing.

Thanks,
Kuai
> Thanks,
>
> Bart.


2023-07-03 18:16:21

by Bart Van Assche

Subject: Re: [PATCH RFC 0/7] blk-mq: improve tag fair sharing

On 7/3/23 06:29, Yu Kuai wrote:
> On 2023/06/20 23:20, Bart Van Assche wrote:
>> On 6/18/23 09:07, Yu Kuai wrote:
>>> This is not a formal version and is not fully tested. I am sending this
>>> RFC because I want to find out whether people think doing this is
>>> meaningful before I spend too much time on it.
>> The approach looks good to me but I'd like to hear from Jens and
>> Christoph what their opinion is about the approach of this patch
>> series before doing an in-depth review.
>>
> Any suggestions on this topic? It would be great to hear whether anyone
> thinks it's meaningful to refactor tag fair sharing.

The cover letter of this patch series says "This is not a formal version
and is not fully tested". Once a fully tested version is posted, I will
help with an in-depth review.

Thanks,

Bart.


2023-07-05 03:50:58

by Yu Kuai

Subject: Re: [PATCH RFC 0/7] blk-mq: improve tag fair sharing

Hi,

On 2023/07/04 2:08, Bart Van Assche wrote:
> On 7/3/23 06:29, Yu Kuai wrote:
>> On 2023/06/20 23:20, Bart Van Assche wrote:
>>> On 6/18/23 09:07, Yu Kuai wrote:
>>>> This is not a formal version and is not fully tested. I am sending this
>>>> RFC because I want to find out whether people think doing this is
>>>> meaningful before I spend too much time on it.
>>> The approach looks good to me but I'd like to hear from Jens and
>>> Christoph what their opinion is about the approach of this patch
>>> series before doing an in-depth review.
>>>
>> Any suggestions on this topic? It would be great to hear whether anyone
>> thinks it's meaningful to refactor tag fair sharing.
>
> The cover letter of this patch series says "This is not a formal version
> and is not fully tested". Once a fully tested version is posted, I will
> help with an in-depth review.

Thanks for taking the time to look at this patchset. Do you think I need
to do the following for the formal version, or can these improvements be
done later?

- The current algorithm for borrowing tags in patch 7 is simple and
straightforward. It should work fine in simple cases like the one you
reported for UFS, but it should be improved for more complicated cases.
- Currently borrowed tags are never returned until the queue is idle. I
should figure out a way to return borrowed tags when a queue is not
busy, so that other queues can borrow tags from it.

Thanks,
Kuai

>
> Thanks,
>
> Bart.


2023-07-06 18:03:03

by Bart Van Assche

Subject: Re: [PATCH RFC 1/7] blk-mq: factor out a structure from blk_mq_tags to control tag sharing

On 6/18/23 09:07, Yu Kuai wrote:
> Currently tags is fair shared, and the new structure contains only one
> field active_queues. There are no functional changes and prepare to
> refactor tag sharing.

s/tags is fair shared, and the/tags are fairly shared. The/
s/changes and prepare to/changes. This patch prepares for/
s/refactor tag sharing/refactoring tag sharing/

Please make the subject of this patch shorter, e.g. "Refactor struct
blk_mq_tags". Otherwise this patch looks good to me.

Thanks,

Bart.

2023-07-06 18:27:27

by Bart Van Assche

Subject: Re: [PATCH RFC 3/7] blk-mq: support to track active queues from blk_mq_tags

On 6/18/23 09:07, Yu Kuai wrote:
> @@ -737,6 +738,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
> struct tag_sharing_ctl {
> unsigned int active_queues;
> unsigned int share_queues;
> + struct list_head head;
> };

Please add a comment that explains that 'head' is the head of a list with
tag_sharing.node entries.
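
For example, something like:

	struct tag_sharing_ctl {
		unsigned int active_queues;
		unsigned int share_queues;
		/* head of a list with tag_sharing.node entries */
		struct list_head head;
	};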

> +struct tag_sharing {
> + struct list_head node;
> +};

Data structure names are typically nouns, but above I see a name that
ends with a verb. Would "shared_tag_info" or "tag_sharing_info" perhaps
be a better name?

Thanks,

Bart.


2023-07-06 18:28:03

by Bart Van Assche

Subject: Re: [PATCH RFC 2/7] blk-mq: delay tag fair sharing until fail to get driver tag

On 6/18/23 09:07, Yu Kuai wrote:
> Start tag fair sharing when a device start to issue io will waste
> resources, same number of tags will be assigned to each disk/hctx,
> and such tags can't be used for other disk/hctx, which means a disk/hctx
> can't use more than assinged tags even if there are still lots of tags
> that is assinged to other disks are unused.

s/assinged/assigned/ (both occurrences)

> Add a new API, blk_mq_driver_tag_busy(), which is called when getting a
> driver tag fails, and move tag sharing from blk_mq_tag_busy() to
> blk_mq_driver_tag_busy().


> + spin_lock_irq(&tags->lock);
> + WRITE_ONCE(tags->ctl.share_queues, tags->ctl.active_queues);
> + blk_mq_update_wake_batch(tags, tags->ctl.share_queues);
> + spin_unlock_irq(&tags->lock);
> +}

Are all tags->ctl.share_queues changes protected by tags->lock? If so, please
use a regular assignment to update that member variable instead of using
WRITE_ONCE().
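
That is, assuming all readers of share_queues also serialize on
tags->lock, the update could be a plain store:

	spin_lock_irq(&tags->lock);
	tags->ctl.share_queues = tags->ctl.active_queues;
	blk_mq_update_wake_batch(tags, tags->ctl.share_queues);
	spin_unlock_irq(&tags->lock);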

> @@ -735,6 +736,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
>
> struct tag_sharing_ctl {
> unsigned int active_queues;
> + unsigned int share_queues;
> };

Please rename "share_queues" into "shared_queues" such that the names of
both struct members start with an adjective.

Thanks,

Bart.

2023-07-06 18:56:50

by Bart Van Assche

Subject: Re: [PATCH RFC 0/7] blk-mq: improve tag fair sharing

On 7/4/23 20:17, Yu Kuai wrote:
> - Currently borrowed tags are never returned until the queue is idle. I
> should figure out a way to return borrowed tags when a queue is not
> busy, so that other queues can borrow tags from it.

At least for UFS this is a significant disadvantage. If e.g. one SCSI
command is sent to the UFS WLUN every 20 seconds and the request queue
timeout is 30 seconds then borrowed tags will never be returned.

The complexity of this patch series is a concern to me; it may be a
barrier towards merging it in the upstream kernel.

Thanks,

Bart.


2023-07-06 18:57:32

by Bart Van Assche

Subject: Re: [PATCH RFC 5/7] blk-mq: record the number of times fail to get driver tag while sharing tags

On 6/18/23 09:07, Yu Kuai wrote:
> +static void update_tag_sharing_busy(struct tag_sharing *tag_sharing)
> +{
> + unsigned int count = atomic_inc_return(&tag_sharing->fail_count);
> + unsigned long last_period = READ_ONCE(tag_sharing->period);
> +
> + if (time_after(jiffies, last_period + HZ) &&
> + cmpxchg_relaxed(&tag_sharing->period, last_period, jiffies) ==
> + last_period)
> + atomic_sub(count / 2, &tag_sharing->fail_count);
> +}

For new code, try_cmpxchg_relaxed() is preferred over cmpxchg_relaxed().
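
try_cmpxchg_relaxed() returns a boolean and updates the expected value
on failure, so the comparison can be folded in, e.g. (untested sketch):

	if (time_after(jiffies, last_period + HZ) &&
	    try_cmpxchg_relaxed(&tag_sharing->period, &last_period, jiffies))
		atomic_sub(count / 2, &tag_sharing->fail_count);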

> struct tag_sharing {
> struct list_head node;
> unsigned int available_tags;
> + atomic_t fail_count;
> + unsigned long period;
> };

Please consider renaming "period" to "latest_reduction" or any other
name that makes the purpose of this member clear.

Thanks,

Bart.


2023-07-07 02:19:34

by Yu Kuai

Subject: Re: [PATCH RFC 0/7] blk-mq: improve tag fair sharing

Hi,

On 2023/07/07 2:43, Bart Van Assche wrote:
> On 7/4/23 20:17, Yu Kuai wrote:
>> - Currently borrowed tags are never returned until the queue is idle. I
>> should figure out a way to return borrowed tags when a queue is not
>> busy, so that other queues can borrow tags from it.
>
> At least for UFS this is a significant disadvantage. If e.g. one SCSI
> command is sent to the UFS WLUN every 20 seconds and the request queue
> timeout is 30 seconds then borrowed tags will never be returned.

Ok, thanks for the notice.
>
> The complexity of this patch series is a concern to me; it may be a
> barrier towards merging it in the upstream kernel.

Yeah, I see. Thanks for the review! However, everything I have in mind
will end up making this patch series more complicated, so I'll try my
best to make the code more readable in the next version.

Thanks,
Kuai
>
> Thanks,
>
> Bart.