From: Yu Kuai <[email protected]>
Hi, all!
I'm planning to support building all blk-throttle policies as kernel
modules; this is the first part, for blk-throttle:
- patch 1 removes CONFIG_BLK_DEV_THROTTLING_LOW;
- patch 2 delays initialization from disk initialization to the first
  configuration;
- patches 3-5 support destroying blk-throttle when it is disabled;
- patch 6 switches blk-throttle to rq_qos, so that blk-throttle internal
  implementation details are no longer exposed;
Changes from RFC v1:
- cover places that were missed in patch 1;
- add patches 2-6;
Yu Kuai (6):
blk-throttle: remove CONFIG_BLK_DEV_THROTTLING_LOW
blk-throttle: delay initialization until configuration
blk-throttle: expand tg_conf_updated() to return if any tg has rules
blk-cgroup: add a new helper blkg_conf_exit_blkg()
blk-throttle: support to destroy throtl_data when blk-throttle is
disabled
blk-throtl: switch to use rq_qos
Documentation/ABI/stable/sysfs-block | 12 -
arch/loongarch/configs/loongson3_defconfig | 1 -
block/Kconfig | 11 -
block/bio.c | 1 -
block/blk-cgroup.c | 25 +-
block/blk-cgroup.h | 1 +
block/blk-core.c | 4 +-
block/blk-merge.c | 2 +-
block/blk-mq-debugfs.c | 2 +
block/blk-rq-qos.c | 13 +
block/blk-rq-qos.h | 11 +
block/blk-stat.c | 3 -
block/blk-sysfs.c | 9 -
block/blk-throttle.c | 1598 ++++++--------------
block/blk-throttle.h | 76 +-
block/blk.h | 11 -
block/genhd.c | 3 -
include/linux/blkdev.h | 4 -
18 files changed, 495 insertions(+), 1292 deletions(-)
--
2.39.2
From: Yu Kuai <[email protected]>
The new helper releases the spinlock 'queue_lock' while keeping the
mutex 'rq_qos_mutex' held, to allow blk-throttle and other cgroup
policies to be destroyed after they are disabled by the user.
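For reference, a config write handler is expected to use it roughly as
follows (a minimal sketch modeled on the tg_set_conf() changes later in
this series; the 'no rules left' check and the destroy helper are
placeholders, not real APIs):

        struct blkg_conf_ctx ctx;
        int ret;

        blkg_conf_init(&ctx, buf);
        /* on success this holds both rq_qos_mutex and queue_lock */
        ret = blkg_conf_prep(blkcg, pol, &ctx);
        if (ret)
                goto out;

        /* ... update per-cgroup configuration under queue_lock ... */

        /* drop only queue_lock, rq_qos_mutex stays held */
        blkg_conf_exit_blkg(&ctx);

        /* now it is safe to tear the policy down */
        if (no_rules_left)                         /* placeholder */
                policy_destroy(ctx.bdev->bd_disk); /* placeholder */
out:
        blkg_conf_exit(&ctx);   /* drops rq_qos_mutex and puts the bdev */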
Signed-off-by: Yu Kuai <[email protected]>
---
block/blk-cgroup.c | 17 +++++++++++++++++
block/blk-cgroup.h | 1 +
2 files changed, 18 insertions(+)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b5e626a16758..ada9258f78bc 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -949,6 +949,23 @@ void blkg_conf_exit(struct blkg_conf_ctx *ctx)
}
EXPORT_SYMBOL_GPL(blkg_conf_exit);
+/*
+ * blkg_conf_exit_blkg - like blkg_conf_exit() but only releases queue_lock.
+ * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
+ *
+ * Unlike blkg_conf_exit(), this keeps rq_qos_mutex held; blkg_conf_exit()
+ * must still be called on @ctx afterwards.
+ */
+void blkg_conf_exit_blkg(struct blkg_conf_ctx *ctx)
+ __releases(&ctx->bdev->bd_queue->queue_lock)
+{
+ if (ctx->blkg) {
+ spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock);
+ ctx->blkg = NULL;
+ }
+}
+EXPORT_SYMBOL_GPL(blkg_conf_exit_blkg);
+
static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
{
int i;
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 78b74106bf10..009f96191e71 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -219,6 +219,7 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx);
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
struct blkg_conf_ctx *ctx);
void blkg_conf_exit(struct blkg_conf_ctx *ctx);
+void blkg_conf_exit_blkg(struct blkg_conf_ctx *ctx);
/**
* bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
--
2.39.2
From: Yu Kuai <[email protected]>
Currently, once blk-throttle is enabled it can't be destroyed until disk
removal, even if it is disabled again. Support destroying throtl_data as
soon as blk-throttle is disabled. This also prepares for building it as
a kernel module.
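With this patch, the configuration write path ends up looking roughly
like the following (a condensed sketch of the tg_set_conf() and
tg_set_limit() changes in the diff below):

        /* drop queue_lock but keep rq_qos_mutex held */
        blkg_conf_exit_blkg(&ctx);

        /* tg_conf_updated() now reports whether any group still has rules */
        if (!tg_conf_updated(tg, false))
                /* waits until no bio is throttled, then frees throtl_data */
                blk_throtl_destroy(ctx.bdev->bd_disk);

blk_throtl_destroy() waits on the new destroy_wait queue, and
tg_dispatch_one_bio() wakes it once the last throttled bio has been
dispatched (see td_has_io()).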
Signed-off-by: Yu Kuai <[email protected]>
---
block/blk-throttle.c | 65 +++++++++++++++++++++++++++++---------------
1 file changed, 43 insertions(+), 22 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index b371442131fe..5c16be07a594 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -28,6 +28,7 @@
/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;
+static DECLARE_WAIT_QUEUE_HEAD(destroy_wait);
#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
@@ -906,6 +907,11 @@ static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
}
+static bool td_has_io(struct throtl_data *td)
+{
+ return td->nr_queued[READ] + td->nr_queued[WRITE] != 0;
+}
+
static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
{
struct throtl_service_queue *sq = &tg->service_queue;
@@ -941,6 +947,8 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
&parent_sq->queued[rw]);
BUG_ON(tg->td->nr_queued[rw] <= 0);
tg->td->nr_queued[rw]--;
+ if (!td_has_io(tg->td))
+ wake_up(&destroy_wait);
}
throtl_trim_slice(tg, rw);
@@ -1268,6 +1276,31 @@ static int blk_throtl_init(struct gendisk *disk)
return ret;
}
+void blk_throtl_exit(struct gendisk *disk)
+{
+ struct request_queue *q = disk->queue;
+
+ if (!q->td)
+ return;
+
+ del_timer_sync(&q->td->service_queue.pending_timer);
+ cancel_work_sync(&q->td->dispatch_work);
+ blkcg_deactivate_policy(disk, &blkcg_policy_throtl);
+ kfree(q->td);
+ q->td = NULL;
+}
+
+static void blk_throtl_destroy(struct gendisk *disk)
+{
+ struct throtl_data *td = disk->queue->td;
+
+ /*
+ * There are no rules, all throttled BIO should be dispatched
+ * immediately.
+ */
+ wait_event(destroy_wait, !td_has_io(td));
+ blk_throtl_exit(disk);
+}
static ssize_t tg_set_conf(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off, bool is_u64)
@@ -1308,7 +1341,11 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
else
*(unsigned int *)((void *)tg + of_cft(of)->private) = v;
- tg_conf_updated(tg, false);
+ blkg_conf_exit_blkg(&ctx);
+
+ if (!tg_conf_updated(tg, false))
+ blk_throtl_destroy(ctx.bdev->bd_disk);
+
ret = 0;
out_finish:
blkg_conf_exit(&ctx);
@@ -1516,7 +1553,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
tg->iops[READ] = v[2];
tg->iops[WRITE] = v[3];
- tg_conf_updated(tg, false);
+ blkg_conf_exit_blkg(&ctx);
+
+ if (!tg_conf_updated(tg, false))
+ blk_throtl_destroy(ctx.bdev->bd_disk);
+
ret = 0;
out_finish:
blkg_conf_exit(&ctx);
@@ -1533,13 +1574,6 @@ static struct cftype throtl_files[] = {
{ } /* terminate */
};
-static void throtl_shutdown_wq(struct request_queue *q)
-{
- struct throtl_data *td = q->td;
-
- cancel_work_sync(&td->dispatch_work);
-}
-
struct blkcg_policy blkcg_policy_throtl = {
.dfl_cftypes = throtl_files,
.legacy_cftypes = throtl_legacy_files,
@@ -1688,19 +1722,6 @@ bool __blk_throtl_bio(struct bio *bio)
return throttled;
}
-void blk_throtl_exit(struct gendisk *disk)
-{
- struct request_queue *q = disk->queue;
-
- if (!q->td)
- return;
-
- del_timer_sync(&q->td->service_queue.pending_timer);
- throtl_shutdown_wq(q);
- blkcg_deactivate_policy(disk, &blkcg_policy_throtl);
- kfree(q->td);
-}
-
static int __init throtl_init(void)
{
kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
--
2.39.2
From: Yu Kuai <[email protected]>
Switch blk-throttle to rq_qos so that its internal implementation is no
longer exposed to the general block layer.
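Concretely, blk-throttle now registers itself as an rq_qos policy and is
invoked from the generic bio submission path; a condensed sketch of the
pieces added in this patch:

        static const struct rq_qos_ops throtl_rqos_ops = {
                .throttle_bio   = blk_throtl_bio,
                .exit           = blk_throtl_exit,
        };

        /* blk_throtl_init(): register with rq_qos instead of setting q->td */
        ret = rq_qos_add(&td->rqos, disk, RQ_QOS_THROTTLE, &throtl_rqos_ops);

        /* submit_bio_noacct(): the generic hook replaces blk_throtl_bio() */
        if (rq_qos_throttle_bio(q, bio))
                return;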
Signed-off-by: Yu Kuai <[email protected]>
---
block/blk-cgroup.c | 2 -
block/blk-core.c | 4 +-
block/blk-merge.c | 2 +-
block/blk-mq-debugfs.c | 2 +
block/blk-rq-qos.c | 13 ++
block/blk-rq-qos.h | 11 ++
block/blk-sysfs.c | 1 -
block/blk-throttle.c | 387 +++++++++++++++++++++++------------------
block/blk-throttle.h | 50 ------
block/genhd.c | 3 -
include/linux/blkdev.h | 4 -
11 files changed, 246 insertions(+), 233 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index ada9258f78bc..0eede352ece1 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -32,7 +32,6 @@
#include "blk.h"
#include "blk-cgroup.h"
#include "blk-ioprio.h"
-#include "blk-throttle.h"
static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);
@@ -1473,7 +1472,6 @@ int blkcg_init_disk(struct gendisk *disk)
void blkcg_exit_disk(struct gendisk *disk)
{
blkg_destroy_all(disk);
- blk_throtl_exit(disk);
}
static void blkcg_exit(struct task_struct *tsk)
diff --git a/block/blk-core.c b/block/blk-core.c
index a16b5abdbbf5..27ccc55bac06 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -48,7 +48,7 @@
#include "blk-mq-sched.h"
#include "blk-pm.h"
#include "blk-cgroup.h"
-#include "blk-throttle.h"
+#include "blk-rq-qos.h"
#include "blk-ioprio.h"
struct dentry *blk_debugfs_root;
@@ -832,7 +832,7 @@ void submit_bio_noacct(struct bio *bio)
goto not_supported;
}
- if (blk_throtl_bio(bio))
+ if (rq_qos_throttle_bio(q, bio))
return;
submit_bio_noacct_nocheck(bio);
return;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 2a06fd33039d..eb08b2d91ec5 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -14,9 +14,9 @@
#include <trace/events/block.h>
#include "blk.h"
+#include "blk-cgroup.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
-#include "blk-throttle.h"
static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
{
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 94668e72ab09..b7613b811d86 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -756,6 +756,8 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q)
static const char *rq_qos_id_to_name(enum rq_qos_id id)
{
switch (id) {
+ case RQ_QOS_THROTTLE:
+ return "throttle";
case RQ_QOS_WBT:
return "wbt";
case RQ_QOS_LATENCY:
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index dd7310c94713..fc2fc4052708 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -59,6 +59,19 @@ void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq)
} while (rqos);
}
+bool __rq_qos_throttle_bio(struct rq_qos *rqos, struct bio *bio)
+{
+ do {
+ if (rqos->ops->throttle_bio &&
+ rqos->ops->throttle_bio(rqos, bio))
+ return true;
+
+ rqos = rqos->next;
+ } while (rqos);
+
+ return false;
+}
+
void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio)
{
do {
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 37245c97ee61..03d8daab8ed6 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -14,6 +14,7 @@
struct blk_mq_debugfs_attr;
enum rq_qos_id {
+ RQ_QOS_THROTTLE,
RQ_QOS_WBT,
RQ_QOS_LATENCY,
RQ_QOS_COST,
@@ -35,6 +36,7 @@ struct rq_qos {
};
struct rq_qos_ops {
+ bool (*throttle_bio)(struct rq_qos *, struct bio *);
void (*throttle)(struct rq_qos *, struct bio *);
void (*track)(struct rq_qos *, struct request *, struct bio *);
void (*merge)(struct rq_qos *, struct request *, struct bio *);
@@ -104,6 +106,7 @@ void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio);
void __rq_qos_done(struct rq_qos *rqos, struct request *rq);
void __rq_qos_issue(struct rq_qos *rqos, struct request *rq);
void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq);
+bool __rq_qos_throttle_bio(struct rq_qos *rqos, struct bio *bio);
void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio);
void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio);
void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio);
@@ -144,6 +147,14 @@ static inline void rq_qos_done_bio(struct bio *bio)
}
}
+static inline bool rq_qos_throttle_bio(struct request_queue *q, struct bio *bio)
+{
+ if (q->rq_qos)
+ return __rq_qos_throttle_bio(q->rq_qos, bio);
+
+ return false;
+}
+
static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
{
if (q->rq_qos) {
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 1e2a0b18360c..7f5ece4b8f9e 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -18,7 +18,6 @@
#include "blk-rq-qos.h"
#include "blk-wbt.h"
#include "blk-cgroup.h"
-#include "blk-throttle.h"
struct queue_sysfs_entry {
struct attribute attr;
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 5c16be07a594..b1ff0640a39a 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -11,6 +11,7 @@
#include <linux/bio.h>
#include <linux/blktrace_api.h>
#include "blk.h"
+#include "blk-rq-qos.h"
#include "blk-cgroup-rwstat.h"
#include "blk-stat.h"
#include "blk-throttle.h"
@@ -45,8 +46,8 @@ struct avg_latency_bucket {
bool valid;
};
-struct throtl_data
-{
+struct throtl_data {
+ struct rq_qos rqos;
/* service tree for active throtl groups */
struct throtl_service_queue service_queue;
@@ -65,6 +66,19 @@ struct throtl_data
static void throtl_pending_timer_fn(struct timer_list *t);
+static struct throtl_data *rqos_to_td(struct rq_qos *rqos)
+{
+ if (!rqos)
+ return NULL;
+
+ return container_of(rqos, struct throtl_data, rqos);
+}
+
+static struct throtl_data *q_to_td(struct request_queue *q)
+{
+ return rqos_to_td(rq_qos_id(q, RQ_QOS_THROTTLE));
+}
+
static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
{
return pd_to_blkg(&tg->pd);
@@ -293,7 +307,7 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
{
struct throtl_grp *tg = pd_to_tg(pd);
struct blkcg_gq *blkg = tg_to_blkg(tg);
- struct throtl_data *td = blkg->q->td;
+ struct throtl_data *td = q_to_td(blkg->q);
struct throtl_service_queue *sq = &tg->service_queue;
/*
@@ -1230,6 +1244,192 @@ static bool tg_conf_updated(struct throtl_grp *tg, bool global)
return has_rules;
}
+static void blk_throtl_cancel_bios(struct request_queue *q)
+{
+ struct cgroup_subsys_state *pos_css;
+ struct blkcg_gq *blkg;
+
+ spin_lock_irq(&q->queue_lock);
+ /*
+ * queue_lock is held, rcu lock is not needed here technically.
+ * However, rcu lock is still held to emphasize that following
+ * path need RCU protection and to prevent warning from lockdep.
+ */
+ rcu_read_lock();
+ blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
+ struct throtl_grp *tg = blkg_to_tg(blkg);
+ struct throtl_service_queue *sq = &tg->service_queue;
+
+ /*
+ * Set the flag to make sure throtl_pending_timer_fn() won't
+ * stop until all throttled bios are dispatched.
+ */
+ tg->flags |= THROTL_TG_CANCELING;
+
+ /*
+ * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup
+ * will be inserted to service queue without THROTL_TG_PENDING
+ * set in tg_update_disptime below. Then IO dispatched from
+ * child in tg_dispatch_one_bio will trigger double insertion
+ * and corrupt the tree.
+ */
+ if (!(tg->flags & THROTL_TG_PENDING))
+ continue;
+
+ /*
+ * Update disptime after setting the above flag to make sure
+ * throtl_select_dispatch() won't exit without dispatching.
+ */
+ tg_update_disptime(tg);
+
+ throtl_schedule_pending_timer(sq, jiffies + 1);
+ }
+ rcu_read_unlock();
+ spin_unlock_irq(&q->queue_lock);
+}
+
+static void blk_throtl_exit(struct rq_qos *rqos)
+{
+ struct throtl_data *td = rqos_to_td(rqos);
+
+ blk_throtl_cancel_bios(td->queue);
+ /*
+ * There are no rules, all throttled BIO should be dispatched
+ * immediately.
+ */
+ wait_event(destroy_wait, !td_has_io(td));
+
+ del_timer_sync(&td->service_queue.pending_timer);
+ cancel_work_sync(&td->dispatch_work);
+ blkcg_deactivate_policy(rqos->disk, &blkcg_policy_throtl);
+ kfree(td);
+}
+
+static bool blk_should_throtl(struct throtl_grp *tg, struct bio *bio)
+{
+ int rw = bio_data_dir(bio);
+
+ if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
+ if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
+ bio_set_flag(bio, BIO_CGROUP_ACCT);
+ blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,
+ bio->bi_iter.bi_size);
+ }
+ blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);
+ }
+
+ /* iops limit is always counted */
+ if (tg->has_rules_iops[rw])
+ return true;
+
+ if (tg->has_rules_bps[rw] && !bio_flagged(bio, BIO_BPS_THROTTLED))
+ return true;
+
+ return false;
+}
+
+static bool __blk_throtl_bio(struct throtl_grp *tg, struct bio *bio)
+{
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ struct throtl_qnode *qn = NULL;
+ struct throtl_service_queue *sq;
+ bool rw = bio_data_dir(bio);
+ bool throttled = false;
+ struct throtl_data *td = tg->td;
+
+ rcu_read_lock();
+ spin_lock_irq(&q->queue_lock);
+ sq = &tg->service_queue;
+
+ while (true) {
+ if (tg->last_low_overflow_time[rw] == 0)
+ tg->last_low_overflow_time[rw] = jiffies;
+ /* throtl is FIFO - if bios are already queued, should queue */
+ if (sq->nr_queued[rw])
+ break;
+
+ /* if above limits, break to queue */
+ if (!tg_may_dispatch(tg, bio, NULL)) {
+ tg->last_low_overflow_time[rw] = jiffies;
+ break;
+ }
+
+ /* within limits, let's charge and dispatch directly */
+ throtl_charge_bio(tg, bio);
+
+ /*
+ * We need to trim slice even when bios are not being queued
+ * otherwise it might happen that a bio is not queued for
+ * a long time and slice keeps on extending and trim is not
+ * called for a long time. Now if limits are reduced suddenly
+ * we take into account all the IO dispatched so far at new
+ * low rate and * newly queued IO gets a really long dispatch
+ * time.
+ *
+ * So keep on trimming slice even if bio is not queued.
+ */
+ throtl_trim_slice(tg, rw);
+
+ /*
+ * @bio passed through this layer without being throttled.
+ * Climb up the ladder. If we're already at the top, it
+ * can be executed directly.
+ */
+ qn = &tg->qnode_on_parent[rw];
+ sq = sq->parent_sq;
+ tg = sq_to_tg(sq);
+ if (!tg) {
+ bio_set_flag(bio, BIO_BPS_THROTTLED);
+ goto out_unlock;
+ }
+ }
+
+ /* out-of-limit, queue to @tg */
+ throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
+ rw == READ ? 'R' : 'W',
+ tg->bytes_disp[rw], bio->bi_iter.bi_size,
+ tg_bps_limit(tg, rw),
+ tg->io_disp[rw], tg_iops_limit(tg, rw),
+ sq->nr_queued[READ], sq->nr_queued[WRITE]);
+
+ tg->last_low_overflow_time[rw] = jiffies;
+
+ td->nr_queued[rw]++;
+ throtl_add_bio_tg(bio, qn, tg);
+ throttled = true;
+
+ /*
+ * Update @tg's dispatch time and force schedule dispatch if @tg
+ * was empty before @bio. The forced scheduling isn't likely to
+ * cause undue delay as @bio is likely to be dispatched directly if
+ * its @tg's disptime is not in the future.
+ */
+ if (tg->flags & THROTL_TG_WAS_EMPTY) {
+ tg_update_disptime(tg);
+ throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
+ }
+
+out_unlock:
+ spin_unlock_irq(&q->queue_lock);
+
+ rcu_read_unlock();
+ return throttled;
+}
+
+static bool blk_throtl_bio(struct rq_qos *rqos, struct bio *bio)
+{
+ struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg);
+
+ if (!blk_should_throtl(tg, bio))
+ return false;
+
+ return __blk_throtl_bio(tg, bio);
+}
+
+static const struct rq_qos_ops throtl_rqos_ops = {
+ .throttle_bio = blk_throtl_bio,
+ .exit = blk_throtl_exit,
+};
static int blk_throtl_init(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
@@ -1243,20 +1443,21 @@ static int blk_throtl_init(struct gendisk *disk)
INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
throtl_service_queue_init(&td->service_queue);
- /*
- * freeze queue before setting q->td, so that IO path won't see
- * q->td while policy is not activated yet.
- */
blk_mq_freeze_queue(disk->queue);
blk_mq_quiesce_queue(disk->queue);
- q->td = td;
td->queue = q;
+ ret = rq_qos_add(&td->rqos, disk, RQ_QOS_THROTTLE, &throtl_rqos_ops);
+ if (ret) {
+ kfree(td);
+ goto out;
+ }
+
/* activate policy */
ret = blkcg_activate_policy(disk, &blkcg_policy_throtl);
if (ret) {
- q->td = NULL;
+ rq_qos_del(&td->rqos);
kfree(td);
goto out;
}
@@ -1276,30 +1477,14 @@ static int blk_throtl_init(struct gendisk *disk)
return ret;
}
-void blk_throtl_exit(struct gendisk *disk)
-{
- struct request_queue *q = disk->queue;
-
- if (!q->td)
- return;
-
- del_timer_sync(&q->td->service_queue.pending_timer);
- cancel_work_sync(&q->td->dispatch_work);
- blkcg_deactivate_policy(disk, &blkcg_policy_throtl);
- kfree(q->td);
- q->td = NULL;
-}
-
static void blk_throtl_destroy(struct gendisk *disk)
{
- struct throtl_data *td = disk->queue->td;
+ struct throtl_data *td = q_to_td(disk->queue);
- /*
- * There are no rules, all throttled BIO should be dispatched
- * immediately.
- */
- wait_event(destroy_wait, !td_has_io(td));
- blk_throtl_exit(disk);
+ lockdep_assert_held(&td->queue->rq_qos_mutex);
+
+ rq_qos_del(&td->rqos);
+ blk_throtl_exit(&td->rqos);
}
static ssize_t tg_set_conf(struct kernfs_open_file *of,
@@ -1317,7 +1502,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
if (ret)
goto out_finish;
- if (!ctx.bdev->bd_queue->td) {
+ if (!q_to_td(ctx.bdev->bd_queue)) {
ret = blk_throtl_init(ctx.bdev->bd_disk);
if (ret)
goto out_finish;
@@ -1495,7 +1680,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
if (ret)
goto out_finish;
- if (!ctx.bdev->bd_queue->td) {
+ if (!q_to_td(ctx.bdev->bd_queue)) {
ret = blk_throtl_init(ctx.bdev->bd_disk);
if (ret)
goto out_finish;
@@ -1584,144 +1769,6 @@ struct blkcg_policy blkcg_policy_throtl = {
.pd_free_fn = throtl_pd_free,
};
-void blk_throtl_cancel_bios(struct gendisk *disk)
-{
- struct request_queue *q = disk->queue;
- struct cgroup_subsys_state *pos_css;
- struct blkcg_gq *blkg;
-
- if (!q->td)
- return;
-
- spin_lock_irq(&q->queue_lock);
- /*
- * queue_lock is held, rcu lock is not needed here technically.
- * However, rcu lock is still held to emphasize that following
- * path need RCU protection and to prevent warning from lockdep.
- */
- rcu_read_lock();
- blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
- struct throtl_grp *tg = blkg_to_tg(blkg);
- struct throtl_service_queue *sq = &tg->service_queue;
-
- /*
- * Set the flag to make sure throtl_pending_timer_fn() won't
- * stop until all throttled bios are dispatched.
- */
- tg->flags |= THROTL_TG_CANCELING;
-
- /*
- * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup
- * will be inserted to service queue without THROTL_TG_PENDING
- * set in tg_update_disptime below. Then IO dispatched from
- * child in tg_dispatch_one_bio will trigger double insertion
- * and corrupt the tree.
- */
- if (!(tg->flags & THROTL_TG_PENDING))
- continue;
-
- /*
- * Update disptime after setting the above flag to make sure
- * throtl_select_dispatch() won't exit without dispatching.
- */
- tg_update_disptime(tg);
-
- throtl_schedule_pending_timer(sq, jiffies + 1);
- }
- rcu_read_unlock();
- spin_unlock_irq(&q->queue_lock);
-}
-
-bool __blk_throtl_bio(struct bio *bio)
-{
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- struct blkcg_gq *blkg = bio->bi_blkg;
- struct throtl_qnode *qn = NULL;
- struct throtl_grp *tg = blkg_to_tg(blkg);
- struct throtl_service_queue *sq;
- bool rw = bio_data_dir(bio);
- bool throttled = false;
- struct throtl_data *td = tg->td;
-
- rcu_read_lock();
- spin_lock_irq(&q->queue_lock);
- sq = &tg->service_queue;
-
- while (true) {
- if (tg->last_low_overflow_time[rw] == 0)
- tg->last_low_overflow_time[rw] = jiffies;
- /* throtl is FIFO - if bios are already queued, should queue */
- if (sq->nr_queued[rw])
- break;
-
- /* if above limits, break to queue */
- if (!tg_may_dispatch(tg, bio, NULL)) {
- tg->last_low_overflow_time[rw] = jiffies;
- break;
- }
-
- /* within limits, let's charge and dispatch directly */
- throtl_charge_bio(tg, bio);
-
- /*
- * We need to trim slice even when bios are not being queued
- * otherwise it might happen that a bio is not queued for
- * a long time and slice keeps on extending and trim is not
- * called for a long time. Now if limits are reduced suddenly
- * we take into account all the IO dispatched so far at new
- * low rate and * newly queued IO gets a really long dispatch
- * time.
- *
- * So keep on trimming slice even if bio is not queued.
- */
- throtl_trim_slice(tg, rw);
-
- /*
- * @bio passed through this layer without being throttled.
- * Climb up the ladder. If we're already at the top, it
- * can be executed directly.
- */
- qn = &tg->qnode_on_parent[rw];
- sq = sq->parent_sq;
- tg = sq_to_tg(sq);
- if (!tg) {
- bio_set_flag(bio, BIO_BPS_THROTTLED);
- goto out_unlock;
- }
- }
-
- /* out-of-limit, queue to @tg */
- throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
- rw == READ ? 'R' : 'W',
- tg->bytes_disp[rw], bio->bi_iter.bi_size,
- tg_bps_limit(tg, rw),
- tg->io_disp[rw], tg_iops_limit(tg, rw),
- sq->nr_queued[READ], sq->nr_queued[WRITE]);
-
- tg->last_low_overflow_time[rw] = jiffies;
-
- td->nr_queued[rw]++;
- throtl_add_bio_tg(bio, qn, tg);
- throttled = true;
-
- /*
- * Update @tg's dispatch time and force schedule dispatch if @tg
- * was empty before @bio. The forced scheduling isn't likely to
- * cause undue delay as @bio is likely to be dispatched directly if
- * its @tg's disptime is not in the future.
- */
- if (tg->flags & THROTL_TG_WAS_EMPTY) {
- tg_update_disptime(tg);
- throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
- }
-
-out_unlock:
- spin_unlock_irq(&q->queue_lock);
-
- rcu_read_unlock();
- return throttled;
-}
-
static int __init throtl_init(void)
{
kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index e7f5562a32e9..147940c4a7ca 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -146,54 +146,4 @@ static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
}
-/*
- * Internal throttling interface
- */
-#ifndef CONFIG_BLK_DEV_THROTTLING
-static inline void blk_throtl_exit(struct gendisk *disk) { }
-static inline bool blk_throtl_bio(struct bio *bio) { return false; }
-static inline void blk_throtl_cancel_bios(struct gendisk *disk) { }
-#else /* CONFIG_BLK_DEV_THROTTLING */
-void blk_throtl_exit(struct gendisk *disk);
-bool __blk_throtl_bio(struct bio *bio);
-void blk_throtl_cancel_bios(struct gendisk *disk);
-
-static inline bool blk_should_throtl(struct bio *bio)
-{
- struct throtl_grp *tg;
- int rw = bio_data_dir(bio);
-
- if (!bio->bi_bdev->bd_queue->td)
- return false;
-
- tg = blkg_to_tg(bio->bi_blkg);
- if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
- if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
- bio_set_flag(bio, BIO_CGROUP_ACCT);
- blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,
- bio->bi_iter.bi_size);
- }
- blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);
- }
-
- /* iops limit is always counted */
- if (tg->has_rules_iops[rw])
- return true;
-
- if (tg->has_rules_bps[rw] && !bio_flagged(bio, BIO_BPS_THROTTLED))
- return true;
-
- return false;
-}
-
-static inline bool blk_throtl_bio(struct bio *bio)
-{
-
- if (!blk_should_throtl(bio))
- return false;
-
- return __blk_throtl_bio(bio);
-}
#endif /* CONFIG_BLK_DEV_THROTTLING */
-
-#endif
diff --git a/block/genhd.c b/block/genhd.c
index bb29a68e1d67..9bc55321e606 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -27,7 +27,6 @@
#include <linux/part_stat.h>
#include <linux/blktrace_api.h>
-#include "blk-throttle.h"
#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
@@ -699,8 +698,6 @@ void del_gendisk(struct gendisk *disk)
blk_mq_freeze_queue_wait(q);
- blk_throtl_cancel_bios(disk);
-
blk_sync_queue(q);
blk_flush_integrity();
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c3e8f7cf96be..368f450f74f2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -487,10 +487,6 @@ struct request_queue {
int mq_freeze_depth;
-#ifdef CONFIG_BLK_DEV_THROTTLING
- /* Throttle data */
- struct throtl_data *td;
-#endif
struct rcu_head rcu_head;
wait_queue_head_t mq_freeze_wq;
/*
--
2.39.2
From: Yu Kuai <[email protected]>
On the one hand, this feature has been marked EXPERIMENTAL since 2017,
and it looks like there have been no users, testers or developers since
then; it's simply not active at all.
On the other hand, even when the config option is disabled, there are
still many fields in throtl_grp and throtl_data, and many functions,
that are only used for throtl low.
Finally, blk-throttle is currently initialized during disk
initialization and destroyed during disk removal, and it exposes many
functions that are called directly from the block layer. Hence I'm
planning to optimize blk-throttle and eventually support building it as
a kernel module. Removing throtl low makes that work much easier.
Signed-off-by: Yu Kuai <[email protected]>
---
Documentation/ABI/stable/sysfs-block | 12 -
arch/loongarch/configs/loongson3_defconfig | 1 -
block/Kconfig | 11 -
block/bio.c | 1 -
block/blk-stat.c | 3 -
block/blk-sysfs.c | 7 -
block/blk-throttle.c | 901 +--------------------
block/blk-throttle.h | 26 +-
block/blk.h | 11 -
9 files changed, 45 insertions(+), 928 deletions(-)
diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index 1fe9a553c37b..31bbc3b20089 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -584,18 +584,6 @@ Description:
the data. If no such restriction exists, this file will contain
'0'. This file is writable for testing purposes.
-
-What: /sys/block/<disk>/queue/throttle_sample_time
-Date: March 2017
-Contact: [email protected]
-Description:
- [RW] This is the time window that blk-throttle samples data, in
- millisecond. blk-throttle makes decision based on the
- samplings. Lower time means cgroups have more smooth throughput,
- but higher CPU overhead. This exists only when
- CONFIG_BLK_DEV_THROTTLING_LOW is enabled.
-
-
What: /sys/block/<disk>/queue/virt_boundary_mask
Date: April 2021
Contact: [email protected]
diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig
index f18c2ba871ef..fc0d89d4c1c5 100644
--- a/arch/loongarch/configs/loongson3_defconfig
+++ b/arch/loongarch/configs/loongson3_defconfig
@@ -76,7 +76,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
CONFIG_MODVERSIONS=y
CONFIG_BLK_DEV_ZONED=y
CONFIG_BLK_DEV_THROTTLING=y
-CONFIG_BLK_DEV_THROTTLING_LOW=y
CONFIG_BLK_WBT=y
CONFIG_BLK_CGROUP_IOLATENCY=y
CONFIG_BLK_CGROUP_FC_APPID=y
diff --git a/block/Kconfig b/block/Kconfig
index 1de4682d48cc..72814e485f82 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -120,17 +120,6 @@ config BLK_DEV_THROTTLING
See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information.
-config BLK_DEV_THROTTLING_LOW
- bool "Block throttling .low limit interface support (EXPERIMENTAL)"
- depends on BLK_DEV_THROTTLING
- help
- Add .low limit interface for block throttling. The low limit is a best
- effort limit to prioritize cgroups. Depending on the setting, the limit
- can be used to protect cgroups in terms of bandwidth/iops and better
- utilize disk resource.
-
- Note, this is an experimental interface and could be changed someday.
-
config BLK_WBT
bool "Enable support for block device writeback throttling"
help
diff --git a/block/bio.c b/block/bio.c
index d24420ed1c4c..4404cd0a2690 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1596,7 +1596,6 @@ void bio_endio(struct bio *bio)
goto again;
}
- blk_throtl_bio_endio(bio);
/* release cgroup info */
bio_uninit(bio);
if (bio->bi_end_io)
diff --git a/block/blk-stat.c b/block/blk-stat.c
index e42c263e53fb..eaf60097bbe1 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -57,9 +57,6 @@ void blk_stat_add(struct request *rq, u64 now)
value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0;
- if (req_op(rq) == REQ_OP_READ || req_op(rq) == REQ_OP_WRITE)
- blk_throtl_stat_add(rq, value);
-
rcu_read_lock();
cpu = get_cpu();
list_for_each_entry_rcu(cb, &q->stats->callbacks, list) {
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 8c8f69d8ba48..674c5c602364 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -516,10 +516,6 @@ QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout");
QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask");
QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment");
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time");
-#endif
-
/* legacy alias for logical_block_size: */
static struct queue_sysfs_entry queue_hw_sector_size_entry = {
.attr = {.name = "hw_sector_size", .mode = 0444 },
@@ -640,9 +636,6 @@ static struct attribute *queue_attrs[] = {
&queue_fua_entry.attr,
&queue_dax_entry.attr,
&queue_poll_delay_entry.attr,
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
- &blk_throtl_sample_time_entry.attr,
-#endif
&queue_virt_boundary_mask_entry.attr,
&queue_dma_alignment_entry.attr,
NULL,
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index f4850a6f860b..1e45e48564f4 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -25,18 +25,6 @@
#define DFL_THROTL_SLICE_HD (HZ / 10)
#define DFL_THROTL_SLICE_SSD (HZ / 50)
#define MAX_THROTL_SLICE (HZ)
-#define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */
-#define MIN_THROTL_BPS (320 * 1024)
-#define MIN_THROTL_IOPS (10)
-#define DFL_LATENCY_TARGET (-1L)
-#define DFL_IDLE_THRESHOLD (0)
-#define DFL_HD_BASELINE_LATENCY (4000L) /* 4ms */
-#define LATENCY_FILTERED_SSD (0)
-/*
- * For HD, very small latency comes from sequential IO. Such IO is helpless to
- * help determine if its IO is impacted by others, hence we ignore the IO
- */
-#define LATENCY_FILTERED_HD (1000L) /* 1ms */
/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;
@@ -70,19 +58,6 @@ struct throtl_data
/* Work for dispatching throttled bios */
struct work_struct dispatch_work;
- unsigned int limit_index;
- bool limit_valid[LIMIT_CNT];
-
- unsigned long low_upgrade_time;
- unsigned long low_downgrade_time;
-
- unsigned int scale;
-
- struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE];
- struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE];
- struct latency_bucket __percpu *latency_buckets[2];
- unsigned long last_calculate_time;
- unsigned long filtered_latency;
bool track_bio_latency;
};
@@ -126,84 +101,24 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
return container_of(sq, struct throtl_data, service_queue);
}
-/*
- * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to
- * make the IO dispatch more smooth.
- * Scale up: linearly scale up according to elapsed time since upgrade. For
- * every throtl_slice, the limit scales up 1/2 .low limit till the
- * limit hits .max limit
- * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit
- */
-static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td)
-{
- /* arbitrary value to avoid too big scale */
- if (td->scale < 4096 && time_after_eq(jiffies,
- td->low_upgrade_time + td->scale * td->throtl_slice))
- td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice;
-
- return low + (low >> 1) * td->scale;
-}
-
static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
{
struct blkcg_gq *blkg = tg_to_blkg(tg);
- struct throtl_data *td;
- uint64_t ret;
if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
return U64_MAX;
- td = tg->td;
- ret = tg->bps[rw][td->limit_index];
- if (ret == 0 && td->limit_index == LIMIT_LOW) {
- /* intermediate node or iops isn't 0 */
- if (!list_empty(&blkg->blkcg->css.children) ||
- tg->iops[rw][td->limit_index])
- return U64_MAX;
- else
- return MIN_THROTL_BPS;
- }
-
- if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] &&
- tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) {
- uint64_t adjusted;
-
- adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td);
- ret = min(tg->bps[rw][LIMIT_MAX], adjusted);
- }
- return ret;
+ return tg->bps[rw];
}
static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
{
struct blkcg_gq *blkg = tg_to_blkg(tg);
- struct throtl_data *td;
- unsigned int ret;
if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
return UINT_MAX;
- td = tg->td;
- ret = tg->iops[rw][td->limit_index];
- if (ret == 0 && tg->td->limit_index == LIMIT_LOW) {
- /* intermediate node or bps isn't 0 */
- if (!list_empty(&blkg->blkcg->css.children) ||
- tg->bps[rw][td->limit_index])
- return UINT_MAX;
- else
- return MIN_THROTL_IOPS;
- }
-
- if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] &&
- tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) {
- uint64_t adjusted;
-
- adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td);
- if (adjusted > UINT_MAX)
- adjusted = UINT_MAX;
- ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted);
- }
- return ret;
+ return tg->iops[rw];
}
#define request_bucket_index(sectors) \
@@ -359,20 +274,10 @@ static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk,
}
RB_CLEAR_NODE(&tg->rb_node);
- tg->bps[READ][LIMIT_MAX] = U64_MAX;
- tg->bps[WRITE][LIMIT_MAX] = U64_MAX;
- tg->iops[READ][LIMIT_MAX] = UINT_MAX;
- tg->iops[WRITE][LIMIT_MAX] = UINT_MAX;
- tg->bps_conf[READ][LIMIT_MAX] = U64_MAX;
- tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX;
- tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX;
- tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;
- /* LIMIT_LOW will have default value 0 */
-
- tg->latency_target = DFL_LATENCY_TARGET;
- tg->latency_target_conf = DFL_LATENCY_TARGET;
- tg->idletime_threshold = DFL_IDLE_THRESHOLD;
- tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD;
+ tg->bps[READ] = U64_MAX;
+ tg->bps[WRITE] = U64_MAX;
+ tg->iops[READ] = UINT_MAX;
+ tg->iops[WRITE] = UINT_MAX;
return &tg->pd;
@@ -418,18 +323,15 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
static void tg_update_has_rules(struct throtl_grp *tg)
{
struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
- struct throtl_data *td = tg->td;
int rw;
for (rw = READ; rw <= WRITE; rw++) {
tg->has_rules_iops[rw] =
(parent_tg && parent_tg->has_rules_iops[rw]) ||
- (td->limit_valid[td->limit_index] &&
- tg_iops_limit(tg, rw) != UINT_MAX);
+ tg_iops_limit(tg, rw) != UINT_MAX;
tg->has_rules_bps[rw] =
(parent_tg && parent_tg->has_rules_bps[rw]) ||
- (td->limit_valid[td->limit_index] &&
- (tg_bps_limit(tg, rw) != U64_MAX));
+ tg_bps_limit(tg, rw) != U64_MAX;
}
}
@@ -443,49 +345,6 @@ static void throtl_pd_online(struct blkg_policy_data *pd)
tg_update_has_rules(tg);
}
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-static void blk_throtl_update_limit_valid(struct throtl_data *td)
-{
- struct cgroup_subsys_state *pos_css;
- struct blkcg_gq *blkg;
- bool low_valid = false;
-
- rcu_read_lock();
- blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
- struct throtl_grp *tg = blkg_to_tg(blkg);
-
- if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] ||
- tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) {
- low_valid = true;
- break;
- }
- }
- rcu_read_unlock();
-
- td->limit_valid[LIMIT_LOW] = low_valid;
-}
-#else
-static inline void blk_throtl_update_limit_valid(struct throtl_data *td)
-{
-}
-#endif
-
-static void throtl_upgrade_state(struct throtl_data *td);
-static void throtl_pd_offline(struct blkg_policy_data *pd)
-{
- struct throtl_grp *tg = pd_to_tg(pd);
-
- tg->bps[READ][LIMIT_LOW] = 0;
- tg->bps[WRITE][LIMIT_LOW] = 0;
- tg->iops[READ][LIMIT_LOW] = 0;
- tg->iops[WRITE][LIMIT_LOW] = 0;
-
- blk_throtl_update_limit_valid(tg->td);
-
- if (!tg->td->limit_valid[tg->td->limit_index])
- throtl_upgrade_state(tg->td);
-}
-
static void throtl_pd_free(struct blkg_policy_data *pd)
{
struct throtl_grp *tg = pd_to_tg(pd);
@@ -1151,8 +1010,6 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
return nr_disp;
}
-static bool throtl_can_upgrade(struct throtl_data *td,
- struct throtl_grp *this_tg);
/**
* throtl_pending_timer_fn - timer function for service_queue->pending_timer
* @t: the pending_timer member of the throtl_service_queue being serviced
@@ -1189,9 +1046,6 @@ static void throtl_pending_timer_fn(struct timer_list *t)
if (!q->root_blkg)
goto out_unlock;
- if (throtl_can_upgrade(td, NULL))
- throtl_upgrade_state(td);
-
again:
parent_sq = sq->parent_sq;
dispatched = false;
@@ -1339,14 +1193,6 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
!blkg->parent->parent)
continue;
parent_tg = blkg_to_tg(blkg->parent);
- /*
- * make sure all children has lower idle time threshold and
- * higher latency target
- */
- this_tg->idletime_threshold = min(this_tg->idletime_threshold,
- parent_tg->idletime_threshold);
- this_tg->latency_target = max(this_tg->latency_target,
- parent_tg->latency_target);
}
rcu_read_unlock();
@@ -1444,25 +1290,25 @@ static int tg_print_rwstat_recursive(struct seq_file *sf, void *v)
static struct cftype throtl_legacy_files[] = {
{
.name = "throttle.read_bps_device",
- .private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]),
+ .private = offsetof(struct throtl_grp, bps[READ]),
.seq_show = tg_print_conf_u64,
.write = tg_set_conf_u64,
},
{
.name = "throttle.write_bps_device",
- .private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]),
+ .private = offsetof(struct throtl_grp, bps[WRITE]),
.seq_show = tg_print_conf_u64,
.write = tg_set_conf_u64,
},
{
.name = "throttle.read_iops_device",
- .private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]),
+ .private = offsetof(struct throtl_grp, iops[READ]),
.seq_show = tg_print_conf_uint,
.write = tg_set_conf_uint,
},
{
.name = "throttle.write_iops_device",
- .private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]),
+ .private = offsetof(struct throtl_grp, iops[WRITE]),
.seq_show = tg_print_conf_uint,
.write = tg_set_conf_uint,
},
@@ -1497,58 +1343,30 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
char bufs[4][21] = { "max", "max", "max", "max" };
u64 bps_dft;
unsigned int iops_dft;
- char idle_time[26] = "";
- char latency_time[26] = "";
if (!dname)
return 0;
- if (off == LIMIT_LOW) {
- bps_dft = 0;
- iops_dft = 0;
- } else {
- bps_dft = U64_MAX;
- iops_dft = UINT_MAX;
- }
+ bps_dft = U64_MAX;
+ iops_dft = UINT_MAX;
- if (tg->bps_conf[READ][off] == bps_dft &&
- tg->bps_conf[WRITE][off] == bps_dft &&
- tg->iops_conf[READ][off] == iops_dft &&
- tg->iops_conf[WRITE][off] == iops_dft &&
- (off != LIMIT_LOW ||
- (tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD &&
- tg->latency_target_conf == DFL_LATENCY_TARGET)))
+ if (tg->bps[READ] == bps_dft &&
+ tg->bps[WRITE] == bps_dft &&
+ tg->iops[READ] == iops_dft &&
+ tg->iops[WRITE] == iops_dft)
return 0;
- if (tg->bps_conf[READ][off] != U64_MAX)
- snprintf(bufs[0], sizeof(bufs[0]), "%llu",
- tg->bps_conf[READ][off]);
- if (tg->bps_conf[WRITE][off] != U64_MAX)
- snprintf(bufs[1], sizeof(bufs[1]), "%llu",
- tg->bps_conf[WRITE][off]);
- if (tg->iops_conf[READ][off] != UINT_MAX)
- snprintf(bufs[2], sizeof(bufs[2]), "%u",
- tg->iops_conf[READ][off]);
- if (tg->iops_conf[WRITE][off] != UINT_MAX)
- snprintf(bufs[3], sizeof(bufs[3]), "%u",
- tg->iops_conf[WRITE][off]);
- if (off == LIMIT_LOW) {
- if (tg->idletime_threshold_conf == ULONG_MAX)
- strcpy(idle_time, " idle=max");
- else
- snprintf(idle_time, sizeof(idle_time), " idle=%lu",
- tg->idletime_threshold_conf);
-
- if (tg->latency_target_conf == ULONG_MAX)
- strcpy(latency_time, " latency=max");
- else
- snprintf(latency_time, sizeof(latency_time),
- " latency=%lu", tg->latency_target_conf);
- }
-
- seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n",
- dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time,
- latency_time);
+ if (tg->bps[READ] != U64_MAX)
+ snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]);
+ if (tg->bps[WRITE] != U64_MAX)
+ snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]);
+ if (tg->iops[READ] != UINT_MAX)
+ snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]);
+ if (tg->iops[WRITE] != UINT_MAX)
+ snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]);
+
+ seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n",
+ dname, bufs[0], bufs[1], bufs[2], bufs[3]);
return 0;
}
@@ -1566,10 +1384,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
struct blkg_conf_ctx ctx;
struct throtl_grp *tg;
u64 v[4];
- unsigned long idle_time;
- unsigned long latency_time;
int ret;
- int index = of_cft(of)->private;
blkg_conf_init(&ctx, buf);
@@ -1580,13 +1395,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
tg = blkg_to_tg(ctx.blkg);
tg_update_carryover(tg);
- v[0] = tg->bps_conf[READ][index];
- v[1] = tg->bps_conf[WRITE][index];
- v[2] = tg->iops_conf[READ][index];
- v[3] = tg->iops_conf[WRITE][index];
+ v[0] = tg->bps[READ];
+ v[1] = tg->bps[WRITE];
+ v[2] = tg->iops[READ];
+ v[3] = tg->iops[WRITE];
- idle_time = tg->idletime_threshold_conf;
- latency_time = tg->latency_target_conf;
while (true) {
char tok[27]; /* wiops=18446744073709551616 */
char *p;
@@ -1618,60 +1431,16 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
v[2] = min_t(u64, val, UINT_MAX);
else if (!strcmp(tok, "wiops") && val > 1)
v[3] = min_t(u64, val, UINT_MAX);
- else if (off == LIMIT_LOW && !strcmp(tok, "idle"))
- idle_time = val;
- else if (off == LIMIT_LOW && !strcmp(tok, "latency"))
- latency_time = val;
else
goto out_finish;
}
- tg->bps_conf[READ][index] = v[0];
- tg->bps_conf[WRITE][index] = v[1];
- tg->iops_conf[READ][index] = v[2];
- tg->iops_conf[WRITE][index] = v[3];
+ tg->bps[READ] = v[0];
+ tg->bps[WRITE] = v[1];
+ tg->iops[READ] = v[2];
+ tg->iops[WRITE] = v[3];
- if (index == LIMIT_MAX) {
- tg->bps[READ][index] = v[0];
- tg->bps[WRITE][index] = v[1];
- tg->iops[READ][index] = v[2];
- tg->iops[WRITE][index] = v[3];
- }
- tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW],
- tg->bps_conf[READ][LIMIT_MAX]);
- tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW],
- tg->bps_conf[WRITE][LIMIT_MAX]);
- tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW],
- tg->iops_conf[READ][LIMIT_MAX]);
- tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW],
- tg->iops_conf[WRITE][LIMIT_MAX]);
- tg->idletime_threshold_conf = idle_time;
- tg->latency_target_conf = latency_time;
-
- /* force user to configure all settings for low limit */
- if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] ||
- tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) ||
- tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD ||
- tg->latency_target_conf == DFL_LATENCY_TARGET) {
- tg->bps[READ][LIMIT_LOW] = 0;
- tg->bps[WRITE][LIMIT_LOW] = 0;
- tg->iops[READ][LIMIT_LOW] = 0;
- tg->iops[WRITE][LIMIT_LOW] = 0;
- tg->idletime_threshold = DFL_IDLE_THRESHOLD;
- tg->latency_target = DFL_LATENCY_TARGET;
- } else if (index == LIMIT_LOW) {
- tg->idletime_threshold = tg->idletime_threshold_conf;
- tg->latency_target = tg->latency_target_conf;
- }
-
- blk_throtl_update_limit_valid(tg->td);
- if (tg->td->limit_valid[LIMIT_LOW]) {
- if (index == LIMIT_LOW)
- tg->td->limit_index = LIMIT_LOW;
- } else
- tg->td->limit_index = LIMIT_MAX;
- tg_conf_updated(tg, index == LIMIT_LOW &&
- tg->td->limit_valid[LIMIT_LOW]);
+ tg_conf_updated(tg, false);
ret = 0;
out_finish:
blkg_conf_exit(&ctx);
@@ -1679,21 +1448,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
}
static struct cftype throtl_files[] = {
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
- {
- .name = "low",
- .flags = CFTYPE_NOT_ON_ROOT,
- .seq_show = tg_print_limit,
- .write = tg_set_limit,
- .private = LIMIT_LOW,
- },
-#endif
{
.name = "max",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = tg_print_limit,
.write = tg_set_limit,
- .private = LIMIT_MAX,
},
{ } /* terminate */
};
@@ -1712,7 +1471,6 @@ struct blkcg_policy blkcg_policy_throtl = {
.pd_alloc_fn = throtl_pd_alloc,
.pd_init_fn = throtl_pd_init,
.pd_online_fn = throtl_pd_online,
- .pd_offline_fn = throtl_pd_offline,
.pd_free_fn = throtl_pd_free,
};
@@ -1761,418 +1519,6 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
spin_unlock_irq(&q->queue_lock);
}
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
-{
- unsigned long rtime = jiffies, wtime = jiffies;
-
- if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW])
- rtime = tg->last_low_overflow_time[READ];
- if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
- wtime = tg->last_low_overflow_time[WRITE];
- return min(rtime, wtime);
-}
-
-static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
-{
- struct throtl_service_queue *parent_sq;
- struct throtl_grp *parent = tg;
- unsigned long ret = __tg_last_low_overflow_time(tg);
-
- while (true) {
- parent_sq = parent->service_queue.parent_sq;
- parent = sq_to_tg(parent_sq);
- if (!parent)
- break;
-
- /*
- * The parent doesn't have low limit, it always reaches low
- * limit. Its overflow time is useless for children
- */
- if (!parent->bps[READ][LIMIT_LOW] &&
- !parent->iops[READ][LIMIT_LOW] &&
- !parent->bps[WRITE][LIMIT_LOW] &&
- !parent->iops[WRITE][LIMIT_LOW])
- continue;
- if (time_after(__tg_last_low_overflow_time(parent), ret))
- ret = __tg_last_low_overflow_time(parent);
- }
- return ret;
-}
-
-static bool throtl_tg_is_idle(struct throtl_grp *tg)
-{
- /*
- * cgroup is idle if:
- * - single idle is too long, longer than a fixed value (in case user
- * configure a too big threshold) or 4 times of idletime threshold
- * - average think time is more than threshold
- * - IO latency is largely below threshold
- */
- unsigned long time;
- bool ret;
-
- time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold);
- ret = tg->latency_target == DFL_LATENCY_TARGET ||
- tg->idletime_threshold == DFL_IDLE_THRESHOLD ||
- (blk_time_get_ns() >> 10) - tg->last_finish_time > time ||
- tg->avg_idletime > tg->idletime_threshold ||
- (tg->latency_target && tg->bio_cnt &&
- tg->bad_bio_cnt * 5 < tg->bio_cnt);
- throtl_log(&tg->service_queue,
- "avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d",
- tg->avg_idletime, tg->idletime_threshold, tg->bad_bio_cnt,
- tg->bio_cnt, ret, tg->td->scale);
- return ret;
-}
-
-static bool throtl_low_limit_reached(struct throtl_grp *tg, int rw)
-{
- struct throtl_service_queue *sq = &tg->service_queue;
- bool limit = tg->bps[rw][LIMIT_LOW] || tg->iops[rw][LIMIT_LOW];
-
- /*
- * if low limit is zero, low limit is always reached.
- * if low limit is non-zero, we can check if there is any request
- * is queued to determine if low limit is reached as we throttle
- * request according to limit.
- */
- return !limit || sq->nr_queued[rw];
-}
-
-static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
-{
- /*
- * cgroup reaches low limit when low limit of READ and WRITE are
- * both reached, it's ok to upgrade to next limit if cgroup reaches
- * low limit
- */
- if (throtl_low_limit_reached(tg, READ) &&
- throtl_low_limit_reached(tg, WRITE))
- return true;
-
- if (time_after_eq(jiffies,
- tg_last_low_overflow_time(tg) + tg->td->throtl_slice) &&
- throtl_tg_is_idle(tg))
- return true;
- return false;
-}
-
-static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
-{
- while (true) {
- if (throtl_tg_can_upgrade(tg))
- return true;
- tg = sq_to_tg(tg->service_queue.parent_sq);
- if (!tg || !tg_to_blkg(tg)->parent)
- return false;
- }
- return false;
-}
-
-static bool throtl_can_upgrade(struct throtl_data *td,
- struct throtl_grp *this_tg)
-{
- struct cgroup_subsys_state *pos_css;
- struct blkcg_gq *blkg;
-
- if (td->limit_index != LIMIT_LOW)
- return false;
-
- if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice))
- return false;
-
- rcu_read_lock();
- blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
- struct throtl_grp *tg = blkg_to_tg(blkg);
-
- if (tg == this_tg)
- continue;
- if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
- continue;
- if (!throtl_hierarchy_can_upgrade(tg)) {
- rcu_read_unlock();
- return false;
- }
- }
- rcu_read_unlock();
- return true;
-}
-
-static void throtl_upgrade_check(struct throtl_grp *tg)
-{
- unsigned long now = jiffies;
-
- if (tg->td->limit_index != LIMIT_LOW)
- return;
-
- if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
- return;
-
- tg->last_check_time = now;
-
- if (!time_after_eq(now,
- __tg_last_low_overflow_time(tg) + tg->td->throtl_slice))
- return;
-
- if (throtl_can_upgrade(tg->td, NULL))
- throtl_upgrade_state(tg->td);
-}
-
-static void throtl_upgrade_state(struct throtl_data *td)
-{
- struct cgroup_subsys_state *pos_css;
- struct blkcg_gq *blkg;
-
- throtl_log(&td->service_queue, "upgrade to max");
- td->limit_index = LIMIT_MAX;
- td->low_upgrade_time = jiffies;
- td->scale = 0;
- rcu_read_lock();
- blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
- struct throtl_grp *tg = blkg_to_tg(blkg);
- struct throtl_service_queue *sq = &tg->service_queue;
-
- tg->disptime = jiffies - 1;
- throtl_select_dispatch(sq);
- throtl_schedule_next_dispatch(sq, true);
- }
- rcu_read_unlock();
- throtl_select_dispatch(&td->service_queue);
- throtl_schedule_next_dispatch(&td->service_queue, true);
- queue_work(kthrotld_workqueue, &td->dispatch_work);
-}
-
-static void throtl_downgrade_state(struct throtl_data *td)
-{
- td->scale /= 2;
-
- throtl_log(&td->service_queue, "downgrade, scale %d", td->scale);
- if (td->scale) {
- td->low_upgrade_time = jiffies - td->scale * td->throtl_slice;
- return;
- }
-
- td->limit_index = LIMIT_LOW;
- td->low_downgrade_time = jiffies;
-}
-
-static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
-{
- struct throtl_data *td = tg->td;
- unsigned long now = jiffies;
-
- /*
- * If cgroup is below low limit, consider downgrade and throttle other
- * cgroups
- */
- if (time_after_eq(now, tg_last_low_overflow_time(tg) +
- td->throtl_slice) &&
- (!throtl_tg_is_idle(tg) ||
- !list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
- return true;
- return false;
-}
-
-static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
-{
- struct throtl_data *td = tg->td;
-
- if (time_before(jiffies, td->low_upgrade_time + td->throtl_slice))
- return false;
-
- while (true) {
- if (!throtl_tg_can_downgrade(tg))
- return false;
- tg = sq_to_tg(tg->service_queue.parent_sq);
- if (!tg || !tg_to_blkg(tg)->parent)
- break;
- }
- return true;
-}
-
-static void throtl_downgrade_check(struct throtl_grp *tg)
-{
- uint64_t bps;
- unsigned int iops;
- unsigned long elapsed_time;
- unsigned long now = jiffies;
-
- if (tg->td->limit_index != LIMIT_MAX ||
- !tg->td->limit_valid[LIMIT_LOW])
- return;
- if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
- return;
- if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
- return;
-
- elapsed_time = now - tg->last_check_time;
- tg->last_check_time = now;
-
- if (time_before(now, tg_last_low_overflow_time(tg) +
- tg->td->throtl_slice))
- return;
-
- if (tg->bps[READ][LIMIT_LOW]) {
- bps = tg->last_bytes_disp[READ] * HZ;
- do_div(bps, elapsed_time);
- if (bps >= tg->bps[READ][LIMIT_LOW])
- tg->last_low_overflow_time[READ] = now;
- }
-
- if (tg->bps[WRITE][LIMIT_LOW]) {
- bps = tg->last_bytes_disp[WRITE] * HZ;
- do_div(bps, elapsed_time);
- if (bps >= tg->bps[WRITE][LIMIT_LOW])
- tg->last_low_overflow_time[WRITE] = now;
- }
-
- if (tg->iops[READ][LIMIT_LOW]) {
- iops = tg->last_io_disp[READ] * HZ / elapsed_time;
- if (iops >= tg->iops[READ][LIMIT_LOW])
- tg->last_low_overflow_time[READ] = now;
- }
-
- if (tg->iops[WRITE][LIMIT_LOW]) {
- iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
- if (iops >= tg->iops[WRITE][LIMIT_LOW])
- tg->last_low_overflow_time[WRITE] = now;
- }
-
- /*
- * If cgroup is below low limit, consider downgrade and throttle other
- * cgroups
- */
- if (throtl_hierarchy_can_downgrade(tg))
- throtl_downgrade_state(tg->td);
-
- tg->last_bytes_disp[READ] = 0;
- tg->last_bytes_disp[WRITE] = 0;
- tg->last_io_disp[READ] = 0;
- tg->last_io_disp[WRITE] = 0;
-}
-
-static void blk_throtl_update_idletime(struct throtl_grp *tg)
-{
- unsigned long now;
- unsigned long last_finish_time = tg->last_finish_time;
-
- if (last_finish_time == 0)
- return;
-
- now = blk_time_get_ns() >> 10;
- if (now <= last_finish_time ||
- last_finish_time == tg->checked_last_finish_time)
- return;
-
- tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3;
- tg->checked_last_finish_time = last_finish_time;
-}
-
-static void throtl_update_latency_buckets(struct throtl_data *td)
-{
- struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
- int i, cpu, rw;
- unsigned long last_latency[2] = { 0 };
- unsigned long latency[2];
-
- if (!blk_queue_nonrot(td->queue) || !td->limit_valid[LIMIT_LOW])
- return;
- if (time_before(jiffies, td->last_calculate_time + HZ))
- return;
- td->last_calculate_time = jiffies;
-
- memset(avg_latency, 0, sizeof(avg_latency));
- for (rw = READ; rw <= WRITE; rw++) {
- for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
- struct latency_bucket *tmp = &td->tmp_buckets[rw][i];
-
- for_each_possible_cpu(cpu) {
- struct latency_bucket *bucket;
-
- /* this isn't race free, but ok in practice */
- bucket = per_cpu_ptr(td->latency_buckets[rw],
- cpu);
- tmp->total_latency += bucket[i].total_latency;
- tmp->samples += bucket[i].samples;
- bucket[i].total_latency = 0;
- bucket[i].samples = 0;
- }
-
- if (tmp->samples >= 32) {
- int samples = tmp->samples;
-
- latency[rw] = tmp->total_latency;
-
- tmp->total_latency = 0;
- tmp->samples = 0;
- latency[rw] /= samples;
- if (latency[rw] == 0)
- continue;
- avg_latency[rw][i].latency = latency[rw];
- }
- }
- }
-
- for (rw = READ; rw <= WRITE; rw++) {
- for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
- if (!avg_latency[rw][i].latency) {
- if (td->avg_buckets[rw][i].latency < last_latency[rw])
- td->avg_buckets[rw][i].latency =
- last_latency[rw];
- continue;
- }
-
- if (!td->avg_buckets[rw][i].valid)
- latency[rw] = avg_latency[rw][i].latency;
- else
- latency[rw] = (td->avg_buckets[rw][i].latency * 7 +
- avg_latency[rw][i].latency) >> 3;
-
- td->avg_buckets[rw][i].latency = max(latency[rw],
- last_latency[rw]);
- td->avg_buckets[rw][i].valid = true;
- last_latency[rw] = td->avg_buckets[rw][i].latency;
- }
- }
-
- for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
- throtl_log(&td->service_queue,
- "Latency bucket %d: read latency=%ld, read valid=%d, "
- "write latency=%ld, write valid=%d", i,
- td->avg_buckets[READ][i].latency,
- td->avg_buckets[READ][i].valid,
- td->avg_buckets[WRITE][i].latency,
- td->avg_buckets[WRITE][i].valid);
-}
-#else
-static inline void throtl_update_latency_buckets(struct throtl_data *td)
-{
-}
-
-static void blk_throtl_update_idletime(struct throtl_grp *tg)
-{
-}
-
-static void throtl_downgrade_check(struct throtl_grp *tg)
-{
-}
-
-static void throtl_upgrade_check(struct throtl_grp *tg)
-{
-}
-
-static bool throtl_can_upgrade(struct throtl_data *td,
- struct throtl_grp *this_tg)
-{
- return false;
-}
-
-static void throtl_upgrade_state(struct throtl_data *td)
-{
-}
-#endif
-
bool __blk_throtl_bio(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
@@ -2185,21 +1531,12 @@ bool __blk_throtl_bio(struct bio *bio)
struct throtl_data *td = tg->td;
rcu_read_lock();
-
spin_lock_irq(&q->queue_lock);
-
- throtl_update_latency_buckets(td);
-
- blk_throtl_update_idletime(tg);
-
sq = &tg->service_queue;
-again:
while (true) {
if (tg->last_low_overflow_time[rw] == 0)
tg->last_low_overflow_time[rw] = jiffies;
- throtl_downgrade_check(tg);
- throtl_upgrade_check(tg);
/* throtl is FIFO - if bios are already queued, should queue */
if (sq->nr_queued[rw])
break;
@@ -2207,10 +1544,6 @@ bool __blk_throtl_bio(struct bio *bio)
/* if above limits, break to queue */
if (!tg_may_dispatch(tg, bio, NULL)) {
tg->last_low_overflow_time[rw] = jiffies;
- if (throtl_can_upgrade(td, tg)) {
- throtl_upgrade_state(td);
- goto again;
- }
break;
}
@@ -2270,101 +1603,12 @@ bool __blk_throtl_bio(struct bio *bio)
}
out_unlock:
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
- if (throttled || !td->track_bio_latency)
- bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY;
-#endif
spin_unlock_irq(&q->queue_lock);
rcu_read_unlock();
return throttled;
}
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-static void throtl_track_latency(struct throtl_data *td, sector_t size,
- enum req_op op, unsigned long time)
-{
- const bool rw = op_is_write(op);
- struct latency_bucket *latency;
- int index;
-
- if (!td || td->limit_index != LIMIT_LOW ||
- !(op == REQ_OP_READ || op == REQ_OP_WRITE) ||
- !blk_queue_nonrot(td->queue))
- return;
-
- index = request_bucket_index(size);
-
- latency = get_cpu_ptr(td->latency_buckets[rw]);
- latency[index].total_latency += time;
- latency[index].samples++;
- put_cpu_ptr(td->latency_buckets[rw]);
-}
-
-void blk_throtl_stat_add(struct request *rq, u64 time_ns)
-{
- struct request_queue *q = rq->q;
- struct throtl_data *td = q->td;
-
- throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq),
- time_ns >> 10);
-}
-
-void blk_throtl_bio_endio(struct bio *bio)
-{
- struct blkcg_gq *blkg;
- struct throtl_grp *tg;
- u64 finish_time_ns;
- unsigned long finish_time;
- unsigned long start_time;
- unsigned long lat;
- int rw = bio_data_dir(bio);
-
- blkg = bio->bi_blkg;
- if (!blkg)
- return;
- tg = blkg_to_tg(blkg);
- if (!tg->td->limit_valid[LIMIT_LOW])
- return;
-
- finish_time_ns = blk_time_get_ns();
- tg->last_finish_time = finish_time_ns >> 10;
-
- start_time = bio_issue_time(&bio->bi_issue) >> 10;
- finish_time = __bio_issue_time(finish_time_ns) >> 10;
- if (!start_time || finish_time <= start_time)
- return;
-
- lat = finish_time - start_time;
- /* this is only for bio based driver */
- if (!(bio->bi_issue.value & BIO_ISSUE_THROTL_SKIP_LATENCY))
- throtl_track_latency(tg->td, bio_issue_size(&bio->bi_issue),
- bio_op(bio), lat);
-
- if (tg->latency_target && lat >= tg->td->filtered_latency) {
- int bucket;
- unsigned int threshold;
-
- bucket = request_bucket_index(bio_issue_size(&bio->bi_issue));
- threshold = tg->td->avg_buckets[rw][bucket].latency +
- tg->latency_target;
- if (lat > threshold)
- tg->bad_bio_cnt++;
- /*
- * Not race free, could get wrong count, which means cgroups
- * will be throttled
- */
- tg->bio_cnt++;
- }
-
- if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) {
- tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies;
- tg->bio_cnt /= 2;
- tg->bad_bio_cnt /= 2;
- }
-}
-#endif
-
int blk_throtl_init(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
@@ -2374,19 +1618,6 @@ int blk_throtl_init(struct gendisk *disk)
td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
if (!td)
return -ENOMEM;
- td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *
- LATENCY_BUCKET_SIZE, __alignof__(u64));
- if (!td->latency_buckets[READ]) {
- kfree(td);
- return -ENOMEM;
- }
- td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) *
- LATENCY_BUCKET_SIZE, __alignof__(u64));
- if (!td->latency_buckets[WRITE]) {
- free_percpu(td->latency_buckets[READ]);
- kfree(td);
- return -ENOMEM;
- }
INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
throtl_service_queue_init(&td->service_queue);
@@ -2394,18 +1625,10 @@ int blk_throtl_init(struct gendisk *disk)
q->td = td;
td->queue = q;
- td->limit_valid[LIMIT_MAX] = true;
- td->limit_index = LIMIT_MAX;
- td->low_upgrade_time = jiffies;
- td->low_downgrade_time = jiffies;
-
/* activate policy */
ret = blkcg_activate_policy(disk, &blkcg_policy_throtl);
- if (ret) {
- free_percpu(td->latency_buckets[READ]);
- free_percpu(td->latency_buckets[WRITE]);
+ if (ret)
kfree(td);
- }
return ret;
}
@@ -2417,8 +1640,6 @@ void blk_throtl_exit(struct gendisk *disk)
del_timer_sync(&q->td->service_queue.pending_timer);
throtl_shutdown_wq(q);
blkcg_deactivate_policy(disk, &blkcg_policy_throtl);
- free_percpu(q->td->latency_buckets[READ]);
- free_percpu(q->td->latency_buckets[WRITE]);
kfree(q->td);
}
@@ -2426,58 +1647,18 @@ void blk_throtl_register(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
struct throtl_data *td;
- int i;
td = q->td;
BUG_ON(!td);
- if (blk_queue_nonrot(q)) {
+ if (blk_queue_nonrot(q))
td->throtl_slice = DFL_THROTL_SLICE_SSD;
- td->filtered_latency = LATENCY_FILTERED_SSD;
- } else {
+ else
td->throtl_slice = DFL_THROTL_SLICE_HD;
- td->filtered_latency = LATENCY_FILTERED_HD;
- for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
- td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY;
- td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY;
- }
- }
-#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
- /* if no low limit, use previous default */
- td->throtl_slice = DFL_THROTL_SLICE_HD;
-
-#else
td->track_bio_latency = !queue_is_mq(q);
if (!td->track_bio_latency)
blk_stat_enable_accounting(q);
-#endif
-}
-
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page)
-{
- if (!q->td)
- return -EINVAL;
- return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice));
-}
-
-ssize_t blk_throtl_sample_time_store(struct request_queue *q,
- const char *page, size_t count)
-{
- unsigned long v;
- unsigned long t;
-
- if (!q->td)
- return -EINVAL;
- if (kstrtoul(page, 10, &v))
- return -EINVAL;
- t = msecs_to_jiffies(v);
- if (t == 0 || t > MAX_THROTL_SLICE)
- return -EINVAL;
- q->td->throtl_slice = t;
- return count;
}
-#endif
static int __init throtl_init(void)
{
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index bffbc9cfc8ab..32503fd83a84 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -58,12 +58,6 @@ enum tg_state_flags {
THROTL_TG_CANCELING = 1 << 2, /* starts to cancel bio */
};
-enum {
- LIMIT_LOW,
- LIMIT_MAX,
- LIMIT_CNT,
-};
-
struct throtl_grp {
/* must be the first member */
struct blkg_policy_data pd;
@@ -102,14 +96,14 @@ struct throtl_grp {
bool has_rules_iops[2];
/* internally used bytes per second rate limits */
- uint64_t bps[2][LIMIT_CNT];
+ uint64_t bps[2];
/* user configured bps limits */
- uint64_t bps_conf[2][LIMIT_CNT];
+ uint64_t bps_conf[2];
/* internally used IOPS limits */
- unsigned int iops[2][LIMIT_CNT];
+ unsigned int iops[2];
/* user configured IOPS limits */
- unsigned int iops_conf[2][LIMIT_CNT];
+ unsigned int iops_conf[2];
/* Number of bytes dispatched in current slice */
uint64_t bytes_disp[2];
@@ -132,22 +126,10 @@ struct throtl_grp {
unsigned long last_check_time;
- unsigned long latency_target; /* us */
- unsigned long latency_target_conf; /* us */
/* When did we start a new slice */
unsigned long slice_start[2];
unsigned long slice_end[2];
- unsigned long last_finish_time; /* ns / 1024 */
- unsigned long checked_last_finish_time; /* ns / 1024 */
- unsigned long avg_idletime; /* ns / 1024 */
- unsigned long idletime_threshold; /* us */
- unsigned long idletime_threshold_conf; /* us */
-
- unsigned int bio_cnt; /* total bios */
- unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
- unsigned long bio_cnt_reset_time;
-
struct blkg_rwstat stat_bytes;
struct blkg_rwstat stat_ios;
};
diff --git a/block/blk.h b/block/blk.h
index 5cac4e29ae17..47960bf31543 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -379,17 +379,6 @@ static inline void ioc_clear_queue(struct request_queue *q)
}
#endif /* CONFIG_BLK_ICQ */
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
-extern ssize_t blk_throtl_sample_time_store(struct request_queue *q,
- const char *page, size_t count);
-extern void blk_throtl_bio_endio(struct bio *bio);
-extern void blk_throtl_stat_add(struct request *rq, u64 time);
-#else
-static inline void blk_throtl_bio_endio(struct bio *bio) { }
-static inline void blk_throtl_stat_add(struct request *rq, u64 time) { }
-#endif
-
struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q);
static inline bool blk_queue_may_bounce(struct request_queue *q)
--
2.39.2
On Sat, Apr 06, 2024 at 04:00:54PM +0800, Yu Kuai wrote:
> From: Yu Kuai <[email protected]>
>
> On the one hand, it has been marked EXPERIMENTAL since 2017, and it looks
> like there are no users since then, no testers and no developers; it's
> just not active at all.
>
> On the other hand, even if the config is disabled, there are still many
> fields in throtl_grp and throtl_data and many functions that are only
> used for throtl low.
>
> Lastly, currently blk-throtl is initialized during disk initialization
> and destroyed during disk removal, and it exposes many functions to be
> called directly from the block layer. Hence I'm planning to optimize
> blk-throtl and finally support building it as a kernel module. Removing
> throtl low will make the work much easier.
>
> Signed-off-by: Yu Kuai <[email protected]>
Acked-by: Tejun Heo <[email protected]>
I haven't seen any usage but let's see if anyone complains.
Thanks.
--
tejun
Hello,
On Sat, Apr 06, 2024 at 04:00:58PM +0800, Yu Kuai wrote:
> From: Yu Kuai <[email protected]>
>
> Currently once blk-throttle is enabled, it can't be destroyed until disk
> removal, even it's disabled.
>
> Also prepare to support building it as kernel module.
The benefit of doing this whenever the ruleset becomes empty seems marginal.
This isn't necessary to allow unloading blk-throttle and
blkg_conf_exit_blkg() is also necessary because of this, right?
Thanks.
--
tejun
Hello,
On Sat, Apr 06, 2024 at 04:00:59PM +0800, Yu Kuai wrote:
> From: Yu Kuai <[email protected]>
>
> To avoid exposing blk-throttle's internal implementation to the general
> block layer.
..
> @@ -832,7 +832,7 @@ void submit_bio_noacct(struct bio *bio)
> goto not_supported;
> }
>
> - if (blk_throtl_bio(bio))
> + if (rq_qos_throttle_bio(q, bio))
> return;
> submit_bio_noacct_nocheck(bio);
> return;
This is a half-way conversion, right? You're adding a dedicated hook to
rq_qos and none of the other hooks can be used by blk-throtl. Even the name,
rq_qos_throttle_bio(), becomes a misnomer. I'm not really sure this makes
things better or worse. It makes certain things a bit cleaner but other
things nastier. I don't know.
Thanks.
--
tejun
Hi,
On 2024/04/13 1:30, Tejun Heo wrote:
> On Sat, Apr 06, 2024 at 04:00:54PM +0800, Yu Kuai wrote:
>> From: Yu Kuai <[email protected]>
>>
>> On the one hand, it has been marked EXPERIMENTAL since 2017, and it looks
>> like there are no users since then, no testers and no developers; it's
>> just not active at all.
>>
>> On the other hand, even if the config is disabled, there are still many
>> fields in throtl_grp and throtl_data and many functions that are only
>> used for throtl low.
>>
>> Lastly, currently blk-throtl is initialized during disk initialization
>> and destroyed during disk removal, and it exposes many functions to be
>> called directly from the block layer. Hence I'm planning to optimize
>> blk-throtl and finally support building it as a kernel module. Removing
>> throtl low will make the work much easier.
>>
>> Signed-off-by: Yu Kuai <[email protected]>
>
> Acked-by: Tejun Heo <[email protected]>
>
> I haven't seen any usage but let's see if anyone complains.
Thanks for the review!
Kuai
>
> Thanks.
>
Hi,
On 2024/04/13 2:05, Tejun Heo wrote:
> Hello,
>
> On Sat, Apr 06, 2024 at 04:00:58PM +0800, Yu Kuai wrote:
>> From: Yu Kuai <[email protected]>
>>
>> Currently once blk-throttle is enabled, it can't be destroyed until disk
>> removal, even it's disabled.
>>
>> Also prepare to support building it as kernel module.
>
> The benefit of doing this whenever the ruleset becomes empty seems marginal.
> This isn't necessary to allow unloading blk-throttle and
> blkg_conf_exit_blkg() is also necessary because of this, right?
Yes, this is why blkg_conf_exit_blkg() is necessary.
I think that we need to find an appropriate time to unload blk-throttle
other than deleting the gendisk. I'm also thinking of adding a new user
input like "8:0 free" to do this. These are the solutions that I can
think of for now.
Thanks,
Kuai
>
> Thanks.
>
Hi,
On 2024/04/13 2:11, Tejun Heo wrote:
> Hello,
>
> On Sat, Apr 06, 2024 at 04:00:59PM +0800, Yu Kuai wrote:
>> From: Yu Kuai <[email protected]>
>>
>> To avoid exposing blk-throttle's internal implementation to the general
>> block layer.
> ...
>> @@ -832,7 +832,7 @@ void submit_bio_noacct(struct bio *bio)
>> goto not_supported;
>> }
>>
>> - if (blk_throtl_bio(bio))
>> + if (rq_qos_throttle_bio(q, bio))
>> return;
>> submit_bio_noacct_nocheck(bio);
>> return;
>
> This is a half-way conversion, right? You're adding a dedicated hook to
> rq_qos and none of the other hooks can be used by blk-throtl. Even the name,
> rq_qos_throttle_bio(), becomes a misnomer. I'm not really sure this makes
> things better or worse. It makes certain things a bit cleaner but other
> things nastier. I don't know.
Yes, the final goal is making all blk-cgroup policies modular, and this
patch uses rq-qos to avoid exposing blk-throttle to the block layer, like
the other policies.
There is another choice that I think is feasible:
Let blk-throttle pin a policy id, and use the id to call the throttle
function directly; this will require initializing the 'plid' from
blkcg_policy() during definition instead of blkcg_policy_register().
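For illustration only, a rough sketch of what that could look like;
BLKCG_POLICY_THROTL, blkcg_policy_lookup() and the ->throttle_bio()
callback are invented names for this sketch, not something this series
adds:

/*
 * Hypothetical sketch: pin blk-throttle to a fixed policy id at
 * definition time, so the block layer can reach the (possibly modular)
 * policy by id instead of by a direct symbol reference.
 */
#include "blk-cgroup.h"

#define BLKCG_POLICY_THROTL	0	/* assumed fixed slot */

static struct blkcg_policy blkcg_policy_throtl = {
	.plid		= BLKCG_POLICY_THROTL,
	/* .pd_alloc_fn, .pd_free_fn, ... unchanged from today */
};

static bool blkcg_throttle_bio_by_id(struct bio *bio)
{
	/* blkcg_policy_lookup() and ->throttle_bio() are made up here */
	const struct blkcg_policy *pol = blkcg_policy_lookup(BLKCG_POLICY_THROTL);

	return pol && pol->throttle_bio ? pol->throttle_bio(bio) : false;
}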
Thanks,
Kuai
>
> Thanks.
>
Hi,
On 2024/04/13 10:17, Yu Kuai wrote:
> Hi,
>
> On 2024/04/13 2:11, Tejun Heo wrote:
>> Hello,
>>
>> On Sat, Apr 06, 2024 at 04:00:59PM +0800, Yu Kuai wrote:
>>> From: Yu Kuai <[email protected]>
>>>
>>> To avoid exposing blk-throttle's internal implementation to the general
>>> block layer.
>> ...
>>> @@ -832,7 +832,7 @@ void submit_bio_noacct(struct bio *bio)
>>> goto not_supported;
>>> }
>>> - if (blk_throtl_bio(bio))
>>> + if (rq_qos_throttle_bio(q, bio))
>>> return;
>>> submit_bio_noacct_nocheck(bio);
>>> return;
>>
>> This is a half-way conversion, right? You're adding a dedicated hook to
>> rq_qos and none of the other hooks can be used by blk-throtl. Even the
>> name,
Actually, rq_qos_exit() is used as well to destroy blk-throtl.
>> rq_qos_throttle_bio(), becomes a misnomer. I'm not really sure this makes
>> things better or worse. It makes certain things a bit cleaner but other
>> things nastier. I don't know.
>
> Yes, the final goal is making all blk-cgroup policies modular, and this
> patch uses rq-qos to avoid exposing blk-throttle to the block layer, like
> the other policies.
After thinking about this a bit more, I still think rq_qos is probably a
better choice, and there is something that I want to discuss.
There are two different directions. The first is to switch blk-throttle to
rq_qos_throttle() as well, which is called for each rq:
1) For rq-based devices, why must blk-throtl throttle before
rq_qos_throttle()? blk-throtl has to handle the bio split case
separately, and it looks like blk-throttle could switch to using
rq_qos_throttle() with the benefit that io split doesn't need
special handling.
2) blk-throtl treats split IO as additional iops, while it ignores
merged IO; this looks wrong to me. If multiple bios are merged into one
request, iostat will see just one IO. After switching to rq_qos, the
bio merge case can be handled easily as well.
The other direction is to still add an rq_qos_throttle_bio() (perhaps with
another name?), and meanwhile iocost can benefit from this new helper as
well, because iocost really is bio based and currently it must handle the
io merge case with debt.
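As a hedged sketch of that second option, modeled on the existing rq_qos
wrappers (the ->throttle_bio() op and its exact semantics are assumptions;
the actual patch may differ):

/*
 * Sketch only: walk the queue's rq_qos list and let any policy that
 * implements a (new) ->throttle_bio() op take ownership of the bio.
 * Returning true means the bio was throttled/queued and must not be
 * submitted now, mirroring today's blk_throtl_bio() convention.
 */
#include "blk-rq-qos.h"

static inline bool rq_qos_throttle_bio(struct request_queue *q, struct bio *bio)
{
	struct rq_qos *rqos;

	for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
		if (rqos->ops->throttle_bio &&
		    rqos->ops->throttle_bio(rqos, bio))
			return true;
	}

	return false;
}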
Thanks,
Kuai
>
> There is another choice that I think is feasible:
>
> Let blk-throttle pin a policy id, and use the id to call the throttle
> function directly; this will require initializing the 'plid' from
> blkcg_policy() during definition instead of blkcg_policy_register().
>
> Thanks,
> Kuai
>
>>
>> Thanks.
>>
>
> .
>
On Sat, Apr 06, 2024 at 04:00:54PM +0800, Yu Kuai <[email protected]> wrote:
> Documentation/ABI/stable/sysfs-block | 12 -
> arch/loongarch/configs/loongson3_defconfig | 1 -
> block/Kconfig | 11 -
> block/bio.c | 1 -
> block/blk-stat.c | 3 -
> block/blk-sysfs.c | 7 -
> block/blk-throttle.c | 901 +--------------------
> block/blk-throttle.h | 26 +-
> block/blk.h | 11 -
> 9 files changed, 45 insertions(+), 928 deletions(-)
I'm (also) not aware of any users of (be)low throttling.
I like this cleanup.
Michal
On Sat, Apr 06, 2024 at 04:00:53PM +0800, Yu Kuai <[email protected]> wrote:
> I'm planning to support build all blk-throttle polices as kernel module,
There is only one blk-throttle policy (especially after your removal of
io.low). Did you mean blkcg policies in general?
The current code is complex because of various lifecycles in
devices x cgroups.
Turning policies into modules seems to make it
devices x cgroups x policy modules
.
Could you please add more info on why policies as modules are beneficial,
and how to keep complexity capped?
Thanks,
Michal
Hello,
On Sat, Apr 13, 2024 at 10:06:00AM +0800, Yu Kuai wrote:
> I think that we need to find an appropriate time to unload blk-throttle
> other than deleting the gendisk. I'm also thinking of adding a new user
> input like "8:0 free" to do this. These are the solutions that I can
> think of for now.
Probably a better interface is for unloading to force blk-throtl to be
deactivated rather than asking the user to nuke all configs.
Thanks.
--
tejun
Hello,
On Tue, Apr 16, 2024 at 10:17:29PM +0800, Yu Kuai wrote:
> > > This is a half-way conversion, right? You're adding a dedicated hook to
> > > rq_qos and none of the other hooks can be used by blk-throtl. Even
> > > the name,
>
> Actually, rq_qos_exit() is used as well to destroy blk-throtl.
I see.
> > > rq_qos_throttle_bio(), becomes a misnomer. I'm not really sure this makes
> > > things better or worse. It makes certain things a bit cleaner but other
> > > things nastier. I don't know.
> >
> > Yes, the final goal is making all blk-cgroup policies modular, and this
> > patch uses rq-qos to avoid exposing blk-throttle to the block layer,
> > like the other policies.
>
> After thinking about this a bit more, I still think rq_qos is probably a
> better choice, and there is something that I want to discuss.
>
> There are two different directions. The first is to switch blk-throttle to
> rq_qos_throttle() as well, which is called for each rq:
>
> 1) For rq-based devices, why must blk-throtl throttle before
> rq_qos_throttle()? blk-throtl has to handle the bio split case
> separately, and it looks like blk-throttle could switch to using
> rq_qos_throttle() with the benefit that io split doesn't need
> special handling.
>
> 2) blk-throtl treats split IO as additional iops, while it ignores
> merged IO; this looks wrong to me. If multiple bios are merged into one
> request, iostat will see just one IO. After switching to rq_qos, the
> bio merge case can be handled easily as well.
If we could properly convert blk-throtl to rq-qos, that'd be great. The only
problem is that there probably are users who are using blk-throtl with
bio-based drivers because blk-throtl has supported that for a very long
time, so we're in a bit of a bind for blk-throtl.
> The other direction is to still add an rq_qos_throttle_bio() (perhaps with
> another name?), and meanwhile iocost can benefit from this new helper as
> well, because iocost really is bio based and currently it must handle the
> io merge case with debt.
I really don't think adding a different operation mode to rq-qos or iocost
is a good idea. I'd much rather keep blk-throtl where it is and come up with
a better replacement (e.g. iocost based .max throttling) in the future.
Thanks.
--
tejun
Hi,
On 2024/04/16 23:56, Michal Koutný wrote:
> On Sat, Apr 06, 2024 at 04:00:53PM +0800, Yu Kuai <[email protected]> wrote:
>> I'm planning to support build all blk-throttle polices as kernel module,
>
> There is only one blk-throttle policy (especially after your removal of
> io.low). Did you mean blkcg policies in general?
Yes, bfq already supports that, and the others are all rq_qos based; they
will be much easier than blk-throtl.
>
> The current code is complex because of various lifecycles in
> devices x cgroups.
> Turning policies into modules seems to make it
> devices x cgroups x policy modules
> .
>
> Could you please add more info on why policies as modules are beneficial,
> and how to keep complexity capped?
First of all, users can load these policies only when they need them,
which reduces kernel size; then, when these policies are not loaded, the
IO fast path will be slightly shorter, and some memory overhead is saved
for each disk.
Thanks,
Kuai
>
> Thanks,
> Michal
>
Hi,
On 2024/04/17 1:09, Tejun Heo wrote:
> Hello,
>
> On Sat, Apr 13, 2024 at 10:06:00AM +0800, Yu Kuai wrote:
>> I think that we need to find an appropriate time to unload blk-throttle
>> other than deleting the gendisk. I'm also thinking of adding a new user
>> input like "8:0 free" to do this. These are the solutions that I can
>> think of for now.
>
> Probably a better interface is for unloading to force blk-throtl to be
> deactivated rather than asking the user to nuke all configs.
I was thinking that rmmod in this case should return busy, for example,
if bfq is currently used for some disk, rmmod bfq will return busy.
Is there any example where unloading will deactivate resources that users
are still using?
Thanks,
Kuai
>
> Thanks.
>
Hello,
On Wed, Apr 17, 2024 at 09:13:34AM +0800, Yu Kuai wrote:
> > Probably a better interface is for unloading to force blk-throtl to be
> > deactivated rather than asking the user to nuke all configs.
>
> I was thinking that rmmod in this case should return busy, for example,
> if bfq is currently used for some disk, rmmod bfq will return busy.
>
> Is there any example where unloading will deactivate resources that users
> are still using?
Hmm... yeah, I'm not sure. Pinning the module while in use is definitely
more conventional, so let's stick with that. It's usually achieved by
inc'ing the module's ref on each usage, so here, the module refs would be
counting the number of active rules, I guess.
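For illustration, a minimal sketch of that conventional pattern with
invented hook names (nothing below is from the series):

#include <linux/module.h>

/*
 * Sketch: pin the module while any throttle rule exists, so rmmod fails
 * while the policy is in use.  throtl_rule_added()/throtl_rule_removed()
 * are hypothetical hooks to be called from the configuration paths.
 */
static int throtl_rule_added(void)
{
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;		/* module is being unloaded */
	return 0;
}

static void throtl_rule_removed(void)
{
	module_put(THIS_MODULE);
}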
I'm not sure about modularization tho mostly because we've historically had
a lot of lifetime issues around block and blkcg data structures and the
supposed gain here is rather minimal. We only have a handful of these
policies and they aren't that big.
If hot path overhead when not being used is a concern, lazy init solves
most of it, no?
Thanks.
--
tejun
Hi,
On 2024/04/17 9:22, Tejun Heo wrote:
> Hello,
>
> On Wed, Apr 17, 2024 at 09:13:34AM +0800, Yu Kuai wrote:
>>> Probably a better interface is for unloading to force blk-throtl to be
>>> deactivated rather than asking the user to nuke all configs.
>>
>> I was thinking that rmmod in this case should return busy, for example,
>> if bfq is currently used for some disk, rmmod bfq will return busy.
>>
>> Is there any example where unloading will deactivate resources that users
>> are still using?
>
> Hmm... yeah, I'm not sure. Pinning the module while in use is definitely
> more conventional, so let's stick with that. It's usually achieved by
> inc'ing the module's ref on each usage, so here, the module refs would be
> counting the number of active rules, I guess.
Yes, agreed.
>
> I'm not sure about modularization tho mostly because we've historically had
> a lot of lifetime issues around block and blkcg data structures and the
> supposed gain here is rather minimal. We only have a handful of these
> policies and they aren't that big.
>
> If hot path overhead when not being used is a concern, lazy init solves
> most of it, no?
For performance, yes. Another point is that we can reduce kernel size
this way, without losing support for blk-cgroup policies.
Yes, it's just that blk-throttle is the biggest pain in the ass because
it exposes lots of its implementation to the block layer. Other rq_qos
based policies should be much easier.
I guess I'll do lazy init first, and then modularization for rq_qos,
and leave blk-throtl there for now. Perhaps adding a new throtl model
in iocost can replace blk-throtl in the future.
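As a rough illustration of the lazy-init step (a sketch only, assuming a
hypothetical helper; patch 2 of this series is the real version and
differs in detail):

/*
 * Sketch: allocate throtl_data the first time a rule is configured for
 * the disk, instead of unconditionally at disk initialization.  Callers
 * are assumed to hold whatever locking the configuration path uses.
 */
static int blk_throtl_ensure_init(struct gendisk *disk)
{
	if (disk->queue->td)		/* already initialized */
		return 0;

	return blk_throtl_init(disk);
}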
BTW, during testing of iocost, I found that iocost can already
achieve that, for example, with the following configuration:
echo "$dev enable=1 min=100 max=100" > qos
echo "$dev wbps=4096 wseqiops=1 wrandiops=1" > model
In the test, I found that wbps and iops are actually limited to the
set values.
Thanks,
Kuai
>
> Thanks.
>
On Wed, Apr 17, 2024 at 09:09:07AM +0800, Yu Kuai <[email protected]> wrote:
> Yes, bfq already supports that,
I've never noticed CONFIG_IOSCHED_BFQ is a tristate; that explains a lot
(to me). Thanks!
> First of all, users can load these policies only when they need them,
> which reduces kernel size; then, when these policies are not loaded, the
> IO fast path will be slightly shorter, and some memory overhead is saved
> for each disk.
..and there is no new complexity thanks to the above.
(I'm only catching up with subthread of patch 5/6.)
It seems the old complexity could be simplified by way of lazy
inits. Interesting...
Michal
Hello,
On Wed, Apr 17, 2024 at 09:39:43AM +0800, Yu Kuai wrote:
..
> I guess I'll do lazy init first, and then modularization for rq_qos,
> and leave blk-throtl there for now. Perhaps adding a new throtl model
> in iocost can replace blk-throtl in the future.
That sounds like a plan.
> BTW, during testing of iocost, I found that iocost can already
> achieve that, for example, with the following configuration:
>
> echo "$dev enable=1 min=100 max=100" > qos
> echo "$dev wbps=4096 wseqiops=1 wrandiops=1" > model
>
> In the test, I found that wbps and iops are actually limited to the
> set values.
Yeah, it shouldn't be too difficult to add .max support to iocost so that
you can say something like "this cgroup subtree can't use more than 60% of
available capacity". That'd be really cool to have.
Thanks.
--
tejun