From: Hong Zhiguo <zhiguohong@tencent.com>
To: tj@kernel.org, vgoyal@redhat.com
Cc: cgroups@vger.kernel.org, axboe@kernel.dk, linux-kernel@vger.kernel.org,
        Hong Zhiguo <zhiguohong@tencent.com>
Subject: [PATCH v4 1/2] blk-throttle: simplify logic by token bucket algorithm
Date: Sun, 20 Oct 2013 20:11:11 +0800
Message-Id: <1382271072-15664-2-git-send-email-zhiguohong@tencent.com>
X-Mailer: git-send-email 1.8.1.2
In-Reply-To: <1382271072-15664-1-git-send-email-zhiguohong@tencent.com>
References: <20131018155532.GD2277@redhat.com>
        <1382271072-15664-1-git-send-email-zhiguohong@tencent.com>

The token bucket algorithm (http://en.wikipedia.org/wiki/Token_bucket) is
very simple and widely used for rate limiting and shaping. It's well
suited for blk-throttle, and it naturally works for hierarchical cgroups.
So I took it to replace the original time _slicing_|_trimming_|_extending_
logic.

The advantages are simplicity, reliability and less fluctuation. About 300
lines of time-slicing code are replaced with 60 lines of token-bucket code
in blk-throttle.c.

I've tested this patch with fio using rw=randread and rw=randrw. It
behaves almost the same as the original time-slicing implementation, with
less fluctuation. It has also been tested on a hierarchical setup.

Signed-off-by: Hong Zhiguo <zhiguohong@tencent.com>
Tested-by: Hong Zhiguo <zhiguohong@tencent.com>
---
 block/blk-throttle.c | 285 +++++++++------------------------------------------
 1 file changed, 50 insertions(+), 235 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 8331aba..45d4d91 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2,6 +2,8 @@
  * Interface for controlling IO bandwidth on a request queue
  *
  * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
+ *
+ * Simplify the logic with token bucket, Hong Zhiguo
  */
 
 #include <linux/module.h>
@@ -18,9 +20,6 @@ static int throtl_grp_quantum = 8;
 /* Total max dispatch from all groups in one round */
 static int throtl_quantum = 32;
 
-/* Throttling is performed over 100ms slice and after that slice is renewed */
-static unsigned long throtl_slice = HZ/10;	/* 100 ms */
-
 static struct blkcg_policy blkcg_policy_throtl;
 
 /* A workqueue to queue throttle related work */
@@ -91,6 +90,11 @@ struct tg_stats_cpu {
         struct blkg_rwstat              serviced;
 };
 
+/* Depth of iops token bucket */
+#define THROTL_BURST_IO 8
+/* Depth of bps token bucket */
+#define THROTL_BURST_BYTES (4096 * 8)
+
 struct throtl_grp {
         /* must be the first member */
         struct blkg_policy_data pd;
@@ -133,14 +137,13 @@ struct throtl_grp {
         /* IOPS limits */
         unsigned int iops[2];
 
-        /* Number of bytes disptached in current slice */
-        uint64_t bytes_disp[2];
-        /* Number of bio's dispatched in current slice */
-        unsigned int io_disp[2];
+        /* Token for throttling by bps */
+        uint64_t bytes_token[2];
+        /* Token for throttling by iops */
+        unsigned int io_token[2];
 
-        /* When did we start a new slice */
-        unsigned long slice_start[2];
-        unsigned long slice_end[2];
+        /* Time check-point */
+        unsigned long t_c[2];
 
         /* Per cpu stats pointer */
         struct tg_stats_cpu __percpu *stats_cpu;
@@ -680,171 +683,39 @@ static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
         return false;
 }
 
-static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
-                                               bool rw, unsigned long start)
-{
-        tg->bytes_disp[rw] = 0;
-        tg->io_disp[rw] = 0;
-
-        /*
-         * Previous slice has expired. We must have trimmed it after last
-         * bio dispatch. That means since start of last slice, we never used
-         * that bandwidth. Do try to make use of that bandwidth while giving
-         * credit.
-         */
-        if (time_after_eq(start, tg->slice_start[rw]))
-                tg->slice_start[rw] = start;
-
-        tg->slice_end[rw] = jiffies + throtl_slice;
-        throtl_log(&tg->service_queue,
-                   "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
-                   rw == READ ? 'R' : 'W', tg->slice_start[rw],
-                   tg->slice_end[rw], jiffies);
-}
-
-static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
-{
-        tg->bytes_disp[rw] = 0;
-        tg->io_disp[rw] = 0;
-        tg->slice_start[rw] = jiffies;
-        tg->slice_end[rw] = jiffies + throtl_slice;
-        throtl_log(&tg->service_queue,
-                   "[%c] new slice start=%lu end=%lu jiffies=%lu",
-                   rw == READ ? 'R' : 'W', tg->slice_start[rw],
-                   tg->slice_end[rw], jiffies);
-}
-
-static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
-                                        unsigned long jiffy_end)
-{
-        tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
-}
-
-static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
-                                       unsigned long jiffy_end)
-{
-        tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
-        throtl_log(&tg->service_queue,
-                   "[%c] extend slice start=%lu end=%lu jiffies=%lu",
-                   rw == READ ? 'R' : 'W', tg->slice_start[rw],
-                   tg->slice_end[rw], jiffies);
-}
-
-/* Determine if previously allocated or extended slice is complete or not */
-static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
-{
-        if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
-                return 0;
-
-        return 1;
-}
-
-/* Trim the used slices and adjust slice start accordingly */
-static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
+static void tg_update_token(struct throtl_grp *tg, bool rw)
 {
-        unsigned long nr_slices, time_elapsed, io_trim;
-        u64 bytes_trim, tmp;
+        unsigned long tdiff = jiffies - tg->t_c[rw];
+        u64 token;
 
-        BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
-
-        /*
-         * If bps are unlimited (-1), then time slice don't get
-         * renewed. Don't try to trim the slice if slice is used. A new
-         * slice will start when appropriate.
-         */
-        if (throtl_slice_used(tg, rw))
+        if (!tdiff)
                 return;
 
-        /*
-         * A bio has been dispatched. Also adjust slice_end. It might happen
-         * that initially cgroup limit was very low resulting in high
-         * slice_end, but later limit was bumped up and bio was dispached
-         * sooner, then we need to reduce slice_end. A high bogus slice_end
-         * is bad because it does not allow new slice to start.
-         */
+        token = (u64)tg->iops[rw] * tdiff;
+        do_div(token, HZ);
+        tg->io_token[rw] += token;
 
-        throtl_set_slice_end(tg, rw, jiffies + throtl_slice);
+        token = (u64)tg->bps[rw] * tdiff;
+        do_div(token, HZ);
+        tg->bytes_token[rw] += token;
 
-        time_elapsed = jiffies - tg->slice_start[rw];
-
-        nr_slices = time_elapsed / throtl_slice;
-
-        if (!nr_slices)
-                return;
-        tmp = tg->bps[rw] * throtl_slice * nr_slices;
-        do_div(tmp, HZ);
-        bytes_trim = tmp;
-
-        io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
-
-        if (!bytes_trim && !io_trim)
-                return;
-
-        if (tg->bytes_disp[rw] >= bytes_trim)
-                tg->bytes_disp[rw] -= bytes_trim;
-        else
-                tg->bytes_disp[rw] = 0;
-
-        if (tg->io_disp[rw] >= io_trim)
-                tg->io_disp[rw] -= io_trim;
-        else
-                tg->io_disp[rw] = 0;
-
-        tg->slice_start[rw] += nr_slices * throtl_slice;
-
-        throtl_log(&tg->service_queue,
-                   "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
-                   rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
-                   tg->slice_start[rw], tg->slice_end[rw], jiffies);
+        tg->t_c[rw] = jiffies;
 }
 
 static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
                                   unsigned long *wait)
 {
         bool rw = bio_data_dir(bio);
-        unsigned int io_allowed;
-        unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
-        u64 tmp;
-
-        jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
-
-        /* Slice has just started. Consider one slice interval */
-        if (!jiffy_elapsed)
-                jiffy_elapsed_rnd = throtl_slice;
-
-        jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
-        /*
-         * jiffy_elapsed_rnd should not be a big value as minimum iops can be
-         * 1 then at max jiffy elapsed should be equivalent of 1 second as we
-         * will allow dispatch after 1 second and after that slice should
-         * have been trimmed.
-         */
-
-        tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
-        do_div(tmp, HZ);
-
-        if (tmp > UINT_MAX)
-                io_allowed = UINT_MAX;
-        else
-                io_allowed = tmp;
-
-        if (tg->io_disp[rw] + 1 <= io_allowed) {
+
+        if (tg->io_token[rw]) {
                 if (wait)
                         *wait = 0;
                 return 1;
         }
 
         /* Calc approx time to dispatch */
-        jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
-
-        if (jiffy_wait > jiffy_elapsed)
-                jiffy_wait = jiffy_wait - jiffy_elapsed;
-        else
-                jiffy_wait = 1;
-
         if (wait)
-                *wait = jiffy_wait;
+                *wait = HZ / tg->iops[rw] ?: 1;
         return 0;
 }
 
@@ -852,41 +723,21 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
                                  unsigned long *wait)
 {
         bool rw = bio_data_dir(bio);
-        u64 bytes_allowed, extra_bytes, tmp;
-        unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
-
-        jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
+        u64 extra_bytes;
+        unsigned long jiffy_wait;
 
-        /* Slice has just started. Consider one slice interval */
-        if (!jiffy_elapsed)
-                jiffy_elapsed_rnd = throtl_slice;
-
-        jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
-
-        tmp = tg->bps[rw] * jiffy_elapsed_rnd;
-        do_div(tmp, HZ);
-        bytes_allowed = tmp;
-
-        if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
+        if (bio->bi_size <= tg->bytes_token[rw]) {
                 if (wait)
                         *wait = 0;
                 return 1;
         }
 
         /* Calc approx time to dispatch */
-        extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
-        jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
-
-        if (!jiffy_wait)
-                jiffy_wait = 1;
-
-        /*
-         * This wait time is without taking into consideration the rounding
-         * up we did. Add that time also.
-         */
-        jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
-        if (wait)
-                *wait = jiffy_wait;
+        if (wait) {
+                extra_bytes = bio->bi_size - tg->bytes_token[rw];
+                jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
+                *wait = jiffy_wait ?: 1;
+        }
         return 0;
 }
 
@@ -916,18 +767,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
                 return 1;
         }
 
-        /*
-         * If previous slice expired, start a new one otherwise renew/extend
-         * existing slice to make sure it is at least throtl_slice interval
-         * long since now.
-         */
-        if (throtl_slice_used(tg, rw))
-                throtl_start_new_slice(tg, rw);
-        else {
-                if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
-                        throtl_extend_slice(tg, rw, jiffies + throtl_slice);
-        }
-
+        tg_update_token(tg, rw);
         if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
             tg_with_in_iops_limit(tg, bio, &iops_wait)) {
                 if (wait)
@@ -940,9 +780,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
         if (wait)
                 *wait = max_wait;
 
-        if (time_before(tg->slice_end[rw], jiffies + max_wait))
-                throtl_extend_slice(tg, rw, jiffies + max_wait);
-
         return 0;
 }
 
@@ -976,9 +813,16 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 {
         bool rw = bio_data_dir(bio);
 
-        /* Charge the bio to the group */
-        tg->bytes_disp[rw] += bio->bi_size;
-        tg->io_disp[rw]++;
+        tg_update_token(tg, rw);
+
+        /* Charge the bio to the group and trim the bucket */
+        tg->bytes_token[rw] -= bio->bi_size;
+        if (tg->bytes_token[rw] > THROTL_BURST_BYTES)
+                tg->bytes_token[rw] = THROTL_BURST_BYTES;
+
+        tg->io_token[rw]--;
+        if (tg->io_token[rw] > THROTL_BURST_IO)
+                tg->io_token[rw] = THROTL_BURST_IO;
 
         /*
          * REQ_THROTTLED is used to prevent the same bio to be throttled
@@ -1055,16 +899,6 @@ static void tg_update_disptime(struct throtl_grp *tg)
         tg->flags &= ~THROTL_TG_WAS_EMPTY;
 }
 
-static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
-                                           struct throtl_grp *parent_tg, bool rw)
-{
-        if (throtl_slice_used(parent_tg, rw)) {
-                throtl_start_new_slice_with_credit(parent_tg, rw,
-                                child_tg->slice_start[rw]);
-        }
-
-}
-
 static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
 {
         struct throtl_service_queue *sq = &tg->service_queue;
@@ -1093,7 +927,6 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
          */
         if (parent_tg) {
                 throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
-                start_parent_slice_with_credit(tg, parent_tg, rw);
         } else {
                 throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
                                      &parent_sq->queued[rw]);
@@ -1101,8 +934,6 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
                 tg->td->nr_queued[rw]--;
         }
 
-        throtl_trim_slice(tg, rw);
-
         if (tg_to_put)
                 blkg_put(tg_to_blkg(tg_to_put));
 }
@@ -1386,12 +1217,8 @@ static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft,
          * We're already holding queue_lock and know @tg is valid. Let's
          * apply the new config directly.
          *
-         * Restart the slices for both READ and WRITES. It might happen
-         * that a group's limit are dropped suddenly and we don't want to
-         * account recently dispatched IO with new low rate.
+         * Update dispatch time.
          */
-        throtl_start_new_slice(tg, 0);
-        throtl_start_new_slice(tg, 1);
 
         if (tg->flags & THROTL_TG_PENDING) {
                 tg_update_disptime(tg);
@@ -1527,19 +1354,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
                 throtl_charge_bio(tg, bio);
 
                 /*
-                 * We need to trim slice even when bios are not being queued
-                 * otherwise it might happen that a bio is not queued for
-                 * a long time and slice keeps on extending and trim is not
-                 * called for a long time. Now if limits are reduced suddenly
-                 * we take into account all the IO dispatched so far at new
-                 * low rate and * newly queued IO gets a really long dispatch
-                 * time.
-                 *
-                 * So keep on trimming slice even if bio is not queued.
-                 */
-                throtl_trim_slice(tg, rw);
-
-                /*
                  * @bio passed through this layer without being throttled.
                  * Climb up the ladder. If we're already at the top, it
                  * can be executed directly.
@@ -1552,10 +1366,11 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
         }
 
         /* out-of-limit, queue to @tg */
-        throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
+        throtl_log(sq,
+                   "[%c] btk=%llu sz=%u bps=%llu iotk=%u iops=%u queued=%d/%d",
                    rw == READ ? 'R' : 'W',
-                   tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
-                   tg->io_disp[rw], tg->iops[rw],
+                   tg->bytes_token[rw], bio->bi_size, tg->bps[rw],
+                   tg->io_token[rw], tg->iops[rw],
                    sq->nr_queued[READ], sq->nr_queued[WRITE]);
 
         bio_associate_current(bio);
-- 
1.8.1.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
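
For reference, here is a minimal, self-contained userspace sketch of the
token-bucket accounting that tg_update_token() and throtl_charge_bio()
implement in the patch: tokens are refilled in proportion to the time
elapsed since the last check-point, a request passes only if enough tokens
are available, and the remainder is clamped to a small burst depth when a
request is charged. It is illustrative only and not part of the patch; the
struct, helper names and constants below are invented for the example.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define BURST_BYTES (4096 * 8)  /* bucket depth, mirrors THROTL_BURST_BYTES */

struct bucket {
        uint64_t bps;           /* configured rate in bytes per second */
        uint64_t tokens;        /* bytes currently allowed to pass */
        uint64_t last_ns;       /* time of the last refill */
};

static uint64_t now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
}

/* Add rate * elapsed tokens, as tg_update_token() does with jiffies. */
static void bucket_refill(struct bucket *b)
{
        uint64_t now = now_ns();

        b->tokens += b->bps * (now - b->last_ns) / 1000000000ULL;
        b->last_ns = now;
}

/* Charge one request and clamp the remainder, as throtl_charge_bio() does. */
static bool bucket_try_send(struct bucket *b, uint64_t bytes)
{
        bucket_refill(b);
        if (b->tokens < bytes)
                return false;   /* caller waits about (bytes - tokens) / bps seconds */

        b->tokens -= bytes;
        if (b->tokens > BURST_BYTES)
                b->tokens = BURST_BYTES;
        return true;
}

int main(void)
{
        struct bucket b = { .bps = 1024 * 1024, .tokens = 0, .last_ns = now_ns() };

        /* Try to push one 4 KiB request; a real caller would sleep and retry. */
        printf("4k request %s\n",
               bucket_try_send(&b, 4096) ? "dispatched" : "throttled");
        return 0;
}

Note that, as in the patch, the bucket is clamped when a request is charged
rather than when it is refilled, so a group that has been idle for a long
time can burst by at most roughly one bucket depth once it starts issuing
IO again.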