Hi, all
range-bw was implemented as another I/O scheduling policy of dm-ioband
to support predicable I/O bandwidth between minimum and maximum
bandwidth defined by administrator. So, basic advantages and defects
are same with dm-ioband. Here, minimum I/O bandwidth should be
guaranteed for stable performance or reliability of specific process
groups and I/O bandwidth over maximum should be throttled to protect
the limited I/O resource from over-provisioning in unnecessary usage
or to reserve the I/O bandwidth for another use.
Recently, as many contributors have discussed about similar issues
through the container ML, administrator of several business site
requires the predicable and consistent I/O bandwidth regardless of
system specification (especially disk) and he/she wants the method to
limit I/O bandwidth used by less important service or process group
such like io-throttle by Andrea , at the same time. I think the
proportional I/O bandwidth is also important such like
weight/io-weight policy in dm-ioband.
So, range-bw was implemented to include the two concepts, guaranteeing
of minimum bandwidth and limitation of maximum bandwidth according to
the importance or priority of specific process groups.
range-bw is based on newest version of dm-ioband, bio-cgroup V7(4
patch files), dm-ioband-V1.10.3(1 patch file) and these can be
referred in
http://people.valinux.co.jp/~ryov/dm-ioband/
http://people.valinux.co.jp/~ryov/bio-cgroup/
and the below range-bw patch file(dm-ioband-rangebw-1.10.3.patch)
including Ryo?s patch set is also referred in:
http://www.corsetproject.net/browser/corset_source_code/resource_controllers/disk_controller/Range-BW-for-dmioband-V1.10.3
You have to apply this(dm-ioband-rangebw-1.10.3.patch) patch file
after applying dm-ioband and bio-cgroup patches.
The released range-bw may have some problems and improper code
although I try to test heavily. It is first release ^^
And it is required to reduce the overhead of I/O scheduling and to
optimize the source code.
Any comments or advices is welcome
Ryo Tsuruta, Can you check this patch file ?
for convenience, patch file is attached in this mail.
USAGE Example -------------------------------------------------
The basic usage is same with dm-ioband by Ryo Tsuruta. So, it is
helpful to refer the usage of dm-ioband.
# mount the cgroup for range-bw
mount -t cgroup -o bio none /root/cgroup/bio
# create the 3 groups
mkdir /root/cgroup/bio/bgroup1
mkdir /root/cgroup/bio/bgroup2
mkdir /root/cgroup/bio/bgroup3
# create the ioband device ( name : ioband1 )
echo "0 $(blockdev --getsize /dev/sdb2) ioband /dev/sdb2 1 0 0 none
range-bw 0 :0" | dmsetup create ioband1
# init ioband device ( type and policy )
dmsetup message ioband1 0 type cgroup
dmsetup message ioband1 0 policy range-bw
# attach the groups to the ioband device
dmsetup message ioband1 0 attach 2
dmsetup message ioband1 0 attach 3
dmsetup message ioband1 0 attach 4
# allocate the values ( min-bw and max-bw ) : XXX Kbytes
# range : about 3~5Mbytes
dmsetup message ioband1 0 min-bw 2:3000
dmsetup message ioband1 0 max-bw 2:5000
# range : about 7~10Mbytes
dmsetup message ioband1 0 min-bw 3:7000
dmsetup message ioband1 0 max-bw 3:10000
# range : about 15~20Mbytes
dmsetup message ioband1 0 min-bw 4:15000
dmsetup message ioband1 0 max-bw 4:20000
This patch file is for range-bw policy in
dm-ioband.---------------------------------------------------------------
diff -urN linux-2.6.30-rc1-orig/drivers/md/dm-ioband-ctl.c
linux-2.6.30-rc1/drivers/md/dm-ioband-ctl.c
--- linux-2.6.30-rc1-orig/drivers/md/dm-ioband-ctl.c 2009-04-20
18:16:00.000000000 +0900
+++ linux-2.6.30-rc1/drivers/md/dm-ioband-ctl.c 2009-04-21
13:00:41.000000000 +0900
@@ -789,6 +789,17 @@
}
/*
+ * Check the overflow of maximum bandwidth limit
+ */
+static int is_no_io_mode(struct ioband_group * gp)
+{
+ if(gp->c_io_mode == NO_IO_MODE)
+ return 1;
+
+ return 0;
+}
+
+/*
* Start to control the bandwidth once the number of uncompleted BIOs
* exceeds the value of "io_throttle".
*/
@@ -799,9 +810,24 @@
struct ioband_device *dp = gp->c_banddev;
unsigned long flags;
int direct;
+ unsigned long now, time_step;
spin_lock_irqsave(&dp->g_lock, flags);
+ gp = ioband_group_get(gp, bio);
+
+ /*
+ * This part is for controlling the max bandwidth of range-bw policy
+ */
+ if(is_no_io_mode(gp)){
+ now = jiffies;
+ if(time_after(dp->g_next_time_period, now)){
+ time_step = dp->g_next_time_period - now;
+ range_bw_timer_register(gp->c_timer,
(time_step + TIME_COMPENSATOR), (unsigned long)gp);
+ wait_event_lock_irq(gp->c_max_bw_over_waitq,
!is_no_io_mode(gp), dp->g_lock, do_nothing());
+ }
+ }
+
/*
* The device is suspended while some of the ioband device
* configurations are being changed.
@@ -811,7 +837,6 @@
!is_device_suspended(dp), dp->g_lock,
do_nothing());
- gp = ioband_group_get(gp, bio);
prevent_burst_bios(gp, bio);
if (should_pushback_bio(gp)) {
spin_unlock_irqrestore(&dp->g_lock, flags);
diff -urN linux-2.6.30-rc1-orig/drivers/md/dm-ioband.h
linux-2.6.30-rc1/drivers/md/dm-ioband.h
--- linux-2.6.30-rc1-orig/drivers/md/dm-ioband.h 2009-04-20
18:16:00.000000000 +0900
+++ linux-2.6.30-rc1/drivers/md/dm-ioband.h 2009-04-20 18:12:50.000000000 +0900
@@ -8,6 +8,10 @@
#include <linux/version.h>
#include <linux/wait.h>
+#include <linux/jiffies.h>
+#include <linux/random.h>
+#include <linux/time.h>
+#include <linux/timer.h>
#define DM_MSG_PREFIX "ioband"
@@ -16,6 +20,12 @@
#define IOBAND_NAME_MAX 31
#define IOBAND_ID_ANY (-1)
+#define MAX_BW_OVER 1
+#define MAX_BW_UNDER 0
+#define NO_IO_MODE 4
+
+#define TIME_COMPENSATOR 10
+
struct ioband_group;
struct ioband_device {
@@ -74,6 +84,16 @@
int g_token_left;
/* left-over tokens from the previous epoch */
int g_token_extra;
+
+ /* members for range-bw policy */
+ int g_min_bw_total;
+ unsigned long g_next_time_period;
+ int g_time_period_expired;
+ struct ioband_group * g_running_gp;
+ int g_total_min_bw_token;
+ int g_consumed_min_bw_token;
+ int g_io_mode;
+
};
struct ioband_group_stat {
@@ -110,8 +130,31 @@
/* rfu */
/* struct bio_list c_ordered_tag_bios; */
+
+ /* members for range-bw policy */
+ wait_queue_head_t c_max_bw_over_waitq;
+ spinlock_t c_lock;
+ struct timer_list * c_timer;
+ int timer_set;
+ int c_min_bw;
+ int c_max_bw;
+ int c_time_slice_expired;
+ int c_min_bw_token;
+ int c_max_bw_token;
+ int c_consumed_min_bw_token;
+ int c_is_over_max_bw;
+ int c_io_mode;
+ unsigned long c_time_slice;
+ unsigned long c_time_slice_start;
+ unsigned long c_time_slice_end;
+ int c_wait_p_count;
+
};
+extern void range_bw_timeover(unsigned long gp);
+extern void range_bw_timer_register(struct timer_list * ptimer,
unsigned long timeover, unsigned long gp);
+extern int policy_range_bw_init(struct ioband_device *dp, int argc,
char **argv);
+
#define IOBAND_URGENT 1
#define DEV_BIO_BLOCKED 1
diff -urN linux-2.6.30-rc1-orig/drivers/md/dm-ioband-policy.c
linux-2.6.30-rc1/drivers/md/dm-ioband-policy.c
--- linux-2.6.30-rc1-orig/drivers/md/dm-ioband-policy.c 2009-04-20
18:16:00.000000000 +0900
+++ linux-2.6.30-rc1/drivers/md/dm-ioband-policy.c 2009-04-20
16:12:55.000000000 +0900
@@ -453,5 +453,6 @@
{"default", policy_default_init},
{"weight", policy_weight_init},
{"weight-iosize", w2_policy_weight_init},
+ {"range-bw", policy_range_bw_init},
{NULL, policy_default_init}
};
diff -urN linux-2.6.30-rc1-orig/drivers/md/dm-ioband-rangebw.c
linux-2.6.30-rc1/drivers/md/dm-ioband-rangebw.c
--- linux-2.6.30-rc1-orig/drivers/md/dm-ioband-rangebw.c 1970-01-01
09:00:00.000000000 +0900
+++ linux-2.6.30-rc1/drivers/md/dm-ioband-rangebw.c 2009-04-20
16:00:09.000000000 +0900
@@ -0,0 +1,562 @@
+/*
+ * dm-ioband-rangebw.c
+ *
+ * This is a I/O control policy to support the Range Bandwidth in Disk I/O.
+ * And this policy is for dm-ioband controller by Ryo Tsuruta,
Hirokazu Takahashi
+ *
+ * Copyright (C) 2008 - 2011 Electronics and Telecommunications
Research Institute(ETRI)
+ *
+ * This program is free software. you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License(GPL) as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * Contact Information:
+ * Dong-Jae, Kang <[email protected]>, Chei-Yol,Kim <[email protected]>,
+ * Sung-In,Jung <[email protected]>
+ */
+
+#include <linux/bio.h>
+#include <linux/workqueue.h>
+#include <linux/rbtree.h>
+#include "dm.h"
+#include "dm-bio-list.h"
+#include "dm-ioband.h"
+
+/*
+ * Functions for Range Bandwidth(range-bw) policy based on the time
slice and token.
+ */
+#define DEFAULT_BUCKET 2
+#define DEFAULT_TOKENPOOL 2048
+
+#define DEFAULT_MIN_BW 0
+#define TIME_SLICE_EXPIRED 1
+#define TIME_SLICE_NOT_EXPIRED 0
+
+#define MINBW_IO_MODE 0
+#define LEFTOVER_IO_MODE 1
+#define RANGE_IO_MODE 2
+#define DEFAULT_IO_MODE 3
+#define NO_IO_MODE 4
+
+#define MINBW_PRIO_BASE 10
+#define OVER_IO_RATE 4
+
+static const int time_slice_base = HZ / 10;
+static const int leftover_time_slice_base = HZ / 25;
+static const int range_time_slice_base = HZ / 50;
+
+/*
+ * g_restart_bios function for range-bw policy
+ */
+static int range_bw_restart_bios(struct ioband_device *dp)
+{
+ return 1;
+}
+
+/*
+ * Allocate the time slice when IO mode is MINBW_IO_MODE,
RANGE_IO_MODE or LEFTOVER_IO_MODE
+ */
+static int set_time_slice(struct ioband_group * gp)
+{
+ struct ioband_device *dp = gp->c_banddev;
+ int dp_io_mode, gp_io_mode;
+ unsigned long now = jiffies;
+
+ dp_io_mode = dp->g_io_mode;
+ gp_io_mode = gp->c_io_mode;
+
+ gp->c_time_slice_start = now;
+
+ if(dp_io_mode == LEFTOVER_IO_MODE){
+ gp->c_time_slice_end = now + leftover_time_slice_base;
+ return 0;
+ }
+
+ if(gp_io_mode == MINBW_IO_MODE)
+ gp->c_time_slice_end = now + gp->c_time_slice;
+ else if(gp_io_mode == RANGE_IO_MODE)
+ gp->c_time_slice_end = now + range_time_slice_base;
+ else // gp_io_mode == DEFAULT_IO_MODE
+ gp->c_time_slice_end = now + time_slice_base;
+
+ gp->c_time_slice_expired = TIME_SLICE_NOT_EXPIRED;
+
+ return 0;
+}
+
+/*
+ * Calculate the priority of given ioband_group
+ */
+static int range_bw_priority(struct ioband_group * gp)
+{
+ struct ioband_device *dp = gp->c_banddev;
+ int prio = 0;
+
+ if(dp->g_io_mode == LEFTOVER_IO_MODE){
+ prio = random32() % MINBW_PRIO_BASE;
+ if(prio == 0)
+ prio = 1;
+ }else if(gp->c_io_mode == MINBW_IO_MODE){
+ prio = (gp->c_min_bw_token - gp->c_consumed_min_bw_token) * MINBW_PRIO_BASE;
+ }else if(gp->c_io_mode == DEFAULT_IO_MODE){
+ prio = MINBW_PRIO_BASE;
+ }else if(gp->c_io_mode == RANGE_IO_MODE){
+ prio = MINBW_PRIO_BASE / 2;
+ }else{
+ prio = 0;
+ }
+
+ return prio;
+}
+
+/*
+ * Check whether this group has right to issue an I/O in range-bw policy mode.
+ * Return 0 if it doesn't have right, otherwise return the non-zero value.
+ */
+static int has_right_to_issue(struct ioband_group *gp)
+{
+ struct ioband_device *dp = gp->c_banddev;
+ int prio;
+
+ if(gp->c_prio_blocked > 0 || gp->c_blocked - gp->c_prio_blocked > 0){
+ prio = range_bw_priority(gp);
+ if(prio <= 0)
+ return 1;
+ return prio;
+ }
+
+ if(gp == dp->g_running_gp){
+
+ if(gp->c_time_slice_expired == TIME_SLICE_EXPIRED){
+
+ gp->c_time_slice_expired = TIME_SLICE_NOT_EXPIRED;
+ gp->c_time_slice_end = 0;
+
+ return 0;
+ }
+
+ if(gp->c_time_slice_end == 0)
+ set_time_slice(gp);
+
+ return range_bw_priority(gp);
+
+ }
+
+ dp->g_running_gp = gp;
+ set_time_slice(gp);
+
+ return range_bw_priority(gp);
+}
+
+/*
+ * Reset all variables related with range-bw token and time slice
+ */
+static int reset_range_bw_token(struct ioband_group * gp, unsigned long now)
+{
+ struct ioband_device *dp = gp->c_banddev;
+ struct ioband_group *p;
+
+ list_for_each_entry(p, &dp->g_groups, c_list){
+ p->c_consumed_min_bw_token = 0;
+ p->c_is_over_max_bw = MAX_BW_UNDER;
+ if(p->c_io_mode != DEFAULT_IO_MODE)
+ p->c_io_mode = MINBW_IO_MODE;
+ }
+
+ dp->g_consumed_min_bw_token = 0;
+
+ dp->g_next_time_period = now + HZ;
+ dp->g_time_period_expired = TIME_SLICE_NOT_EXPIRED;
+ dp->g_io_mode = MINBW_IO_MODE;
+
+ list_for_each_entry(p, &dp->g_groups, c_list){
+ if(waitqueue_active(&p->c_max_bw_over_waitq))
+ wake_up_all(&p->c_max_bw_over_waitq);
+ }
+ return 0;
+}
+
+/*
+ * Use tokens(Increase the number of consumed token) to issue an I/O
for guranteeing the range-bw.
+ * and check the expiration of local and global time slice, and
overflow of max bw
+ */
+static int range_bw_consume_token(struct ioband_group *gp, int count, int flag)
+{
+ struct ioband_device *dp = gp->c_banddev;
+ struct ioband_group *p;
+ unsigned long now = jiffies;
+ int io_mode;
+
+ dp->g_current = gp;
+
+ if(dp->g_next_time_period == 0){
+ dp->g_next_time_period = now + HZ;
+ dp->g_time_period_expired = TIME_SLICE_NOT_EXPIRED;
+ }
+
+ if(time_after(now, dp->g_next_time_period)){
+ reset_range_bw_token(gp, now);
+ }else{
+ gp->c_consumed_min_bw_token += count;
+ dp->g_consumed_min_bw_token += count;
+
+ if(gp->c_max_bw > 0 && gp->c_consumed_min_bw_token >=
gp->c_max_bw_token){
+ gp->c_is_over_max_bw = MAX_BW_OVER;
+ gp->c_io_mode = NO_IO_MODE;
+ return R_YIELD;
+ }
+
+ if(gp->c_io_mode != RANGE_IO_MODE &&
gp->c_min_bw_token <= gp->c_consumed_min_bw_token){
+ gp->c_io_mode = RANGE_IO_MODE;
+
+ if(dp->g_total_min_bw_token <=
dp->g_consumed_min_bw_token){
+ list_for_each_entry(p, &dp->g_groups, c_list){
+ if(p->c_io_mode ==
RANGE_IO_MODE || p->c_io_mode == DEFAULT_IO_MODE){
+ io_mode = 1;
+ }else{
+ io_mode = 0;
+ break;
+ }
+ }
+
+ if(io_mode && dp->g_io_mode == MINBW_IO_MODE)
+ dp->g_io_mode = LEFTOVER_IO_MODE;
+ }
+ }
+ }
+
+ if(gp->c_time_slice_end != 0 && time_after(now, gp->c_time_slice_end)){
+ gp->c_time_slice_expired = TIME_SLICE_EXPIRED;
+ return R_YIELD;
+ }
+
+ return R_OK;
+}
+
+/*
+ * Check if this group is able to receive a new bio.
+ * in range bw policy, we only check that ioband device should be blocked
+ */
+static int range_bw_queue_full(struct ioband_group *gp)
+{
+
+ return (gp->c_blocked >= gp->c_limit);
+}
+
+/*
+ * Convert the bw valuse to the number of bw token
+ * bw : Kbyte unit bandwidth
+ * token_base : the number of tokens used for one 1Kbyte-size IO
+ * -- Attention : Currently, We support the 512byte or 1Kbyte per 1 token
+ */
+static int convert_bw_to_token(int bw, int token_unit)
+{
+ int token;
+ int token_base;
+
+ token_base = (1 << token_unit) / 4; // token_base is 1 or 2
+ token = bw * token_base;
+
+ return token;
+}
+
+
+/*
+ * Allocate the time slice for MINBW_IO_MODE to each group
+ */
+static void range_bw_time_slice_init(struct ioband_group * gp)
+{
+ struct ioband_device * dp = gp->c_banddev;
+ struct ioband_group * p;
+
+ list_for_each_entry(p, &dp->g_groups, c_list){
+ if(p->c_min_bw == 0)
+ p->c_time_slice = time_slice_base;
+ else
+ p->c_time_slice = time_slice_base + ((time_slice_base *
p->c_min_bw) / dp->g_min_bw_total);
+ }
+}
+
+
+/*
+ * Allocate the min_bw and min_bw_token to the given group
+ */
+static void set_min_bw(struct ioband_group *gp, int new)
+{
+ struct ioband_device *dp = gp->c_banddev;
+ struct ioband_group * p;
+ int token_unit;
+
+ dp->g_min_bw_total += (new - gp->c_min_bw);
+ gp->c_min_bw = new;
+
+ if(new)
+ gp->c_io_mode = MINBW_IO_MODE;
+ else
+ gp->c_io_mode = DEFAULT_IO_MODE;
+
+ range_bw_time_slice_init(gp);
+
+ token_unit = dp->g_token_unit;
+ gp->c_min_bw_token = convert_bw_to_token(gp->c_min_bw, token_unit);
+ dp->g_total_min_bw_token = convert_bw_to_token(dp->g_min_bw_total,
token_unit);
+
+ if (dp->g_min_bw_total == 0) {
+ list_for_each_entry(p, &dp->g_groups, c_list)
+ p->c_limit = 1;
+ } else {
+ list_for_each_entry(p, &dp->g_groups, c_list) {
+ p->c_limit = (dp->g_io_limit[0] + dp->g_io_limit[1]) *
+ p->c_min_bw / dp->g_min_bw_total /
+ OVER_IO_RATE + 1;
+ }
+ }
+
+ return;
+}
+
+/*
+ * Allocate the max_bw and max_bw_token to the pointed group
+ */
+static void set_max_bw(struct ioband_group *gp, int new)
+{
+ struct ioband_device *dp = gp->c_banddev;
+ int token_unit;
+
+ token_unit = dp->g_token_unit;
+
+ gp->c_max_bw = new;
+ gp->c_max_bw_token = convert_bw_to_token(new, token_unit);
+
+ return;
+
+}
+
+static void init_range_bw_token_bucket(struct ioband_device *dp, int val)
+{
+ dp->g_token_bucket = ((dp->g_io_limit[0] + dp->g_io_limit[1]) *
+ DEFAULT_BUCKET) << dp->g_token_unit;
+ if (!val)
+ val = DEFAULT_TOKENPOOL << dp->g_token_unit;
+ if (val < dp->g_token_bucket)
+ val = dp->g_token_bucket;
+ dp->g_carryover = val/dp->g_token_bucket;
+ dp->g_token_left = 0;
+}
+
+static int policy_range_bw_param(struct ioband_group *gp, char *cmd,
char *value)
+{
+ long val;
+ int r = 0, err;
+
+ err = strict_strtol(value, 0, &val);
+
+ if (!strcmp(cmd, "min-bw")) {
+ if (!err && 0 <= val && val <= (INT_MAX / 2))
+ set_min_bw(gp, val);
+ else
+ r = -EINVAL;
+ } else if (!strcmp(cmd, "max-bw")) {
+ if ((!err && 0 <= val && val <= (INT_MAX / 2) && gp->c_min_bw <=
val) || val == 0)
+ set_max_bw(gp, val);
+ else
+ r = -EINVAL;
+ } else {
+ r = -EINVAL;
+ }
+
+ return r;
+}
+
+static int policy_range_bw_ctr(struct ioband_group *gp, char *arg)
+{
+ int ret;
+
+ if (!arg)
+ arg = __stringify(DEFAULT_MIN_BW);
+
+ init_waitqueue_head(&gp->c_max_bw_over_waitq);
+ spin_lock_init(&gp->c_lock);
+
+ gp->c_min_bw = 0;
+ gp->c_max_bw = 0;
+ gp->c_io_mode = DEFAULT_IO_MODE;
+ gp->c_time_slice_expired = TIME_SLICE_NOT_EXPIRED;
+ gp->c_min_bw_token = 0;
+ gp->c_max_bw_token = 0;
+ gp->c_consumed_min_bw_token = 0;
+ gp->c_is_over_max_bw = MAX_BW_UNDER;
+ gp->c_time_slice_start = 0;
+ gp->c_time_slice_end = 0;
+ gp->c_wait_p_count = 0;//
+
+ gp->c_time_slice = time_slice_base;
+
+ gp->c_timer = kmalloc(sizeof(struct timer_list), GFP_KERNEL);
+ if(gp->c_timer == NULL) return -EINVAL; // return value may
be not appropriate !!!
+ memset(gp->c_timer, 0, sizeof(struct timer_list));
+ gp->timer_set = 0;
+
+ ret = policy_range_bw_param(gp, "min-bw", arg);
+
+ return ret;
+}
+
+static void policy_range_bw_dtr(struct ioband_group *gp)
+{
+ struct ioband_device *dp = gp->c_banddev;
+
+ set_min_bw(gp, 0);
+ gp->c_time_slice = 0;
+ set_max_bw(gp, 0);
+
+ dp->g_running_gp = NULL;
+
+ if(gp->c_timer != NULL){
+ del_timer(gp->c_timer);
+ kfree(gp->c_timer);
+ }
+}
+
+static void policy_range_bw_show(struct ioband_group *gp, int *szp,
+ char *result, unsigned int maxlen)
+{
+ struct ioband_group *p;
+ struct ioband_device *dp = gp->c_banddev;
+ struct rb_node *node;
+ int sz = *szp; /* used in DMEMIT() */
+
+ DMEMIT(" %d :%d", dp->g_token_bucket * dp->g_carryover, gp->c_min_bw);
+
+ for (node = rb_first(&gp->c_group_root); node; node = rb_next(node)) {
+ p = rb_entry(node, struct ioband_group, c_group_node);
+ DMEMIT(" %d:%d:%d", p->c_id, p->c_min_bw, p->c_max_bw);
+ }
+ *szp = sz;
+}
+
+static int range_bw_prepare_token(struct ioband_group *gp, struct bio
*bio, int flag)
+{
+ struct ioband_device *dp = gp->c_banddev;
+ int unit;
+ int bio_count;
+ int token_count = 0;
+
+ unit = (1 << dp->g_token_unit);
+ bio_count = bio_sectors(bio);
+
+ if(unit == 8) // 1 token is used for 512 bytes
+ token_count = bio_count;
+ else if(unit == 4) // 1 token is used for 1024 bytes
+ token_count = bio_count / 2;
+ else if(unit == 2) // 1 token is used for 2048 bytes
+ token_count = bio_count / 4;
+ else if(unit == 1) // 1 token is used for 4096 bytes
+ token_count = bio_count / 8;
+
+ return range_bw_consume_token(gp, token_count, flag);
+}
+
+void range_bw_timer_register(struct timer_list * ptimer, unsigned
long timeover, unsigned long gp)
+{
+ struct ioband_group * group = (struct ioband_group *)gp;
+
+ if(group->timer_set == 0){
+ init_timer(ptimer);
+ ptimer->expires = get_jiffies_64() + timeover;
+ ptimer->data = gp;
+ ptimer->function = range_bw_timeover;
+ add_timer(ptimer);
+ group->timer_set = 1;
+ }
+}
+
+/*
+ * Timer Handler function to protect the all processes's hanging in
lower min-bw configuration
+ */
+void range_bw_timeover(unsigned long gp)
+{
+ struct ioband_group * group = (struct ioband_group *)gp;
+
+ if(group->c_is_over_max_bw == MAX_BW_OVER){
+ group->c_is_over_max_bw = MAX_BW_UNDER;
+ }
+
+ if(group->c_io_mode == NO_IO_MODE){
+ group->c_io_mode = MINBW_IO_MODE;
+ }
+
+ if(waitqueue_active(&group->c_max_bw_over_waitq))
+ wake_up_all(&group->c_max_bw_over_waitq);
+
+ group->timer_set = 0;
+}
+
+/*
+ * <Method> <description>
+ * g_can_submit : To determine whether a given group has the right to
+ * submit BIOs. The larger the return value the higher the
+ * priority to submit. Zero means it has no right.
+ * g_prepare_bio : Called right before submitting each BIO.
+ * g_restart_bios : Called if this ioband device has some BIOs blocked but none
+ * of them can be submitted now. This method has to
+ * reinitialize the data to restart to submit BIOs and return
+ * 0 or 1.
+ * The return value 0 means that it has become able to submit
+ * them now so that this ioband device will continue its work.
+ * The return value 1 means that it is still unable to submit
+ * them so that this device will stop its work. And this
+ * policy module has to reactivate the device when it gets
+ * to be able to submit BIOs.
+ * g_hold_bio : To hold a given BIO until it is submitted.
+ * The default function is used when this method is undefined.
+ * g_pop_bio : To select and get the best BIO to submit.
+ * g_group_ctr : To initalize the policy own members of struct ioband_group.
+ * g_group_dtr : Called when struct ioband_group is removed.
+ * g_set_param : To update the policy own date.
+ * The parameters can be passed through "dmsetup message"
+ * command.
+ * g_should_block : Called every time this ioband device receive a BIO.
+ * Return 1 if a given group can't receive any more BIOs,
+ * otherwise return 0.
+ * g_show : Show the configuration.
+ */
+
+int policy_range_bw_init(struct ioband_device *dp,
+ int argc, char **argv)
+{
+ long val;
+ int r = 0;
+
+ if (argc < 1)
+ val = 0;
+ else {
+ r = strict_strtol(argv[0], 0, &val);
+ if (r || val < 0)
+ return -EINVAL;
+ }
+
+ dp->g_can_submit = has_right_to_issue;
+ dp->g_prepare_bio = range_bw_prepare_token;
+ dp->g_restart_bios = range_bw_restart_bios;
+ dp->g_group_ctr = policy_range_bw_ctr;
+ dp->g_group_dtr = policy_range_bw_dtr;
+ dp->g_set_param = policy_range_bw_param;
+ dp->g_should_block = range_bw_queue_full;
+ dp->g_show = policy_range_bw_show;
+
+ dp->g_min_bw_total = 0;
+ dp->g_running_gp = NULL;
+ dp->g_total_min_bw_token = 0;
+ dp->g_io_mode = MINBW_IO_MODE;
+ dp->g_consumed_min_bw_token = 0;
+ dp->g_current = NULL;
+ dp->g_next_time_period = 0;
+ dp->g_time_period_expired = TIME_SLICE_NOT_EXPIRED;
+
+ dp->g_token_unit = PAGE_SHIFT - 9;
+ init_range_bw_token_bucket(dp, val);
+
+ return 0;
+}
diff -urN linux-2.6.30-rc1-orig/drivers/md/Makefile
linux-2.6.30-rc1/drivers/md/Makefile
--- linux-2.6.30-rc1-orig/drivers/md/Makefile 2009-04-20
18:16:00.000000000 +0900
+++ linux-2.6.30-rc1/drivers/md/Makefile 2009-04-20 16:42:47.000000000 +0900
@@ -8,7 +8,7 @@
dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \
dm-snap-persistent.o
dm-mirror-y += dm-raid1.o
-dm-ioband-y += dm-ioband-ctl.o dm-ioband-policy.o dm-ioband-type.o
+dm-ioband-y += dm-ioband-ctl.o dm-ioband-policy.o dm-ioband-type.o
dm-ioband-rangebw.o
md-mod-y += md.o bitmap.o
raid456-y += raid5.o
raid6_pq-y += raid6algos.o raid6recov.o raid6tables.o \
--
Best Regards,
Dong-Jae Kang
Hi Dong-Jae,
> range-bw is based on newest version of dm-ioband, bio-cgroup V7(4
> patch files), dm-ioband-V1.10.3(1 patch file) and these can be
> referred in
> http://people.valinux.co.jp/~ryov/dm-ioband/
> http://people.valinux.co.jp/~ryov/bio-cgroup/
> and the below range-bw patch file(dm-ioband-rangebw-1.10.3.patch)
> including Ryo’s patch set is also referred in:
> http://www.corsetproject.net/browser/corset_source_code/resource_controllers/disk_controller/Range-BW-for-dmioband-V1.10.3
> You have to apply this(dm-ioband-rangebw-1.10.3.patch) patch file
> after applying dm-ioband and bio-cgroup patches.
>
> The released range-bw may have some problems and improper code
> although I try to test heavily. It is first release ^^
> And it is required to reduce the overhead of I/O scheduling and to
> optimize the source code.
> Any comments or advices is welcome
>
> Ryo Tsuruta, Can you check this patch file ?
> for convenience, patch file is attached in this mail.
I took a quick look at your patch. It seems to be no problem for
existing dm-ioband code, but I would suggest you that you had better
use checkpatch.pl to check for your coding style.
The patch could be applied and compiled successfully. I did a simple
test, running fio on each cgroup in 30 seconds simultaneously, and got
the following results.
w/o range-bw w/ range-bw (min&max-bw settings)
cgroup1 331KB/s 102KB/s (100KB)
cgroup2 331KB/s 196KB/s (200KB)
Do you have any benchmark resutls? I'd be very interested to see it.
Thanks,
Ryo Tsuruta
????{.n?+???????+%?????ݶ??w??{.n?+????{??G?????{ay?ʇڙ?,j??f???h?????????z_??(?階?ݢj"???m??????G????????????&???~???iO???z??v?^?m????????????I?
Hi Ryo Tsuruta,
2009/4/24 Ryo Tsuruta <[email protected]>:
> Hi Dong-Jae,
>
>> range-bw is based on newest version of dm-ioband, bio-cgroup V7(4
>> patch files), dm-ioband-V1.10.3(1 patch file) and these can be
>> referred in
>> http://people.valinux.co.jp/~ryov/dm-ioband/
>> http://people.valinux.co.jp/~ryov/bio-cgroup/
>> and the below range-bw patch file(dm-ioband-rangebw-1.10.3.patch)
>> including Ryo?s patch set is also referred in:
>> http://www.corsetproject.net/browser/corset_source_code/resource_controllers/disk_controller/Range-BW-for-dmioband-V1.10.3
>> You have to apply this(dm-ioband-rangebw-1.10.3.patch) patch file
>> after applying dm-ioband and bio-cgroup patches.
>>
>> The released range-bw may have some problems and improper code
>> although I try to test heavily. It is first release ^^
>> And it is required to reduce the overhead of I/O scheduling and to
>> optimize the source code.
>> Any comments or advices is welcome
>>
>> Ryo Tsuruta, Can you check this patch file ?
>> for convenience, patch file is attached in this mail.
>
> I took a quick look at your patch. It seems to be no problem for
> existing dm-ioband code, but I would suggest you that you had better
> use checkpatch.pl to check for your coding style.
> The patch could be applied and compiled successfully. I did a simple
> test, running fio on each cgroup in 30 seconds simultaneously, and got
> the following results.
>
> ? ? ? ? ? ? ? w/o range-bw ? ? w/ range-bw (min&max-bw settings)
> cgroup1 ? ? ? ? ?331KB/s ? ? ? ? ? ? 102KB/s (100KB)
> cgroup2 ? ? ? ? ?331KB/s ? ? ? ? ? ? 196KB/s (200KB)
>
> Do you have any benchmark resutls? I'd be very interested to see it.
>
> Thanks,
> Ryo Tsuruta
>
Thank you for your comments about range-bw
and your recommendation of checkpatch.pl seems to be useful for me.
I wonder your test configuration and environment.
Your result is some strange for me, because I didn't use the testing tool, fio.
I have used xdd, bonnie++ and tiobench during the test.
So, I will try to evaluate range-bw using fio, to do that, I need to
know your configuration and basic environment in briefly
if it don't bother you, can you give me the information?
and I attached the result of basic evaluation of range-bw using xdd
6.5.(Test Result for range-bw_english.pdf)
it was performed to evaluate the basic functionalities in one process per group.
actually, more evaluation is need in specfic envinronment like as
massive I/O by huge processes in each group.
and it is going on now
--
Best Regards,
Dong-Jae Kang
Hi Dong-Jae,
> I wonder your test configuration and environment.
> Your result is some strange for me, because I didn't use the testing tool, fio.
> I have used xdd, bonnie++ and tiobench during the test.
>
> So, I will try to evaluate range-bw using fio, to do that, I need to
> know your configuration and basic environment in briefly
> if it don't bother you, can you give me the information?
I used fio 1.22. The below is a script which I actually ran.
#!/bin/sh
dev1=/dev/sdb4 # SATA disk
echo "0 $(sudo blockdev --getsize $dev1) ioband $dev1 share2 0 0
cgroup range-bw 0" | sudo dmsetup create ioband1
arg="--time_based --runtime=10 --ioengine=libaio --iodepth=50 \
--direct=1 --norandommap"
dmsetup message ioband1 0 attach 2
dmsetup message ioband1 0 attach 3
dmsetup message ioband1 0 min-bw 2:100
dmsetup message ioband1 0 max-bw 2:100
dmsetup message ioband1 0 min-bw 3:200
dmsetup message ioband1 0 max-bw 3:200
echo $$ > /cgroup/grp1/tasks
fio $arg --rw=randread --name=grp1 --filename=/dev/mapper/ioband1 \
--output=r2-1.log &
echo $$ > /cgroup/grp2/tasks
fio $arg --rw=randread --name=grp2 --filename=/dev/mapper/ioband1 \
--output=r2-2.log &
wait
Please feel free to ask me if you need more information.
> and I attached the result of basic evaluation of range-bw using xdd
> 6.5.(Test Result for range-bw_english.pdf)
> it was performed to evaluate the basic functionalities in one process per group.
Thanks.
> actually, more evaluation is need in specfic envinronment like as
> massive I/O by huge processes in each group.
> and it is going on now
I'm look forward to seeing the results.
Thanks,
Ryo Tsuruta
Hi Ryo
I am sorry for delayed reply.
I tested range-bw policy as you send me your configuration
and I found some logical bugs in range-bw including big scheduling overhead.
> I used fio 1.22. The below is a script which I actually ran.
>
> ?#!/bin/sh
> ?dev1=/dev/sdb4 # SATA disk
> ?echo "0 $(sudo blockdev --getsize $dev1) ioband $dev1 share2 0 0
> ? ?cgroup range-bw 0" | sudo dmsetup create ioband1
>
> ?arg="--time_based --runtime=10 --ioengine=libaio --iodepth=50 \
> ? ? ? --direct=1 --norandommap"
>
> ?dmsetup message ioband1 0 attach 2
> ?dmsetup message ioband1 0 attach 3
> ?dmsetup message ioband1 0 min-bw 2:100
> ?dmsetup message ioband1 0 max-bw 2:100
> ?dmsetup message ioband1 0 min-bw 3:200
> ?dmsetup message ioband1 0 max-bw 3:200
I recommend it is better to use different value in min-bw and max-bw
because its concept is to support range-bw with the minimum
guaranteeing and maximum limitation.
exampe - min-bw : 1024 / max-bw : 3072
> ?echo $$ > /cgroup/grp1/tasks
> ?fio $arg --rw=randread --name=grp1 --filename=/dev/mapper/ioband1 \
> ? ? ? ? ?--output=r2-1.log &
> ?echo $$ > /cgroup/grp2/tasks
> ?fio $arg --rw=randread --name=grp2 --filename=/dev/mapper/ioband1 \
> ? ? ? ? ?--output=r2-2.log &
> ?wait
>
> Please feel free to ask me if you need more information.
>
>
> I'm look forward to seeing the results.
>
Thanks for your comments.
And I will fix the bugs and re-release it after several days
at that time, I inform you about that.
Ryo,
now, I am using bio-cgroup.....
but, Is there additional things I have to consider to support AIO or
dealyed I/O?
--
Best Regards,
Dong-Jae Kang
Hi Dong-Jae,
From: Dong-Jae Kang <[email protected]>
Subject: Re: [PATCH] range-bw: Another I/O scheduling policy of dm-ioband supporting the predicable I/O bandwidth (range bandwidth)
Date: Thu, 30 Apr 2009 14:15:24 +0900
> Hi Ryo
>
> I am sorry for delayed reply.
> I tested range-bw policy as you send me your configuration
> and I found some logical bugs in range-bw including big scheduling overhead.
>
> > I used fio 1.22. The below is a script which I actually ran.
> >
> > ?#!/bin/sh
> > ?dev1=/dev/sdb4 # SATA disk
> > ?echo "0 $(sudo blockdev --getsize $dev1) ioband $dev1 share2 0 0
> > ? ?cgroup range-bw 0" | sudo dmsetup create ioband1
> >
> > ?arg="--time_based --runtime=10 --ioengine=libaio --iodepth=50 \
> > ? ? ? --direct=1 --norandommap"
> >
> > ?dmsetup message ioband1 0 attach 2
> > ?dmsetup message ioband1 0 attach 3
> > ?dmsetup message ioband1 0 min-bw 2:100
> > ?dmsetup message ioband1 0 max-bw 2:100
> > ?dmsetup message ioband1 0 min-bw 3:200
> > ?dmsetup message ioband1 0 max-bw 3:200
>
> I recommend it is better to use different value in min-bw and max-bw
> because its concept is to support range-bw with the minimum
> guaranteeing and maximum limitation.
> exampe - min-bw : 1024 / max-bw : 3072
>
> > ?echo $$ > /cgroup/grp1/tasks
> > ?fio $arg --rw=randread --name=grp1 --filename=/dev/mapper/ioband1 \
> > ? ? ? ? ?--output=r2-1.log &
> > ?echo $$ > /cgroup/grp2/tasks
> > ?fio $arg --rw=randread --name=grp2 --filename=/dev/mapper/ioband1 \
> > ? ? ? ? ?--output=r2-2.log &
> > ?wait
> >
> > Please feel free to ask me if you need more information.
> >
> >
> > I'm look forward to seeing the results.
> >
> Thanks for your comments.
> And I will fix the bugs and re-release it after several days
> at that time, I inform you about that.
>
> Ryo,
> now, I am using bio-cgroup.....
> but, Is there additional things I have to consider to support AIO or
> dealyed I/O?
bio-cgroup is renamed to bklio-cgroup. :)
I don't clearly understand about range-bw but I think that you don't
need to consider to support them and it should be taken care of by
blkio-cgroup and the common part of dm-ioband, not the policy
implementation. Do you have any problems with AIO or delayed write?
>
> --
> Best Regards,
> Dong-Jae Kang