From: Vivek Goyal <vgoyal@redhat.com>
To: nauman@google.com, dpshah@google.com, lizf@cn.fujitsu.com,
       mikew@google.com, fchecconi@gmail.com, paolo.valente@unimore.it,
       jens.axboe@oracle.com, ryov@valinux.co.jp, fernando@intellilink.co.jp,
       s-uchida@ap.jp.nec.com, taka@valinux.co.jp, guijianfeng@cn.fujitsu.com,
       arozansk@redhat.com, jmoyer@redhat.com, oz-kernel@redhat.com,
       dhaval@linux.vnet.ibm.com, balbir@linux.vnet.ibm.com,
       linux-kernel@vger.kernel.org, containers@lists.linux-foundation.org
Cc: vgoyal@redhat.com, akpm@linux-foundation.org, menage@google.com,
       peterz@infradead.org
Subject: [PATCH 02/10] Common flat fair queuing code in elevaotor layer
Date: Wed, 11 Mar 2009 21:56:47 -0400
Message-Id: <1236823015-4183-3-git-send-email-vgoyal@redhat.com>
In-Reply-To: <1236823015-4183-1-git-send-email-vgoyal@redhat.com>
References: <1236823015-4183-1-git-send-email-vgoyal@redhat.com>
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 77098
Lines: 2708

This is common fair queuing code in elevator layer. This is controlled by
config option CONFIG_ELV_FAIR_QUEUING. This patch initially only introduces
flat fair queuing support where there is only one group, "root group" and all
the tasks belong to root group.

This elevator layer changes are backward compatible. That means any ioscheduler
using old interfaces will continue to work.

Signed-off-by: Nauman Rafique <nauman@google.com>
Signed-off-by: Fabio Checconi <fabio@gandalf.sssup.it>
Signed-off-by: Paolo Valente <paolo.valente@unimore.it>
Signed-off-by: Aristeu Rozanski <aris@redhat.com>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
---
 block/Kconfig.iosched    |   13 +
 block/Makefile           |    1 +
 block/blk-sysfs.c        |   10 +
 block/elevator-fq.c      | 1882 ++++++++++++++++++++++++++++++++++++++++++++++
 block/elevator-fq.h      |  479 ++++++++++++
 block/elevator.c         |   46 +-
 include/linux/blkdev.h   |    5 +
 include/linux/elevator.h |   48 ++
 8 files changed, 2473 insertions(+), 11 deletions(-)
 create mode 100644 block/elevator-fq.c
 create mode 100644 block/elevator-fq.h

diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 7e803fc..3398134 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -2,6 +2,19 @@ if BLOCK
 
 menu "IO Schedulers"
 
+config ELV_FAIR_QUEUING
+	bool "Elevator Fair Queuing Support"
+	default n
+	---help---
+	  Traditionally only cfq had notion of multiple queues and it did
+	  fair queuing at its own. With the cgroups and need of controlling
+	  IO, now even the simple io schedulers like noop, deadline, as will
+	  have one queue per cgroup and will need hierarchical fair queuing.
+	  Instead of every io scheduler implementing its own fair queuing
+	  logic, this option enables fair queuing in elevator layer so that
+	  other ioschedulers can make use of it.
+	  If unsure, say N.
+
 config IOSCHED_NOOP
 	bool
 	default y
diff --git a/block/Makefile b/block/Makefile
index bfe7304..6f410d5 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
 obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
 obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
 obj-$(CONFIG_BLK_DEV_INTEGRITY)	+= blk-integrity.o
+obj-$(CONFIG_ELV_FAIR_QUEUING)	+= elevator-fq.o
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index e29ddfc..0d98c96 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -276,6 +276,13 @@ static struct queue_sysfs_entry queue_iostats_entry = {
 	.store = queue_iostats_store,
 };
 
+#ifdef CONFIG_ELV_FAIR_QUEUING
+static struct queue_sysfs_entry queue_slice_idle_entry = {
+	.attr = {.name = "slice_idle", .mode = S_IRUGO | S_IWUSR },
+	.show = elv_slice_idle_show,
+	.store = elv_slice_idle_store,
+};
+#endif
 static struct attribute *default_attrs[] = {
 	&queue_requests_entry.attr,
 	&queue_ra_entry.attr,
@@ -287,6 +294,9 @@ static struct attribute *default_attrs[] = {
 	&queue_nomerges_entry.attr,
 	&queue_rq_affinity_entry.attr,
 	&queue_iostats_entry.attr,
+#ifdef CONFIG_ELV_FAIR_QUEUING
+	&queue_slice_idle_entry.attr,
+#endif
 	NULL,
 };
 
diff --git a/block/elevator-fq.c b/block/elevator-fq.c
new file mode 100644
index 0000000..a8addd1
--- /dev/null
+++ b/block/elevator-fq.c
@@ -0,0 +1,1882 @@
+/*
+ * BFQ: Hierarchical B-WF2Q+ scheduler.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *		      Paolo Valente <paolo.valente@unimore.it>
+ */
+
+#include <linux/blkdev.h>
+#include "elevator-fq.h"
+#include <linux/blktrace_api.h>
+
+/* Values taken from cfq */
+const int elv_slice_sync = HZ / 10;
+int elv_slice_async = HZ / 25;
+const int elv_slice_async_rq = 2;
+int elv_slice_idle = HZ / 125;
+static struct kmem_cache *elv_ioq_pool;
+
+#define ELV_HW_QUEUE_MIN	(5)
+#define IO_SERVICE_TREE_INIT   ((struct io_service_tree)		\
+				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
+
+/* Mainly the BFQ scheduling code Follows */
+
+/*
+ * Shift for timestamp calculations.  This actually limits the maximum
+ * service allowed in one timestamp delta (small shift values increase it),
+ * the maximum total weight that can be used for the queues in the system
+ * (big shift values increase it), and the period of virtual time wraparounds.
+ */
+#define WFQ_SERVICE_SHIFT	22
+
+/**
+ * bfq_gt - compare two timestamps.
+ * @a: first ts.
+ * @b: second ts.
+ *
+ * Return @a > @b, dealing with wrapping correctly.
+ */
+static inline int bfq_gt(bfq_timestamp_t a, bfq_timestamp_t b)
+{
+	return (s64)(a - b) > 0;
+}
+
+/**
+ * bfq_delta - map service into the virtual time domain.
+ * @service: amount of service.
+ * @weight: scale factor.
+ */
+static inline bfq_timestamp_t bfq_delta(bfq_service_t service,
+					bfq_weight_t weight)
+{
+	bfq_timestamp_t d = (bfq_timestamp_t)service << WFQ_SERVICE_SHIFT;
+
+	do_div(d, weight);
+	return d;
+}
+
+/**
+ * bfq_calc_finish - assign the finish time to an entity.
+ * @entity: the entity to act upon.
+ * @service: the service to be charged to the entity.
+ */
+static inline void bfq_calc_finish(struct io_entity *entity,
+				   bfq_service_t service)
+{
+	BUG_ON(entity->weight == 0);
+
+	entity->finish = entity->start + bfq_delta(service, entity->weight);
+}
+
+static inline struct io_queue *io_entity_to_ioq(struct io_entity *entity)
+{
+	struct io_queue *ioq = NULL;
+
+	BUG_ON(entity == NULL);
+	if (entity->my_sched_data == NULL)
+		ioq = container_of(entity, struct io_queue, entity);
+	return ioq;
+}
+
+/**
+ * bfq_entity_of - get an entity from a node.
+ * @node: the node field of the entity.
+ *
+ * Convert a node pointer to the relative entity.  This is used only
+ * to simplify the logic of some functions and not as the generic
+ * conversion mechanism because, e.g., in the tree walking functions,
+ * the check for a %NULL value would be redundant.
+ */
+static inline struct io_entity *bfq_entity_of(struct rb_node *node)
+{
+	struct io_entity *entity = NULL;
+
+	if (node != NULL)
+		entity = rb_entry(node, struct io_entity, rb_node);
+
+	return entity;
+}
+
+/**
+ * bfq_extract - remove an entity from a tree.
+ * @root: the tree root.
+ * @entity: the entity to remove.
+ */
+static inline void bfq_extract(struct rb_root *root, struct io_entity *entity)
+{
+	BUG_ON(entity->tree != root);
+
+	entity->tree = NULL;
+	rb_erase(&entity->rb_node, root);
+}
+
+/**
+ * bfq_idle_extract - extract an entity from the idle tree.
+ * @st: the service tree of the owning @entity.
+ * @entity: the entity being removed.
+ */
+static void bfq_idle_extract(struct io_service_tree *st,
+				struct io_entity *entity)
+{
+	struct rb_node *next;
+	struct io_queue *ioq = io_entity_to_ioq(entity);
+
+	BUG_ON(entity->tree != &st->idle);
+
+	if (entity == st->first_idle) {
+		next = rb_next(&entity->rb_node);
+		st->first_idle = bfq_entity_of(next);
+	}
+
+	if (entity == st->last_idle) {
+		next = rb_prev(&entity->rb_node);
+		st->last_idle = bfq_entity_of(next);
+	}
+
+	bfq_extract(&st->idle, entity);
+
+	/* Delete queue from idle list */
+	if (ioq)
+		list_del(&ioq->queue_list);
+}
+
+/**
+ * bfq_insert - generic tree insertion.
+ * @root: tree root.
+ * @entity: entity to insert.
+ *
+ * This is used for the idle and the active tree, since they are both
+ * ordered by finish time.
+ */
+static void bfq_insert(struct rb_root *root, struct io_entity *entity)
+{
+	struct io_entity *entry;
+	struct rb_node **node = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	BUG_ON(entity->tree != NULL);
+
+	while (*node != NULL) {
+		parent = *node;
+		entry = rb_entry(parent, struct io_entity, rb_node);
+
+		if (bfq_gt(entry->finish, entity->finish))
+			node = &parent->rb_left;
+		else
+			node = &parent->rb_right;
+	}
+
+	rb_link_node(&entity->rb_node, parent, node);
+	rb_insert_color(&entity->rb_node, root);
+
+	entity->tree = root;
+}
+
+/**
+ * bfq_update_min - update the min_start field of a entity.
+ * @entity: the entity to update.
+ * @node: one of its children.
+ *
+ * This function is called when @entity may store an invalid value for
+ * min_start due to updates to the active tree.  The function  assumes
+ * that the subtree rooted at @node (which may be its left or its right
+ * child) has a valid min_start value.
+ */
+static inline void bfq_update_min(struct io_entity *entity,
+					struct rb_node *node)
+{
+	struct io_entity *child;
+
+	if (node != NULL) {
+		child = rb_entry(node, struct io_entity, rb_node);
+		if (bfq_gt(entity->min_start, child->min_start))
+			entity->min_start = child->min_start;
+	}
+}
+
+/**
+ * bfq_update_active_node - recalculate min_start.
+ * @node: the node to update.
+ *
+ * @node may have changed position or one of its children may have moved,
+ * this function updates its min_start value.  The left and right subtrees
+ * are assumed to hold a correct min_start value.
+ */
+static inline void bfq_update_active_node(struct rb_node *node)
+{
+	struct io_entity *entity = rb_entry(node, struct io_entity, rb_node);
+
+	entity->min_start = entity->start;
+	bfq_update_min(entity, node->rb_right);
+	bfq_update_min(entity, node->rb_left);
+}
+
+/**
+ * bfq_update_active_tree - update min_start for the whole active tree.
+ * @node: the starting node.
+ *
+ * @node must be the deepest modified node after an update.  This function
+ * updates its min_start using the values held by its children, assuming
+ * that they did not change, and then updates all the nodes that may have
+ * changed in the path to the root.  The only nodes that may have changed
+ * are the ones in the path or their siblings.
+ */
+static void bfq_update_active_tree(struct rb_node *node)
+{
+	struct rb_node *parent;
+
+up:
+	bfq_update_active_node(node);
+
+	parent = rb_parent(node);
+	if (parent == NULL)
+		return;
+
+	if (node == parent->rb_left && parent->rb_right != NULL)
+		bfq_update_active_node(parent->rb_right);
+	else if (parent->rb_left != NULL)
+		bfq_update_active_node(parent->rb_left);
+
+	node = parent;
+	goto up;
+}
+
+/**
+ * bfq_active_insert - insert an entity in the active tree of its group/device.
+ * @st: the service tree of the entity.
+ * @entity: the entity being inserted.
+ *
+ * The active tree is ordered by finish time, but an extra key is kept
+ * per each node, containing the minimum value for the start times of
+ * its children (and the node itself), so it's possible to search for
+ * the eligible node with the lowest finish time in logarithmic time.
+ */
+static void bfq_active_insert(struct io_service_tree *st,
+					struct io_entity *entity)
+{
+	struct rb_node *node = &entity->rb_node;
+
+	bfq_insert(&st->active, entity);
+
+	if (node->rb_left != NULL)
+		node = node->rb_left;
+	else if (node->rb_right != NULL)
+		node = node->rb_right;
+
+	bfq_update_active_tree(node);
+}
+
+/**
+ * bfq_ioprio_to_weight - calc a weight from an ioprio.
+ * @ioprio: the ioprio value to convert.
+ */
+static bfq_weight_t bfq_ioprio_to_weight(int ioprio)
+{
+	WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
+	return IOPRIO_BE_NR - ioprio;
+}
+
+void bfq_get_entity(struct io_entity *entity)
+{
+	struct io_queue *ioq = io_entity_to_ioq(entity);
+
+	if (ioq)
+		elv_get_ioq(ioq);
+}
+
+void bfq_init_entity(struct io_entity *entity, struct io_group *iog)
+{
+	entity->ioprio = entity->new_ioprio;
+	entity->ioprio_class = entity->new_ioprio_class;
+	entity->sched_data = &iog->sched_data;
+}
+
+/**
+ * bfq_find_deepest - find the deepest node that an extraction can modify.
+ * @node: the node being removed.
+ *
+ * Do the first step of an extraction in an rb tree, looking for the
+ * node that will replace @node, and returning the deepest node that
+ * the following modifications to the tree can touch.  If @node is the
+ * last node in the tree return %NULL.
+ */
+static struct rb_node *bfq_find_deepest(struct rb_node *node)
+{
+	struct rb_node *deepest;
+
+	if (node->rb_right == NULL && node->rb_left == NULL)
+		deepest = rb_parent(node);
+	else if (node->rb_right == NULL)
+		deepest = node->rb_left;
+	else if (node->rb_left == NULL)
+		deepest = node->rb_right;
+	else {
+		deepest = rb_next(node);
+		if (deepest->rb_right != NULL)
+			deepest = deepest->rb_right;
+		else if (rb_parent(deepest) != node)
+			deepest = rb_parent(deepest);
+	}
+
+	return deepest;
+}
+
+/**
+ * bfq_active_extract - remove an entity from the active tree.
+ * @st: the service_tree containing the tree.
+ * @entity: the entity being removed.
+ */
+static void bfq_active_extract(struct io_service_tree *st,
+				struct io_entity *entity)
+{
+	struct rb_node *node;
+
+	node = bfq_find_deepest(&entity->rb_node);
+	bfq_extract(&st->active, entity);
+
+	if (node != NULL)
+		bfq_update_active_tree(node);
+}
+
+/**
+ * bfq_idle_insert - insert an entity into the idle tree.
+ * @st: the service tree containing the tree.
+ * @entity: the entity to insert.
+ */
+static void bfq_idle_insert(struct io_service_tree *st,
+					struct io_entity *entity)
+{
+	struct io_entity *first_idle = st->first_idle;
+	struct io_entity *last_idle = st->last_idle;
+	struct io_queue *ioq = io_entity_to_ioq(entity);
+
+	if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
+		st->first_idle = entity;
+	if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
+		st->last_idle = entity;
+
+	bfq_insert(&st->idle, entity);
+
+	/* Add this queue to idle list */
+	if (ioq)
+		list_add(&ioq->queue_list, &ioq->efqd->idle_list);
+}
+
+/**
+ * bfq_forget_entity - remove an entity from the wfq trees.
+ * @st: the service tree.
+ * @entity: the entity being removed.
+ *
+ * Update the device status and forget everything about @entity, putting
+ * the device reference to it, if it is a queue.  Entities belonging to
+ * groups are not refcounted.
+ */
+static void bfq_forget_entity(struct io_service_tree *st,
+				struct io_entity *entity)
+{
+	struct io_queue *ioq = NULL;
+
+	BUG_ON(!entity->on_st);
+	entity->on_st = 0;
+	st->wsum -= entity->weight;
+	ioq = io_entity_to_ioq(entity);
+	if (!ioq)
+		return;
+	elv_put_ioq(ioq);
+}
+
+/**
+ * bfq_put_idle_entity - release the idle tree ref of an entity.
+ * @st: service tree for the entity.
+ * @entity: the entity being released.
+ */
+void bfq_put_idle_entity(struct io_service_tree *st,
+				struct io_entity *entity)
+{
+	bfq_idle_extract(st, entity);
+	bfq_forget_entity(st, entity);
+}
+
+/**
+ * bfq_forget_idle - update the idle tree if necessary.
+ * @st: the service tree to act upon.
+ *
+ * To preserve the global O(log N) complexity we only remove one entry here;
+ * as the idle tree will not grow indefinitely this can be done safely.
+ */
+void bfq_forget_idle(struct io_service_tree *st)
+{
+	struct io_entity *first_idle = st->first_idle;
+	struct io_entity *last_idle = st->last_idle;
+
+	if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
+	    !bfq_gt(last_idle->finish, st->vtime)) {
+		/*
+		 * Active tree is empty. Pull back vtime to finish time of
+		 * last idle entity on idle tree.
+		 * Rational seems to be that it reduces the possibility of
+		 * vtime wraparound (bfq_gt(V-F) < 0).
+		 */
+		st->vtime = last_idle->finish;
+	}
+
+	if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
+		bfq_put_idle_entity(st, first_idle);
+}
+
+
+static struct io_service_tree *
+__bfq_entity_update_prio(struct io_service_tree *old_st,
+				struct io_entity *entity)
+{
+	struct io_service_tree *new_st = old_st;
+
+	if (entity->ioprio_changed) {
+		entity->ioprio = entity->new_ioprio;
+		entity->ioprio_class = entity->new_ioprio_class;
+		entity->ioprio_changed = 0;
+
+		old_st->wsum -= entity->weight;
+		entity->weight = bfq_ioprio_to_weight(entity->ioprio);
+
+		/*
+		 * NOTE: here we may be changing the weight too early,
+		 * this will cause unfairness.  The correct approach
+		 * would have required additional complexity to defer
+		 * weight changes to the proper time instants (i.e.,
+		 * when entity->finish <= old_st->vtime).
+		 */
+		new_st = io_entity_service_tree(entity);
+		new_st->wsum += entity->weight;
+
+		if (new_st != old_st)
+			entity->start = new_st->vtime;
+	}
+
+	return new_st;
+}
+
+/**
+ * __bfq_activate_entity - activate an entity.
+ * @entity: the entity being activated.
+ *
+ * Called whenever an entity is activated, i.e., it is not active and one
+ * of its children receives a new request, or has to be reactivated due to
+ * budget exhaustion.  It uses the current budget of the entity (and the
+ * service received if @entity is active) of the queue to calculate its
+ * timestamps.
+ */
+static void __bfq_activate_entity(struct io_entity *entity)
+{
+	struct io_sched_data *sd = entity->sched_data;
+	struct io_service_tree *st = io_entity_service_tree(entity);
+
+	if (entity == sd->active_entity) {
+		BUG_ON(entity->tree != NULL);
+		/*
+		 * If we are requeueing the current entity we have
+		 * to take care of not charging to it service it has
+		 * not received.
+		 */
+		bfq_calc_finish(entity, entity->service);
+		entity->start = entity->finish;
+		sd->active_entity = NULL;
+	} else if (entity->tree == &st->active) {
+		/*
+		 * Requeueing an entity due to a change of some
+		 * next_active entity below it.  We reuse the old
+		 * start time.
+		 */
+		bfq_active_extract(st, entity);
+	} else if (entity->tree == &st->idle) {
+		/*
+		 * Must be on the idle tree, bfq_idle_extract() will
+		 * check for that.
+		 */
+		bfq_idle_extract(st, entity);
+		entity->start = bfq_gt(st->vtime, entity->finish) ?
+				       st->vtime : entity->finish;
+	} else {
+		/*
+		 * The finish time of the entity may be invalid, and
+		 * it is in the past for sure, otherwise the queue
+		 * would have been on the idle tree.
+		 */
+		entity->start = st->vtime;
+		st->wsum += entity->weight;
+		bfq_get_entity(entity);
+
+		BUG_ON(entity->on_st);
+		entity->on_st = 1;
+	}
+
+	st = __bfq_entity_update_prio(st, entity);
+	bfq_calc_finish(entity, entity->budget);
+	bfq_active_insert(st, entity);
+}
+
+/**
+ * bfq_activate_entity - activate an entity.
+ * @entity: the entity to activate.
+ */
+void bfq_activate_entity(struct io_entity *entity)
+{
+	__bfq_activate_entity(entity);
+}
+
+/**
+ * __bfq_deactivate_entity - deactivate an entity from its service tree.
+ * @entity: the entity to deactivate.
+ * @requeue: if false, the entity will not be put into the idle tree.
+ *
+ * Deactivate an entity, independently from its previous state.  If the
+ * entity was not on a service tree just return, otherwise if it is on
+ * any scheduler tree, extract it from that tree, and if necessary
+ * and if the caller did not specify @requeue, put it on the idle tree.
+ *
+ */
+int __bfq_deactivate_entity(struct io_entity *entity, int requeue)
+{
+	struct io_sched_data *sd = entity->sched_data;
+	struct io_service_tree *st = io_entity_service_tree(entity);
+	int was_active = entity == sd->active_entity;
+	int ret = 0;
+
+	if (!entity->on_st)
+		return 0;
+
+	BUG_ON(was_active && entity->tree != NULL);
+
+	if (was_active) {
+		bfq_calc_finish(entity, entity->service);
+		sd->active_entity = NULL;
+	} else if (entity->tree == &st->active)
+		bfq_active_extract(st, entity);
+	else if (entity->tree == &st->idle)
+		bfq_idle_extract(st, entity);
+	else if (entity->tree != NULL)
+		BUG();
+
+	if (!requeue || !bfq_gt(entity->finish, st->vtime))
+		bfq_forget_entity(st, entity);
+	else
+		bfq_idle_insert(st, entity);
+
+	BUG_ON(sd->active_entity == entity);
+
+	return ret;
+}
+
+/**
+ * bfq_deactivate_entity - deactivate an entity.
+ * @entity: the entity to deactivate.
+ * @requeue: true if the entity can be put on the idle tree
+ */
+void bfq_deactivate_entity(struct io_entity *entity, int requeue)
+{
+	__bfq_deactivate_entity(entity, requeue);
+}
+
+/**
+ * bfq_update_vtime - update vtime if necessary.
+ * @st: the service tree to act upon.
+ *
+ * If necessary update the service tree vtime to have at least one
+ * eligible entity, skipping to its start time.  Assumes that the
+ * active tree of the device is not empty.
+ *
+ * NOTE: this hierarchical implementation updates vtimes quite often,
+ * we may end up with reactivated tasks getting timestamps after a
+ * vtime skip done because we needed a ->first_active entity on some
+ * intermediate node.
+ */
+static void bfq_update_vtime(struct io_service_tree *st)
+{
+	struct io_entity *entry;
+	struct rb_node *node = st->active.rb_node;
+
+	entry = rb_entry(node, struct io_entity, rb_node);
+	if (bfq_gt(entry->min_start, st->vtime)) {
+		st->vtime = entry->min_start;
+		bfq_forget_idle(st);
+	}
+}
+
+/**
+ * bfq_first_active - find the eligible entity with the smallest finish time
+ * @st: the service tree to select from.
+ *
+ * This function searches the first schedulable entity, starting from the
+ * root of the tree and going on the left every time on this side there is
+ * a subtree with at least one eligible (start <= vtime) entity.  The path
+ * on the right is followed only if a) the left subtree contains no eligible
+ * entities and b) no eligible entity has been found yet.
+ */
+static struct io_entity *bfq_first_active_entity(struct io_service_tree *st)
+{
+	struct io_entity *entry, *first = NULL;
+	struct rb_node *node = st->active.rb_node;
+
+	while (node != NULL) {
+		entry = rb_entry(node, struct io_entity, rb_node);
+left:
+		if (!bfq_gt(entry->start, st->vtime))
+			first = entry;
+
+		BUG_ON(bfq_gt(entry->min_start, st->vtime));
+
+		if (node->rb_left != NULL) {
+			entry = rb_entry(node->rb_left,
+					 struct io_entity, rb_node);
+			if (!bfq_gt(entry->min_start, st->vtime)) {
+				node = node->rb_left;
+				goto left;
+			}
+		}
+		if (first != NULL)
+			break;
+		node = node->rb_right;
+	}
+
+	BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
+	return first;
+}
+
+/**
+ * __bfq_lookup_next_entity - return the first eligible entity in @st.
+ * @st: the service tree.
+ *
+ * Update the virtual time in @st and return the first eligible entity
+ * it contains.
+ */
+static struct io_entity *__bfq_lookup_next_entity(struct io_service_tree *st)
+{
+	struct io_entity *entity;
+
+	if (RB_EMPTY_ROOT(&st->active))
+		return NULL;
+
+	bfq_update_vtime(st);
+	entity = bfq_first_active_entity(st);
+	BUG_ON(bfq_gt(entity->start, st->vtime));
+
+	return entity;
+}
+
+/**
+ * bfq_lookup_next_entity - return the first eligible entity in @sd.
+ * @sd: the sched_data.
+ * @extract: if true the returned entity will be also extracted from @sd.
+ *
+ * NOTE: since we cache the next_active entity at each level of the
+ * hierarchy, the complexity of the lookup can be decreased with
+ * absolutely no effort just returning the cached next_active value;
+ * we prefer to do full lookups to test the consistency of * the data
+ * structures.
+ */
+struct io_entity *bfq_lookup_next_entity(struct io_sched_data *sd,
+						 int extract)
+{
+	struct io_service_tree *st = sd->service_tree;
+	struct io_entity *entity;
+	int i;
+
+	/*
+	 * One can check for which will be next selected entity without
+	 * expiring the current one.
+	 */
+	BUG_ON(extract && sd->active_entity != NULL);
+
+	for (i = 0; i < IO_IOPRIO_CLASSES; i++, st++) {
+		entity = __bfq_lookup_next_entity(st);
+		if (entity != NULL) {
+			if (extract) {
+				bfq_active_extract(st, entity);
+				sd->active_entity = entity;
+			}
+			break;
+		}
+	}
+
+	return entity;
+}
+
+void entity_served(struct io_entity *entity, bfq_service_t served)
+{
+	struct io_service_tree *st;
+
+	st = io_entity_service_tree(entity);
+	entity->service += served;
+	WARN_ON_ONCE(entity->service > entity->budget);
+	BUG_ON(st->wsum == 0);
+	st->vtime += bfq_delta(served, st->wsum);
+	bfq_forget_idle(st);
+}
+
+/* Elevator fair queuing function */
+struct io_queue *rq_ioq(struct request *rq)
+{
+	return rq->ioq;
+}
+
+static inline struct io_queue *elv_active_ioq(struct elevator_queue *e)
+{
+	return e->efqd.active_queue;
+}
+
+void *elv_active_sched_queue(struct elevator_queue *e)
+{
+	return ioq_sched_queue(elv_active_ioq(e));
+}
+EXPORT_SYMBOL(elv_active_sched_queue);
+
+int elv_nr_busy_ioq(struct elevator_queue *e)
+{
+	return e->efqd.busy_queues;
+}
+EXPORT_SYMBOL(elv_nr_busy_ioq);
+
+int elv_nr_busy_rt_ioq(struct elevator_queue *e)
+{
+	return e->efqd.busy_rt_queues;
+}
+EXPORT_SYMBOL(elv_nr_busy_rt_ioq);
+
+int elv_hw_tag(struct elevator_queue *e)
+{
+	return e->efqd.hw_tag;
+}
+EXPORT_SYMBOL(elv_hw_tag);
+
+/* Helper functions for operating on elevator idle slice timer */
+int elv_mod_idle_slice_timer(struct elevator_queue *eq, unsigned long expires)
+{
+	struct elv_fq_data *efqd = &eq->efqd;
+
+	return mod_timer(&efqd->idle_slice_timer, expires);
+}
+EXPORT_SYMBOL(elv_mod_idle_slice_timer);
+
+int elv_del_idle_slice_timer(struct elevator_queue *eq)
+{
+	struct elv_fq_data *efqd = &eq->efqd;
+
+	return del_timer(&efqd->idle_slice_timer);
+}
+EXPORT_SYMBOL(elv_del_idle_slice_timer);
+
+unsigned int elv_get_slice_idle(struct elevator_queue *eq)
+{
+	return eq->efqd.elv_slice_idle;
+}
+EXPORT_SYMBOL(elv_get_slice_idle);
+
+void elv_ioq_served(struct io_queue *ioq, bfq_service_t served)
+{
+	struct elv_fq_data *efqd = ioq->efqd;
+
+	entity_served(&ioq->entity, served);
+
+	ioq->total_service += served;
+	elv_log_ioq(efqd, ioq, "ioq served=0x%lx total service=0x%lx", served,
+			ioq->total_service);
+}
+
+/* Functions to show and store elv_idle_slice value through sysfs */
+ssize_t elv_slice_idle_show(struct request_queue *q, char *name)
+{
+	struct elv_fq_data *efqd;
+	unsigned int data;
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	efqd = &q->elevator->efqd;
+	data = jiffies_to_msecs(efqd->elv_slice_idle);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+	return sprintf(name, "%d\n", data);
+}
+
+ssize_t elv_slice_idle_store(struct request_queue *q, const char *name,
+			  size_t count)
+{
+	struct elv_fq_data *efqd;
+	unsigned int data;
+	unsigned long flags;
+
+	char *p = (char *)name;
+
+	data = simple_strtoul(p, &p, 10);
+
+	if (data < 0)
+		data = 0;
+	else if (data > INT_MAX)
+		data = INT_MAX;
+
+	data = msecs_to_jiffies(data);
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	efqd = &q->elevator->efqd;
+	efqd->elv_slice_idle = data;
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	return count;
+}
+
+void elv_schedule_dispatch(struct request_queue *q)
+{
+	struct elv_fq_data *efqd = &q->elevator->efqd;
+
+	if (elv_nr_busy_ioq(q->elevator)) {
+		elv_log(efqd, "schedule dispatch");
+		kblockd_schedule_work(efqd->queue, &efqd->unplug_work);
+	}
+}
+EXPORT_SYMBOL(elv_schedule_dispatch);
+
+void elv_kick_queue(struct work_struct *work)
+{
+	struct elv_fq_data *efqd =
+		container_of(work, struct elv_fq_data, unplug_work);
+	struct request_queue *q = efqd->queue;
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	blk_start_queueing(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+void elv_shutdown_timer_wq(struct elevator_queue *e)
+{
+	del_timer_sync(&e->efqd.idle_slice_timer);
+	cancel_work_sync(&e->efqd.unplug_work);
+}
+EXPORT_SYMBOL(elv_shutdown_timer_wq);
+
+void elv_ioq_set_prio_slice(struct request_queue *q, struct io_queue *ioq)
+{
+	struct elv_fq_data *efqd = &q->elevator->efqd;
+
+	ioq->slice_end = jiffies + ioq->entity.budget;
+	elv_log_ioq(efqd, ioq, "set_slice=%lu", ioq->entity.budget);
+}
+
+void elv_ioq_init_prio_data(struct io_queue *ioq, int ioprio_class, int ioprio)
+{
+	struct io_entity *entity = &ioq->entity;
+
+	entity->new_ioprio_class = ioprio_class;
+	entity->new_ioprio = ioprio;
+	entity->ioprio_changed = 1;
+	return;
+}
+
+static void elv_ioq_update_io_thinktime(struct io_queue *ioq)
+{
+	struct elv_fq_data *efqd = ioq->efqd;
+	unsigned long elapsed = jiffies - ioq->last_end_request;
+	unsigned long ttime = min(elapsed, 2UL * efqd->elv_slice_idle);
+
+	ioq->ttime_samples = (7*ioq->ttime_samples + 256) / 8;
+	ioq->ttime_total = (7*ioq->ttime_total + 256*ttime) / 8;
+	ioq->ttime_mean = (ioq->ttime_total + 128) / ioq->ttime_samples;
+}
+
+/*
+ * Disable idle window if the process thinks too long.
+ * This idle flag can also be updated by io scheduler.
+ */
+static void elv_ioq_update_idle_window(struct elevator_queue *eq,
+				struct io_queue *ioq, struct request *rq)
+{
+	int old_idle, enable_idle;
+	struct elv_fq_data *efqd = ioq->efqd;
+
+	/*
+	 * Don't idle for async or idle io prio class
+	 */
+	if (!elv_ioq_sync(ioq) || elv_ioq_class_idle(ioq))
+		return;
+
+	enable_idle = old_idle = elv_ioq_idle_window(ioq);
+
+	if (!efqd->elv_slice_idle)
+		enable_idle = 0;
+	else if (ioq_sample_valid(ioq->ttime_samples)) {
+		if (ioq->ttime_mean > efqd->elv_slice_idle)
+			enable_idle = 0;
+		else
+			enable_idle = 1;
+	}
+
+	/*
+	 * From think time perspective idle should be enabled. Check with
+	 * io scheduler if it wants to disable idling based on additional
+	 * considrations like seek pattern.
+	 */
+	if (enable_idle) {
+		if (eq->ops->elevator_update_idle_window_fn)
+			enable_idle = eq->ops->elevator_update_idle_window_fn(
+						eq, ioq->sched_queue, rq);
+		if (!enable_idle)
+			elv_log_ioq(efqd, ioq, "iosched disabled idle");
+	}
+
+	if (old_idle != enable_idle) {
+		elv_log_ioq(efqd, ioq, "idle=%d", enable_idle);
+		if (enable_idle)
+			elv_mark_ioq_idle_window(ioq);
+		else
+			elv_clear_ioq_idle_window(ioq);
+	}
+}
+
+struct io_queue *elv_alloc_ioq(struct request_queue *q, gfp_t gfp_mask)
+{
+	struct io_queue *ioq = NULL;
+
+	ioq = kmem_cache_alloc_node(elv_ioq_pool, gfp_mask, q->node);
+	return ioq;
+}
+EXPORT_SYMBOL(elv_alloc_ioq);
+
+void elv_free_ioq(struct io_queue *ioq)
+{
+	kmem_cache_free(elv_ioq_pool, ioq);
+}
+EXPORT_SYMBOL(elv_free_ioq);
+
+int elv_init_ioq(struct elevator_queue *eq, struct io_queue *ioq,
+			void *sched_queue, int ioprio_class, int ioprio,
+			int is_sync)
+{
+	struct elv_fq_data *efqd = &eq->efqd;
+	struct io_group *iog = io_lookup_io_group_current(efqd->queue);
+
+	RB_CLEAR_NODE(&ioq->entity.rb_node);
+	atomic_set(&ioq->ref, 0);
+	ioq->efqd = efqd;
+	ioq->entity.budget = efqd->elv_slice[is_sync];
+	elv_ioq_set_ioprio_class(ioq, ioprio_class);
+	elv_ioq_set_ioprio(ioq, ioprio);
+	ioq->pid = current->pid;
+	ioq->sched_queue = sched_queue;
+	elv_mark_ioq_idle_window(ioq);
+	bfq_init_entity(&ioq->entity, iog);
+	return 0;
+}
+EXPORT_SYMBOL(elv_init_ioq);
+
+void elv_put_ioq(struct io_queue *ioq)
+{
+	struct elv_fq_data *efqd = ioq->efqd;
+	struct elevator_queue *e = container_of(efqd, struct elevator_queue,
+						efqd);
+
+	BUG_ON(atomic_read(&ioq->ref) <= 0);
+	if (!atomic_dec_and_test(&ioq->ref))
+		return;
+	BUG_ON(ioq->nr_queued);
+	BUG_ON(ioq->entity.tree != NULL);
+	BUG_ON(elv_ioq_busy(ioq));
+	BUG_ON(efqd->active_queue == ioq);
+
+	/* Can be called by outgoing elevator. Don't use q */
+	BUG_ON(!e->ops->elevator_free_sched_queue_fn);
+
+	e->ops->elevator_free_sched_queue_fn(e, ioq->sched_queue);
+	elv_log_ioq(efqd, ioq, "freed");
+	elv_free_ioq(ioq);
+}
+EXPORT_SYMBOL(elv_put_ioq);
+
+void elv_release_ioq(struct elevator_queue *e, struct io_queue **ioq_ptr)
+{
+	struct io_queue *ioq = *ioq_ptr;
+
+	if (ioq != NULL) {
+		/* Drop the reference taken by the io group */
+		elv_put_ioq(ioq);
+		*ioq_ptr = NULL;
+	}
+}
+
+/* Get next queue for service. */
+struct io_queue *elv_get_next_ioq(struct request_queue *q, int extract)
+{
+	struct elv_fq_data *efqd = &q->elevator->efqd;
+	struct io_entity *entity = NULL;
+	struct io_queue *ioq = NULL;
+	struct io_sched_data *sd;
+
+	/*
+	 * one can check for which queue will be selected next while having
+	 * one queue active. preempt logic uses it.
+	 */
+	BUG_ON(extract && efqd->active_queue != NULL);
+
+	if (!efqd->busy_queues)
+		return NULL;
+
+	sd = &efqd->root_group->sched_data;
+	if (extract)
+		entity = bfq_lookup_next_entity(sd, 1);
+	else
+		entity = bfq_lookup_next_entity(sd, 0);
+
+	BUG_ON(!entity);
+	if (extract)
+		entity->service = 0;
+	ioq = io_entity_to_ioq(entity);
+
+	return ioq;
+}
+
+static void __elv_set_active_ioq(struct elv_fq_data *efqd,
+					struct io_queue *ioq)
+{
+	struct request_queue *q = efqd->queue;
+
+	if (ioq) {
+		elv_log_ioq(efqd, ioq, "set_active, busy_queues=%d",
+							efqd->busy_queues);
+		ioq->slice_end = 0;
+		elv_mark_ioq_slice_new(ioq);
+	}
+
+	efqd->active_queue = ioq;
+
+	/* Let iosched know if it wants to take some action */
+	if (ioq) {
+		if (q->elevator->ops->elevator_active_ioq_set_fn)
+			q->elevator->ops->elevator_active_ioq_set_fn(q,
+							ioq->sched_queue);
+	}
+}
+
+/* Get and set a new active queue for service. */
+struct io_queue *elv_set_active_ioq(struct request_queue *q)
+{
+	struct io_queue *ioq;
+	struct elv_fq_data *efqd = &q->elevator->efqd;
+
+	ioq = elv_get_next_ioq(q, 1);
+	__elv_set_active_ioq(efqd, ioq);
+	return ioq;
+}
+
+void elv_reset_active_ioq(struct elv_fq_data *efqd)
+{
+	struct request_queue *q = efqd->queue;
+
+	if (q->elevator->ops->elevator_active_ioq_reset_fn)
+		q->elevator->ops->elevator_active_ioq_reset_fn(q);
+	efqd->active_queue = NULL;
+	del_timer(&efqd->idle_slice_timer);
+}
+
+void elv_activate_ioq(struct io_queue *ioq)
+{
+	bfq_activate_entity(&ioq->entity);
+}
+
+void elv_deactivate_ioq(struct elv_fq_data *efqd, struct io_queue *ioq,
+					int requeue)
+{
+	if (ioq == efqd->active_queue)
+		elv_reset_active_ioq(efqd);
+
+	bfq_deactivate_entity(&ioq->entity, requeue);
+}
+
+/* Called when an inactive queue receives a new request. */
+void elv_add_ioq_busy(struct elv_fq_data *efqd, struct io_queue *ioq)
+{
+	BUG_ON(elv_ioq_busy(ioq));
+	BUG_ON(ioq == efqd->active_queue);
+	elv_log_ioq(efqd, ioq, "add to busy");
+	elv_activate_ioq(ioq);
+	elv_mark_ioq_busy(ioq);
+	efqd->busy_queues++;
+	if (elv_ioq_class_rt(ioq))
+		efqd->busy_rt_queues++;
+}
+
+void elv_del_ioq_busy(struct elevator_queue *e, struct io_queue *ioq,
+					int requeue)
+{
+	struct elv_fq_data *efqd = &e->efqd;
+
+	BUG_ON(!elv_ioq_busy(ioq));
+	BUG_ON(ioq->nr_queued);
+	elv_log_ioq(efqd, ioq, "del from busy");
+	elv_clear_ioq_busy(ioq);
+	BUG_ON(efqd->busy_queues == 0);
+	efqd->busy_queues--;
+	if (elv_ioq_class_rt(ioq))
+		efqd->busy_rt_queues--;
+
+	elv_deactivate_ioq(efqd, ioq, requeue);
+}
+
+/*
+ * Do the accounting. Determine how much service (in terms of time slices)
+ * current queue used and adjust the start, finish time of queue and vtime
+ * of the tree accordingly.
+ *
+ * Determining the service used in terms of time is tricky in certain
+ * situations. Especially when underlying device supports command queuing
+ * and requests from multiple queues can be there at same time, then it
+ * is not clear which queue consumed how much of disk time.
+ *
+ * To mitigate this problem, cfq starts the time slice of the queue only
+ * after first request from the queue has completed. This does not work
+ * very well if we expire the queue before we wait for first and more
+ * request to finish from the queue. For seeky queues, we will expire the
+ * queue after dispatching few requests without waiting and start dispatching
+ * from next queue.
+ *
+ * Not sure how to determine the time consumed by queue in such scenarios.
+ * Currently as a crude approximation, we are charging 25% of time slice
+ * for such cases. A better mechanism is needed for accurate accounting. 
+ */
+void __elv_ioq_slice_expired(struct request_queue *q, struct io_queue *ioq,
+				int budget_update)
+{
+	struct elevator_queue *e = q->elevator;
+	struct elv_fq_data *efqd = &e->efqd;
+	struct io_entity *entity = &ioq->entity;
+	unsigned long slice, step, min_slice;
+	long slice_unused, slice_used;
+
+	assert_spin_locked(q->queue_lock);
+	elv_log_ioq(efqd, ioq, "slice expired t=%d", budget_update);
+
+	if (elv_ioq_wait_request(ioq))
+		del_timer(&efqd->idle_slice_timer);
+
+	elv_clear_ioq_wait_request(ioq);
+
+	/*
+	 * if ioq->slice_end = 0, that means a queue was expired before first
+	 * reuqest from the queue got completed. Of course we are not planning
+	 * to idle on the queue otherwise we would not have expired it.
+	 *
+	 * Charge for the 25% slice in such cases. This is not the best thing
+	 * to do but at the same time not very sure what's the next best
+	 * thing to do.
+	 *
+	 * This arises from that fact that we don't have the notion of
+	 * one queue being operational at one time. io scheduler can dispatch
+	 * requests from multiple queues in one dispatch round. Ideally for
+	 * more accurate accounting of exact disk time used by disk, one
+	 * should dispatch requests from only one queue and wait for all
+	 * the requests to finish. But this will reduce throughput.
+	 */
+	if (!ioq->slice_end)
+		slice_unused = 3*entity->budget/4;
+	else {
+		slice_unused = ioq->slice_end - jiffies;
+		/*
+		 * queue consumed more slice than it was allocated for.
+		 * Currently we are still charging it only for allocated
+		 * slice and not for the consumed slice as it can increase
+		 * the latency of when this queue is scheduled next.
+		 *
+		 * Maybe we can live with little bit unfairness. This is
+		 * still an open problem regarding how to handle it
+		 * correctly.
+		 */
+		if (slice_unused < 0)
+			slice_unused = 0;
+
+		if (slice_unused == entity->budget) {
+			/*
+			 * queue got expired immediately after completing
+			 * first request. Charge 25% of slice.
+			 */
+			slice_unused = (3*entity->budget)/4;
+		}
+
+	}
+
+	slice_used = entity->budget - slice_unused;
+	elv_ioq_served(ioq, slice_used);
+
+	if (budget_update && !elv_ioq_slice_new(ioq)) {
+		slice = efqd->elv_slice[elv_ioq_sync(ioq)];
+		step = slice / 16;
+		min_slice = slice - slice / 4;
+
+		/*
+		 * Try to adapt the slice length to the behavior of the
+		 * queue.  If it has not exhausted the assigned budget
+		 * assign it a shorter new one, otherwise assign it a
+		 * longer new one.  The increments/decrements are done
+		 * linearly with a step of cfqd->cfq_slice / 16, and
+		 * slices of less than 11 / 16 * cfqd->cfq_slice are
+		 * never assigned, to avoid performance degradation.
+		 */
+		if (slice_unused != 0 && entity->budget >= min_slice)
+			entity->budget -= step;
+		else if (slice_unused == 0 && entity->budget <= slice - step)
+			entity->budget += step;
+
+		elv_log_ioq(efqd, ioq, "slice_unused=%ld, budget=%ld",
+					slice_unused, entity->budget);
+	}
+
+	BUG_ON(ioq != efqd->active_queue);
+	elv_reset_active_ioq(efqd);
+
+	if (!ioq->nr_queued)
+		elv_del_ioq_busy(e, ioq, 1);
+	else
+		elv_activate_ioq(ioq);
+}
+EXPORT_SYMBOL(__elv_ioq_slice_expired);
+
+/*
+ * budget_update signifies whether if budget increment/decrement accounting
+ * should be done or not on this queue on this expiry.
+ * In some circumstances like preemption, forced dispatch etc, it might
+ * not make much sense to adjust budgets.
+ */
+void elv_ioq_slice_expired(struct request_queue *q, int budget_update)
+{
+	struct io_queue *ioq = elv_active_ioq(q->elevator);
+
+	if (ioq)
+		__elv_ioq_slice_expired(q, ioq, budget_update);
+}
+
+/*
+ * Check if new_cfqq should preempt the currently active queue. Return 0 for
+ * no or if we aren't sure, a 1 will cause a preemption attempt.
+ */
+int elv_should_preempt(struct request_queue *q, struct io_queue *new_ioq,
+			struct request *rq)
+{
+	struct io_queue *ioq;
+	struct elevator_queue *eq = q->elevator;
+
+	ioq = elv_active_ioq(eq);
+
+	if (!ioq)
+		return 0;
+
+	if (elv_ioq_slice_used(ioq))
+		return 1;
+
+	if (elv_ioq_class_idle(new_ioq))
+		return 0;
+
+	if (elv_ioq_class_idle(ioq))
+		return 1;
+
+	/*
+	 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
+	 */
+	if (elv_ioq_class_rt(new_ioq) && !elv_ioq_class_rt(ioq))
+		return 1;
+
+	/*
+	 * Check with io scheduler if it has additional criterion based on
+	 * which it wants to preempt existing queue.
+	 */
+	if (eq->ops->elevator_should_preempt_fn)
+		return eq->ops->elevator_should_preempt_fn(q, new_ioq, rq);
+
+	return 0;
+}
+
+int elv_preempt_queue(struct request_queue *q, struct io_queue *ioq)
+{
+	struct io_queue *new_ioq;
+
+	elv_log_ioq(&q->elevator->efqd, ioq, "preemption attempt");
+
+	new_ioq = elv_get_next_ioq(q, 0);
+	if (new_ioq == ioq) {
+		/*
+		 * We might need expire_ioq logic here to check with io
+		 * scheduler if queue can be preempted. This might not
+		 * be need for cfq but AS might need it.
+		 */
+		elv_ioq_slice_expired(q, 0);
+		elv_ioq_set_slice_end(ioq, 0);
+		elv_mark_ioq_slice_new(ioq);
+		return 1;
+	}
+
+	return 0;
+}
+
+void elv_ioq_request_add(struct request_queue *q, struct request *rq)
+{
+	struct elv_fq_data *efqd = &q->elevator->efqd;
+	struct io_queue *ioq = rq->ioq;
+
+	if (!elv_iosched_fair_queuing_enabled(q->elevator))
+		return;
+
+	BUG_ON(!efqd);
+	BUG_ON(!ioq);
+	efqd->rq_queued++;
+	ioq->nr_queued++;
+
+	if (!elv_ioq_busy(ioq))
+		elv_add_ioq_busy(efqd, ioq);
+
+	elv_ioq_update_io_thinktime(ioq);
+	elv_ioq_update_idle_window(q->elevator, ioq, rq);
+
+	if (ioq == elv_active_ioq(q->elevator)) {
+		/*
+		 * if we are waiting for a request for this queue, let it rip
+		 * immediately and flag that we must not expire this queue
+		 * just now
+		 */
+		if (elv_ioq_wait_request(ioq)) {
+			del_timer(&efqd->idle_slice_timer);
+			blk_start_queueing(q);
+		}
+	} else if (elv_should_preempt(q, ioq, rq)) {
+		/*
+		 * not the active queue - expire current slice if it is
+		 * idle and has expired it's mean thinktime or this new queue
+		 * has some old slice time left and is of higher priority or
+		 * this new queue is RT and the current one is BE
+		 */
+		/*
+		 * try to preempt the active queue; we still respect the
+		 * scheduler decision, so we try to reschedule, but if cfqq
+		 * has received more service than allocated, the scheduler
+		 * will refuse the preemption.
+		 */
+		if (elv_preempt_queue(q, ioq))
+			blk_start_queueing(q);
+	}
+}
+
+void elv_idle_slice_timer(unsigned long data)
+{
+	struct elv_fq_data *efqd = (struct elv_fq_data *)data;
+	struct io_queue *ioq;
+	unsigned long flags;
+	struct request_queue *q = efqd->queue;
+
+	elv_log(efqd, "idle timer fired");
+
+	spin_lock_irqsave(q->queue_lock, flags);
+
+	ioq = efqd->active_queue;
+
+	if (ioq) {
+
+		/*
+		 * expired
+		 */
+		if (elv_ioq_slice_used(ioq))
+			goto expire;
+
+		/*
+		 * only expire and reinvoke request handler, if there are
+		 * other queues with pending requests
+		 */
+		if (!elv_nr_busy_ioq(q->elevator))
+			goto out_cont;
+
+		/*
+		 * not expired and it has a request pending, let it dispatch
+		 */
+		if (ioq->nr_queued)
+			goto out_kick;
+	}
+expire:
+	elv_ioq_slice_expired(q, 1);
+out_kick:
+	elv_schedule_dispatch(q);
+out_cont:
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+void elv_ioq_arm_slice_timer(struct request_queue *q)
+{
+	struct elv_fq_data *efqd = &q->elevator->efqd;
+	struct io_queue *ioq = elv_active_ioq(q->elevator);
+	unsigned long sl;
+
+	BUG_ON(!ioq);
+
+	/*
+	 * SSD device without seek penalty, disable idling. But only do so
+	 * for devices that support queuing, otherwise we still have a problem
+	 * with sync vs async workloads.
+	 */
+	if (blk_queue_nonrot(q) && efqd->hw_tag)
+		return;
+
+	/*
+	 * still requests with the driver, don't idle
+	 */
+	if (efqd->rq_in_driver)
+		return;
+
+	/*
+	 * idle is disabled, either manually or by past process history
+	 */
+	if (!efqd->elv_slice_idle || !elv_ioq_idle_window(ioq))
+		return;
+
+	/*
+	 * may be iosched got its own idling logic. In that case io
+	 * schduler will take care of arming the timer, if need be.
+	 */
+	if (q->elevator->ops->elevator_arm_slice_timer_fn) {
+		q->elevator->ops->elevator_arm_slice_timer_fn(q,
+						ioq->sched_queue);
+	} else {
+		elv_mark_ioq_wait_request(ioq);
+		sl = efqd->elv_slice_idle;
+		mod_timer(&efqd->idle_slice_timer, jiffies + sl);
+		elv_log(efqd, "arm idle: %lu", sl);
+	}
+}
+
+void elv_free_idle_ioq_list(struct elevator_queue *e)
+{
+	struct io_queue *ioq, *n;
+	struct elv_fq_data *efqd = &e->efqd;
+
+	list_for_each_entry_safe(ioq, n, &efqd->idle_list, queue_list)
+		elv_deactivate_ioq(efqd, ioq, 0);
+}
+
+/* Common layer function to select the next queue to dispatch from */
+void *elv_fq_select_ioq(struct request_queue *q, int force)
+{
+	struct elv_fq_data *efqd = &q->elevator->efqd;
+	struct io_queue *ioq = elv_active_ioq(q->elevator);
+	int budget_update = 1;
+
+	if (!elv_nr_busy_ioq(q->elevator))
+		return NULL;
+
+	if (ioq == NULL)
+		goto new_queue;
+
+	/*
+	 * Force dispatch. Continue to dispatch from current queue as long
+	 * as it has requests.
+	 */
+	if (unlikely(force)) {
+		if (ioq->nr_queued)
+			goto keep_queue;
+		else {
+			/*
+			 * Don't try to update queue's budget based on forced
+			 * dispatch bahavior.
+			 */
+			budget_update = 0;
+			goto expire;
+		}
+	}
+
+	/*
+	 * The active queue has run out of time, expire it and select new.
+	 */
+	if (elv_ioq_slice_used(ioq))
+		goto expire;
+
+	/*
+	 * If we have a RT cfqq waiting, then we pre-empt the current non-rt
+	 * cfqq.
+	 */
+	if (!elv_ioq_class_rt(ioq) && efqd->busy_rt_queues) {
+		/*
+		 * We simulate this as cfqq timed out so that it gets to bank
+		 * the remaining of its time slice.
+		 */
+		elv_log_ioq(efqd, ioq, "preempt");
+
+		/* Don't do budget adjustments for queue being preempted. */
+		budget_update = 0;
+		goto expire;
+	}
+
+	/*
+	 * The active queue has requests and isn't expired, allow it to
+	 * dispatch.
+	 */
+
+	if (ioq->nr_queued)
+		goto keep_queue;
+
+	/*
+	 * No requests pending. If the active queue still has requests in
+	 * flight or is idling for a new request, allow either of these
+	 * conditions to happen (or time out) before selecting a new queue.
+	 */
+
+	if (timer_pending(&efqd->idle_slice_timer) ||
+	    (elv_ioq_nr_dispatched(ioq) && elv_ioq_idle_window(ioq))) {
+		ioq = NULL;
+		goto keep_queue;
+	}
+
+expire:
+	elv_ioq_slice_expired(q, budget_update);
+new_queue:
+	ioq = elv_set_active_ioq(q);
+keep_queue:
+	return ioq;
+}
+
+/* A request got removed from io_queue. Do the accounting */
+void elv_ioq_request_removed(struct elevator_queue *e, struct request *rq)
+{
+	struct io_queue *ioq;
+	struct elv_fq_data *efqd;
+
+	if (!elv_iosched_fair_queuing_enabled(e))
+		return;
+
+	ioq = rq->ioq;
+	BUG_ON(!ioq);
+	ioq->nr_queued--;
+
+	efqd = ioq->efqd;
+	BUG_ON(!efqd);
+	efqd->rq_queued--;
+
+	if (elv_ioq_busy(ioq) && (elv_active_ioq(e) != ioq) && !ioq->nr_queued)
+		elv_del_ioq_busy(e, ioq, 1);
+}
+
+/* A request got dispatched. Do the accounting. */
+void elv_fq_dispatched_request(struct elevator_queue *e, struct request *rq)
+{
+	struct io_queue *ioq = rq->ioq;
+
+	if (!elv_iosched_fair_queuing_enabled(e))
+		return;
+
+	BUG_ON(!ioq);
+	elv_ioq_request_dispatched(ioq);
+	elv_ioq_request_removed(e, rq);
+}
+
+void elv_fq_activate_rq(struct request_queue *q, struct request *rq)
+{
+	struct elv_fq_data *efqd = &q->elevator->efqd;
+
+	if (!elv_iosched_fair_queuing_enabled(q->elevator))
+		return;
+
+	efqd->rq_in_driver++;
+	elv_log_ioq(efqd, rq_ioq(rq), "activate rq, drv=%d",
+						efqd->rq_in_driver);
+}
+
+void elv_fq_deactivate_rq(struct request_queue *q, struct request *rq)
+{
+	struct elv_fq_data *efqd = &q->elevator->efqd;
+
+	if (!elv_iosched_fair_queuing_enabled(q->elevator))
+		return;
+
+	WARN_ON(!efqd->rq_in_driver);
+	efqd->rq_in_driver--;
+	elv_log_ioq(efqd, rq_ioq(rq), "deactivate rq, drv=%d",
+						efqd->rq_in_driver);
+}
+
+/*
+ * Update hw_tag based on peak queue depth over 50 samples under
+ * sufficient load.
+ */
+static void elv_update_hw_tag(struct elv_fq_data *efqd)
+{
+	if (efqd->rq_in_driver > efqd->rq_in_driver_peak)
+		efqd->rq_in_driver_peak = efqd->rq_in_driver;
+
+	if (efqd->rq_queued <= ELV_HW_QUEUE_MIN &&
+	    efqd->rq_in_driver <= ELV_HW_QUEUE_MIN)
+		return;
+
+	if (efqd->hw_tag_samples++ < 50)
+		return;
+
+	if (efqd->rq_in_driver_peak >= ELV_HW_QUEUE_MIN)
+		efqd->hw_tag = 1;
+	else
+		efqd->hw_tag = 0;
+
+	efqd->hw_tag_samples = 0;
+	efqd->rq_in_driver_peak = 0;
+}
+
+/* A request got completed from io_queue. Do the accounting. */
+void elv_ioq_completed_request(struct request_queue *q, struct request *rq)
+{
+	const int sync = rq_is_sync(rq);
+	struct io_queue *ioq = rq->ioq;
+	struct elv_fq_data *efqd = &q->elevator->efqd;
+
+	if (!elv_iosched_fair_queuing_enabled(q->elevator))
+		return;
+
+	elv_log_ioq(efqd, ioq, "complete");
+
+	elv_update_hw_tag(efqd);
+
+	WARN_ON(!efqd->rq_in_driver);
+	efqd->rq_in_driver--;
+
+	WARN_ON(!ioq->dispatched);
+	ioq->dispatched--;
+
+	if (sync)
+		ioq->last_end_request = jiffies;
+
+	/*
+	 * If this is the active queue, check if it needs to be expired,
+	 * or if we want to idle in case it has no pending requests.
+	 */
+
+	if (elv_active_ioq(q->elevator) == ioq) {
+		if (elv_ioq_slice_new(ioq)) {
+			elv_ioq_set_prio_slice(q, ioq);
+			elv_clear_ioq_slice_new(ioq);
+		}
+		if (elv_ioq_slice_used(ioq) || elv_ioq_class_idle(ioq))
+			elv_ioq_slice_expired(q, 1);
+		else if (sync && !ioq->nr_queued)
+			elv_ioq_arm_slice_timer(q);
+	}
+
+	if (!efqd->rq_in_driver)
+		elv_schedule_dispatch(q);
+}
+
+struct io_group *io_lookup_io_group_current(struct request_queue *q)
+{
+	struct elv_fq_data *efqd = &q->elevator->efqd;
+
+	return efqd->root_group;
+}
+EXPORT_SYMBOL(io_lookup_io_group_current);
+
+void *io_group_async_queue_prio(struct io_group *iog, int ioprio_class,
+					int ioprio)
+{
+	struct io_queue *ioq = NULL;
+
+	switch (ioprio_class) {
+	case IOPRIO_CLASS_RT:
+		ioq = iog->async_queue[0][ioprio];
+		break;
+	case IOPRIO_CLASS_BE:
+		ioq = iog->async_queue[1][ioprio];
+		break;
+	case IOPRIO_CLASS_IDLE:
+		ioq = iog->async_idle_queue;
+		break;
+	default:
+		BUG();
+	}
+
+	if (ioq)
+		return ioq->sched_queue;
+	return NULL;
+}
+EXPORT_SYMBOL(io_group_async_queue_prio);
+
+void io_group_set_async_queue(struct io_group *iog, int ioprio_class,
+					int ioprio, struct io_queue *ioq)
+{
+	switch (ioprio_class) {
+	case IOPRIO_CLASS_RT:
+		iog->async_queue[0][ioprio] = ioq;
+		break;
+	case IOPRIO_CLASS_BE:
+		iog->async_queue[1][ioprio] = ioq;
+		break;
+	case IOPRIO_CLASS_IDLE:
+		iog->async_idle_queue = ioq;
+		break;
+	default:
+		BUG();
+	}
+
+	/*
+	 * Take the group reference and pin the queue. Group exit will
+	 * clean it up
+	 */
+	elv_get_ioq(ioq);
+}
+EXPORT_SYMBOL(io_group_set_async_queue);
+
+/*
+ * Release all the io group references to its async queues.
+ */
+void io_put_io_group_queues(struct elevator_queue *e, struct io_group *iog)
+{
+	int i, j;
+
+	for (i = 0; i < 2; i++)
+		for (j = 0; j < IOPRIO_BE_NR; j++)
+			elv_release_ioq(e, &iog->async_queue[i][j]);
+
+	/* Free up async idle queue */
+	elv_release_ioq(e, &iog->async_idle_queue);
+}
+
+struct io_group *io_alloc_root_group(struct request_queue *q,
+					struct elevator_queue *e, void *key)
+{
+	struct io_group *iog;
+	int i;
+
+	iog = kmalloc_node(sizeof(*iog), GFP_KERNEL | __GFP_ZERO, q->node);
+	if (iog == NULL)
+		return NULL;
+
+	for (i = 0; i < IO_IOPRIO_CLASSES; i++)
+		iog->sched_data.service_tree[i] = IO_SERVICE_TREE_INIT;
+
+	return iog;
+}
+
+void io_free_root_group(struct elevator_queue *e)
+{
+	struct io_group *iog = e->efqd.root_group;
+	io_put_io_group_queues(e, iog);
+	kfree(iog);
+}
+
+static void elv_slab_kill(void)
+{
+	/*
+	 * Caller already ensured that pending RCU callbacks are completed,
+	 * so we should have no busy allocations at this point.
+	 */
+	if (elv_ioq_pool)
+		kmem_cache_destroy(elv_ioq_pool);
+}
+
+static int __init elv_slab_setup(void)
+{
+	elv_ioq_pool = KMEM_CACHE(io_queue, 0);
+	if (!elv_ioq_pool)
+		goto fail;
+
+	return 0;
+fail:
+	elv_slab_kill();
+	return -ENOMEM;
+}
+
+/* Initialize fair queueing data associated with elevator */
+int elv_init_fq_data(struct request_queue *q, struct elevator_queue *e)
+{
+	struct io_group *iog;
+	struct elv_fq_data *efqd = &e->efqd;
+
+	if (!elv_iosched_fair_queuing_enabled(e))
+		return 0;
+
+	iog = io_alloc_root_group(q, e, efqd);
+	if (iog == NULL)
+		return 1;
+
+	efqd->root_group = iog;
+	efqd->queue = q;
+
+	init_timer(&efqd->idle_slice_timer);
+	efqd->idle_slice_timer.function = elv_idle_slice_timer;
+	efqd->idle_slice_timer.data = (unsigned long) efqd;
+
+	INIT_WORK(&efqd->unplug_work, elv_kick_queue);
+
+	INIT_LIST_HEAD(&efqd->idle_list);
+
+	efqd->elv_slice[0] = elv_slice_async;
+	efqd->elv_slice[1] = elv_slice_sync;
+	efqd->elv_slice_idle = elv_slice_idle;
+	efqd->hw_tag = 1;
+
+	return 0;
+}
+
+/*
+ * elv_exit_fq_data is called before we call elevator_exit_fn. Before
+ * we ask elevator to cleanup its queues, we do the cleanup here so
+ * that all the group and idle tree references to ioq are dropped. Later
+ * during elevator cleanup, ioc reference will be dropped which will lead
+ * to removal of ioscheduler queue as well as associated ioq object.
+ */
+void elv_exit_fq_data(struct elevator_queue *e)
+{
+	struct elv_fq_data *efqd = &e->efqd;
+	struct request_queue *q = efqd->queue;
+
+	if (!elv_iosched_fair_queuing_enabled(e))
+		return;
+
+	elv_shutdown_timer_wq(e);
+
+	spin_lock_irq(q->queue_lock);
+	/* This should drop all the idle tree references of ioq */
+	elv_free_idle_ioq_list(e);
+	spin_unlock_irq(q->queue_lock);
+
+	elv_shutdown_timer_wq(e);
+
+	BUG_ON(timer_pending(&efqd->idle_slice_timer));
+	io_free_root_group(e);
+}
+
+/*
+ * This is called after the io scheduler has cleaned up its data structres.
+ * I don't think that this function is required. Right now just keeping it
+ * because cfq cleans up timer and work queue again after freeing up
+ * io contexts. To me io scheduler has already been drained out, and all
+ * the active queue have already been expired so time and work queue should
+ * not been activated during cleanup process.
+ *
+ * Keeping it here for the time being. Will get rid of it later.
+ */
+void elv_exit_fq_data_post(struct elevator_queue *e)
+{
+	struct elv_fq_data *efqd = &e->efqd;
+
+	if (!elv_iosched_fair_queuing_enabled(e))
+		return;
+
+	elv_shutdown_timer_wq(e);
+	BUG_ON(timer_pending(&efqd->idle_slice_timer));
+}
+
+
+static int __init elv_fq_init(void)
+{
+	if (elv_slab_setup())
+		return -ENOMEM;
+
+	/* could be 0 on HZ < 1000 setups */
+
+	if (!elv_slice_async)
+		elv_slice_async = 1;
+
+	if (elv_slice_idle == 0)
+		elv_slice_idle = 1;
+
+	return 0;
+}
+
+module_init(elv_fq_init);
diff --git a/block/elevator-fq.h b/block/elevator-fq.h
new file mode 100644
index 0000000..b5a0d08
--- /dev/null
+++ b/block/elevator-fq.h
@@ -0,0 +1,479 @@
+/*
+ * BFQ: data structures and common functions prototypes.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *		      Paolo Valente <paolo.valente@unimore.it>
+ */
+
+#include <linux/blkdev.h>
+
+#ifndef _BFQ_SCHED_H
+#define _BFQ_SCHED_H
+
+#define IO_IOPRIO_CLASSES	3
+
+typedef u64 bfq_timestamp_t;
+typedef unsigned long bfq_weight_t;
+typedef unsigned long bfq_service_t;
+struct io_entity;
+struct io_queue;
+
+#ifdef CONFIG_ELV_FAIR_QUEUING
+
+/**
+ * struct bfq_service_tree - per ioprio_class service tree.
+ * @active: tree for active entities (i.e., those backlogged).
+ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
+ * @first_idle: idle entity with minimum F_i.
+ * @last_idle: idle entity with maximum F_i.
+ * @vtime: scheduler virtual time.
+ * @wsum: scheduler weight sum; active and idle entities contribute to it.
+ *
+ * Each service tree represents a B-WF2Q+ scheduler on its own.  Each
+ * ioprio_class has its own independent scheduler, and so its own
+ * bfq_service_tree.  All the fields are protected by the queue lock
+ * of the containing efqd.
+ */
+struct io_service_tree {
+	struct rb_root active;
+	struct rb_root idle;
+
+	struct io_entity *first_idle;
+	struct io_entity *last_idle;
+
+	bfq_timestamp_t vtime;
+	bfq_weight_t wsum;
+};
+
+/**
+ * struct bfq_sched_data - multi-class scheduler.
+ * @active_entity: entity under service.
+ * @next_active: head-of-the-line entity in the scheduler.
+ * @service_tree: array of service trees, one per ioprio_class.
+ *
+ * bfq_sched_data is the basic scheduler queue.  It supports three
+ * ioprio_classes, and can be used either as a toplevel queue or as
+ * an intermediate queue on a hierarchical setup.
+ * @next_active points to the active entity of the sched_data service
+ * trees that will be scheduled next.
+ *
+ * The supported ioprio_classes are the same as in CFQ, in descending
+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
+ * Requests from higher priority queues are served before all the
+ * requests from lower priority queues; among requests of the same
+ * queue requests are served according to B-WF2Q+.
+ * All the fields are protected by the queue lock of the containing bfqd.
+ */
+struct io_sched_data {
+	struct io_entity *active_entity;
+	struct io_service_tree service_tree[IO_IOPRIO_CLASSES];
+};
+
+/**
+ * struct bfq_entity - schedulable entity.
+ * @rb_node: service_tree member.
+ * @on_st: flag, true if the entity is on a tree (either the active or
+ *         the idle one of its service_tree).
+ * @finish: B-WF2Q+ finish timestamp (aka F_i).
+ * @start: B-WF2Q+ start timestamp (aka S_i).
+ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
+ * @min_start: minimum start time of the (active) subtree rooted at
+ *             this entity; used for O(log N) lookups into active trees.
+ * @service: service received during the last round of service.
+ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
+ * @weight: weight of the queue, calculated as IOPRIO_BE_NR - @ioprio.
+ * @parent: parent entity, for hierarchical scheduling.
+ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
+ *                 associated scheduler queue, %NULL on leaf nodes.
+ * @sched_data: the scheduler queue this entity belongs to.
+ * @ioprio: the ioprio in use.
+ * @new_ioprio: when an ioprio change is requested, the new ioprio value
+ * @ioprio_class: the ioprio_class in use.
+ * @new_ioprio_class: when an ioprio_class change is requested, the new
+ *                    ioprio_class value.
+ * @ioprio_changed: flag, true when the user requested an ioprio or
+ *                  ioprio_class change.
+ *
+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
+ * cgroup hierarchy) or a bfq_group into the upper level scheduler.  Each
+ * entity belongs to the sched_data of the parent group in the cgroup
+ * hierarchy.  Non-leaf entities have also their own sched_data, stored
+ * in @my_sched_data.
+ *
+ * Each entity stores independently its priority values; this would allow
+ * different weights on different devices, but this functionality is not
+ * exported to userspace by now.  Priorities are updated lazily, first
+ * storing the new values into the new_* fields, then setting the
+ * @ioprio_changed flag.  As soon as there is a transition in the entity
+ * state that allows the priority update to take place the effective and
+ * the requested priority values are synchronized.
+ *
+ * The weight value is calculated from the ioprio to export the same
+ * interface as CFQ.  When dealing with ``well-behaved'' queues (i.e.,
+ * queues that do not spend too much time to consume their budget and
+ * have true sequential behavior, and when there are no external factors
+ * breaking anticipation) the relative weights at each level of the
+ * cgroups hierarchy should be guaranteed.
+ * All the fields are protected by the queue lock of the containing bfqd.
+ */
+struct io_entity {
+	struct rb_node rb_node;
+
+	int on_st;
+
+	bfq_timestamp_t finish;
+	bfq_timestamp_t start;
+
+	struct rb_root *tree;
+
+	bfq_timestamp_t min_start;
+
+	bfq_service_t service, budget;
+	bfq_weight_t weight;
+
+	struct io_entity *parent;
+
+	struct io_sched_data *my_sched_data;
+	struct io_sched_data *sched_data;
+
+	unsigned short ioprio, new_ioprio;
+	unsigned short ioprio_class, new_ioprio_class;
+
+	int ioprio_changed;
+};
+
+/*
+ * A common structure embedded by every io scheduler into their respective
+ * queue structure.
+ */
+struct io_queue {
+	struct io_entity entity;
+	atomic_t ref;
+	unsigned int flags;
+
+	/* Pointer to generic elevator data structure */
+	struct elv_fq_data *efqd;
+	struct list_head queue_list;
+	pid_t pid;
+
+	/* Number of requests queued on this io queue */
+	unsigned long nr_queued;
+
+	/* Requests dispatched from this queue */
+	int dispatched;
+
+	/* Keep a track of think time of processes in this queue */
+	unsigned long last_end_request;
+	unsigned long ttime_total;
+	unsigned long ttime_samples;
+	unsigned long ttime_mean;
+
+	unsigned long slice_end;
+
+	/* Pointer to io scheduler's queue */
+	void *sched_queue;
+
+	/*
+	 * keeps a track of total slice time assigned to a queue for
+	 * debugging purposes.
+	 */
+	unsigned long total_service;
+};
+
+struct io_group {
+	struct io_sched_data sched_data;
+
+	/* async_queue and idle_queue are used only for cfq */
+	struct io_queue *async_queue[2][IOPRIO_BE_NR];
+	struct io_queue *async_idle_queue;
+};
+
+struct elv_fq_data {
+	struct io_group *root_group;
+
+	/* List of io queues on idle tree. */
+	struct list_head idle_list;
+
+	struct request_queue *queue;
+	unsigned int busy_queues;
+	/*
+	 * Used to track any pending rt requests so we can pre-empt current
+	 * non-RT cfqq in service when this value is non-zero.
+	 */
+	unsigned int busy_rt_queues;
+
+	/* Number of requests queued */
+	int rq_queued;
+
+	/* Pointer to the ioscheduler queue being served */
+	void *active_queue;
+
+	int rq_in_driver;
+	int hw_tag;
+	int hw_tag_samples;
+	int rq_in_driver_peak;
+
+	/*
+	 * elevator fair queuing layer has the capability to provide idling
+	 * for ensuring fairness for processes doing dependent reads.
+	 * This might be needed to ensure fairness among two processes doing
+	 * synchronous reads in two different cgroups. noop and deadline don't
+	 * have any notion of anticipation/idling. As of now, these are the
+	 * users of this functionality.
+	 */
+	unsigned int elv_slice_idle;
+	struct timer_list idle_slice_timer;
+	struct work_struct unplug_work;
+
+	unsigned int elv_slice[2];
+};
+
+extern int elv_slice_idle;
+extern int elv_slice_async;
+
+/* Logging facilities. */
+#define elv_log_ioq(efqd, ioq, fmt, args...) \
+	blk_add_trace_msg((efqd)->queue, "%d" fmt, (ioq)->pid, ##args)
+
+#define elv_log(efqd, fmt, args...) \
+	blk_add_trace_msg((efqd)->queue, "" fmt, ##args)
+
+#define ioq_sample_valid(samples)   ((samples) > 80)
+
+/* Some shared queue flag manipulation functions among elevators */
+
+enum elv_queue_state_flags {
+	ELV_QUEUE_FLAG_busy = 0,          /* has requests or is under service */
+	ELV_QUEUE_FLAG_sync,              /* synchronous queue */
+	ELV_QUEUE_FLAG_idle_window,	  /* elevator slice idling enabled */
+	ELV_QUEUE_FLAG_wait_request,	  /* waiting for a request */
+	ELV_QUEUE_FLAG_slice_new,	  /* no requests dispatched in slice */
+	ELV_QUEUE_FLAG_NR,
+};
+
+#define ELV_IO_QUEUE_FLAG_FNS(name)					\
+static inline void elv_mark_ioq_##name(struct io_queue *ioq)		\
+{                                                                       \
+	(ioq)->flags |= (1 << ELV_QUEUE_FLAG_##name);			\
+}                                                                       \
+static inline void elv_clear_ioq_##name(struct io_queue *ioq)		\
+{                                                                       \
+	(ioq)->flags &= ~(1 << ELV_QUEUE_FLAG_##name);			\
+}                                                                       \
+static inline int elv_ioq_##name(struct io_queue *ioq)         		\
+{                                                                       \
+	return ((ioq)->flags & (1 << ELV_QUEUE_FLAG_##name)) != 0;	\
+}
+
+ELV_IO_QUEUE_FLAG_FNS(busy)
+ELV_IO_QUEUE_FLAG_FNS(sync)
+ELV_IO_QUEUE_FLAG_FNS(wait_request)
+ELV_IO_QUEUE_FLAG_FNS(idle_window)
+ELV_IO_QUEUE_FLAG_FNS(slice_new)
+
+static inline struct io_service_tree *
+io_entity_service_tree(struct io_entity *entity)
+{
+	struct io_sched_data *sched_data = entity->sched_data;
+	unsigned int idx = entity->ioprio_class - 1;
+
+	BUG_ON(idx >= IO_IOPRIO_CLASSES);
+	BUG_ON(sched_data == NULL);
+
+	return sched_data->service_tree + idx;
+}
+
+/* A request got dispatched from the io_queue. Do the accounting. */
+static inline void elv_ioq_request_dispatched(struct io_queue *ioq)
+{
+	ioq->dispatched++;
+}
+
+static inline int elv_ioq_slice_used(struct io_queue *ioq)
+{
+	if (elv_ioq_slice_new(ioq))
+		return 0;
+	if (time_before(jiffies, ioq->slice_end))
+		return 0;
+
+	return 1;
+}
+
+/* How many request are currently dispatched from the queue */
+static inline int elv_ioq_nr_dispatched(struct io_queue *ioq)
+{
+	return ioq->dispatched;
+}
+
+static inline pid_t elv_ioq_pid(struct io_queue *ioq)
+{
+	return ioq->pid;
+}
+
+static inline unsigned long elv_ioq_ttime_mean(struct io_queue *ioq)
+{
+	return ioq->ttime_mean;
+}
+
+static inline unsigned long elv_ioq_sample_valid(struct io_queue *ioq)
+{
+	return ioq_sample_valid(ioq->ttime_samples);
+}
+
+static inline void elv_get_ioq(struct io_queue *ioq)
+{
+	atomic_inc(&ioq->ref);
+}
+
+static inline void elv_ioq_set_slice_end(struct io_queue *ioq,
+						unsigned long slice_end)
+{
+	ioq->slice_end = slice_end;
+}
+
+static inline int elv_ioq_class_idle(struct io_queue *ioq)
+{
+	return ioq->entity.ioprio_class == IOPRIO_CLASS_IDLE;
+}
+
+static inline int elv_ioq_class_rt(struct io_queue *ioq)
+{
+	return ioq->entity.ioprio_class == IOPRIO_CLASS_RT;
+}
+
+static inline int elv_ioq_ioprio_class(struct io_queue *ioq)
+{
+	return ioq->entity.new_ioprio_class;
+}
+
+static inline int elv_ioq_ioprio(struct io_queue *ioq)
+{
+	return ioq->entity.new_ioprio;
+}
+
+static inline void elv_ioq_set_ioprio_class(struct io_queue *ioq,
+						int ioprio_class)
+{
+	ioq->entity.new_ioprio_class = ioprio_class;
+	ioq->entity.ioprio_changed = 1;
+}
+
+static inline void elv_ioq_set_ioprio(struct io_queue *ioq, int ioprio)
+{
+	ioq->entity.new_ioprio = ioprio;
+	ioq->entity.ioprio_changed = 1;
+}
+
+static inline void *ioq_sched_queue(struct io_queue *ioq)
+{
+	if (ioq)
+		return ioq->sched_queue;
+	return NULL;
+}
+
+static inline struct io_group *ioq_to_io_group(struct io_queue *ioq)
+{
+	return container_of(ioq->entity.sched_data, struct io_group,
+						sched_data);
+}
+
+/* Functions used by blksysfs.c */
+extern ssize_t elv_slice_idle_show(struct request_queue *q, char *name);
+extern ssize_t elv_slice_idle_store(struct request_queue *q, const char *name,
+						size_t count);
+
+/* Functions used by elevator.c */
+extern int elv_init_fq_data(struct request_queue *q, struct elevator_queue *e);
+extern void elv_exit_fq_data(struct elevator_queue *e);
+extern void elv_exit_fq_data_post(struct elevator_queue *e);
+
+extern void elv_ioq_request_add(struct request_queue *q, struct request *rq);
+extern void elv_ioq_request_removed(struct elevator_queue *e,
+					struct request *rq);
+extern void elv_fq_dispatched_request(struct elevator_queue *e,
+					struct request *rq);
+
+extern void elv_fq_activate_rq(struct request_queue *q, struct request *rq);
+extern void elv_fq_deactivate_rq(struct request_queue *q, struct request *rq);
+
+extern void elv_ioq_completed_request(struct request_queue *q,
+				struct request *rq);
+
+extern void *elv_fq_select_ioq(struct request_queue *q, int force);
+extern struct io_queue *rq_ioq(struct request *rq);
+
+/* Functions used by io schedulers */
+extern void elv_put_ioq(struct io_queue *ioq);
+extern void __elv_ioq_slice_expired(struct request_queue *q,
+					struct io_queue *ioq, int timed_out);
+extern int elv_init_ioq(struct elevator_queue *eq, struct io_queue *ioq,
+		void *sched_queue, int ioprio_class, int ioprio, int is_sync);
+extern void elv_schedule_dispatch(struct request_queue *q);
+extern int elv_hw_tag(struct elevator_queue *e);
+extern void *elv_active_sched_queue(struct elevator_queue *e);
+extern int elv_mod_idle_slice_timer(struct elevator_queue *eq,
+					unsigned long expires);
+extern int elv_del_idle_slice_timer(struct elevator_queue *eq);
+extern unsigned int elv_get_slice_idle(struct elevator_queue *eq);
+extern void *io_group_async_queue_prio(struct io_group *iog, int ioprio_class,
+					int ioprio);
+extern void io_group_set_async_queue(struct io_group *iog, int ioprio_class,
+					int ioprio, struct io_queue *ioq);
+extern struct io_group *io_lookup_io_group_current(struct request_queue *q);
+extern int elv_nr_busy_ioq(struct elevator_queue *e);
+extern int elv_nr_busy_rt_ioq(struct elevator_queue *e);
+extern struct io_queue *elv_alloc_ioq(struct request_queue *q, gfp_t gfp_mask);
+extern void elv_free_ioq(struct io_queue *ioq);
+
+#else /* CONFIG_ELV_FAIR_QUEUING */
+
+static inline int elv_init_fq_data(struct request_queue *q,
+					struct elevator_queue *e)
+{
+	return 0;
+}
+
+static inline void elv_exit_fq_data(struct elevator_queue *e) {}
+static inline void elv_exit_fq_data_post(struct elevator_queue *e) {}
+
+static inline void elv_fq_activate_rq(struct request_queue *q,
+					struct request *rq)
+{
+}
+
+static inline void elv_fq_deactivate_rq(struct request_queue *q,
+					struct request *rq)
+{
+}
+
+static inline void elv_fq_dispatched_request(struct elevator_queue *e,
+						struct request *rq)
+{
+}
+
+static inline void elv_ioq_request_removed(struct elevator_queue *e,
+						struct request *rq)
+{
+}
+
+static inline void elv_ioq_request_add(struct request_queue *q,
+					struct request *rq)
+{
+}
+
+static inline void elv_ioq_completed_request(struct request_queue *q,
+						struct request *rq)
+{
+}
+
+static inline void *ioq_sched_queue(struct io_queue *ioq) { return NULL; }
+static inline struct io_queue *rq_ioq(struct request *rq) { return NULL; }
+static inline void *elv_fq_select_ioq(struct request_queue *q, int force)
+{
+	return NULL;
+}
+#endif /* CONFIG_ELV_FAIR_QUEUING */
+#endif /* _BFQ_SCHED_H */
diff --git a/block/elevator.c b/block/elevator.c
index 98259ed..7a3a7e9 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -231,6 +231,9 @@ static struct elevator_queue *elevator_alloc(struct request_queue *q,
 	for (i = 0; i < ELV_HASH_ENTRIES; i++)
 		INIT_HLIST_HEAD(&eq->hash[i]);
 
+	if (elv_init_fq_data(q, eq))
+		goto err;
+
 	return eq;
 err:
 	kfree(eq);
@@ -301,9 +304,11 @@ EXPORT_SYMBOL(elevator_init);
 void elevator_exit(struct elevator_queue *e)
 {
 	mutex_lock(&e->sysfs_lock);
+	elv_exit_fq_data(e);
 	if (e->ops->elevator_exit_fn)
 		e->ops->elevator_exit_fn(e);
 	e->ops = NULL;
+	elv_exit_fq_data_post(e);
 	mutex_unlock(&e->sysfs_lock);
 
 	kobject_put(&e->kobj);
@@ -314,6 +319,8 @@ static void elv_activate_rq(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
+	elv_fq_activate_rq(q, rq);
+
 	if (e->ops->elevator_activate_req_fn)
 		e->ops->elevator_activate_req_fn(q, rq);
 }
@@ -322,6 +329,8 @@ static void elv_deactivate_rq(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
+	elv_fq_deactivate_rq(q, rq);
+
 	if (e->ops->elevator_deactivate_req_fn)
 		e->ops->elevator_deactivate_req_fn(q, rq);
 }
@@ -446,6 +455,7 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
 	elv_rqhash_del(q, rq);
 
 	q->nr_sorted--;
+	elv_fq_dispatched_request(q->elevator, rq);
 
 	boundary = q->end_sector;
 	stop_flags = REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_STARTED;
@@ -486,6 +496,7 @@ void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
 	elv_rqhash_del(q, rq);
 
 	q->nr_sorted--;
+	elv_fq_dispatched_request(q->elevator, rq);
 
 	q->end_sector = rq_end_sector(rq);
 	q->boundary_rq = rq;
@@ -553,6 +564,7 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
 	elv_rqhash_del(q, next);
 
 	q->nr_sorted--;
+	elv_ioq_request_removed(e, next);
 	q->last_merge = rq;
 }
 
@@ -632,12 +644,8 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 				q->last_merge = rq;
 		}
 
-		/*
-		 * Some ioscheds (cfq) run q->request_fn directly, so
-		 * rq cannot be accessed after calling
-		 * elevator_add_req_fn.
-		 */
 		q->elevator->ops->elevator_add_req_fn(q, rq);
+		elv_ioq_request_add(q, rq);
 		break;
 
 	case ELEVATOR_INSERT_REQUEUE:
@@ -847,13 +855,12 @@ void elv_dequeue_request(struct request_queue *q, struct request *rq)
 
 int elv_queue_empty(struct request_queue *q)
 {
-	struct elevator_queue *e = q->elevator;
-
 	if (!list_empty(&q->queue_head))
 		return 0;
 
-	if (e->ops->elevator_queue_empty_fn)
-		return e->ops->elevator_queue_empty_fn(q);
+	/* Hopefully nr_sorted works and no need to call queue_empty_fn */
+	if (q->nr_sorted)
+		return 0;
 
 	return 1;
 }
@@ -928,8 +935,11 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
 	 */
 	if (blk_account_rq(rq)) {
 		q->in_flight--;
-		if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn)
-			e->ops->elevator_completed_req_fn(q, rq);
+		if (blk_sorted_rq(rq)) {
+			if (e->ops->elevator_completed_req_fn)
+				e->ops->elevator_completed_req_fn(q, rq);
+			elv_ioq_completed_request(q, rq);
+		}
 	}
 
 	/*
@@ -1228,3 +1238,17 @@ struct request *elv_rb_latter_request(struct request_queue *q,
 	return NULL;
 }
 EXPORT_SYMBOL(elv_rb_latter_request);
+
+/* Get the io scheduler queue pointer. For cfq, it is stored in rq->ioq*/
+void *elv_get_sched_queue(struct request_queue *q, struct request *rq)
+{
+	return ioq_sched_queue(rq_ioq(rq));
+}
+EXPORT_SYMBOL(elv_get_sched_queue);
+
+/* Select an ioscheduler queue to dispatch request from. */
+void *elv_select_sched_queue(struct request_queue *q, int force)
+{
+	return ioq_sched_queue(elv_fq_select_ioq(q, force));
+}
+EXPORT_SYMBOL(elv_select_sched_queue);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 465d6ba..cf02216 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -234,6 +234,11 @@ struct request {
 
 	/* for bidi */
 	struct request *next_rq;
+
+#ifdef CONFIG_ELV_FAIR_QUEUING
+	/* io queue request belongs to */
+	struct io_queue *ioq;
+#endif
 };
 
 static inline unsigned short req_get_ioprio(struct request *req)
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 7a20425..6f2dea5 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -2,6 +2,7 @@
 #define _LINUX_ELEVATOR_H
 
 #include <linux/percpu.h>
+#include "../../block/elevator-fq.h"
 
 #ifdef CONFIG_BLOCK
 
@@ -29,6 +30,16 @@ typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct reques
 
 typedef void *(elevator_init_fn) (struct request_queue *);
 typedef void (elevator_exit_fn) (struct elevator_queue *);
+#ifdef CONFIG_ELV_FAIR_QUEUING
+typedef void (elevator_free_sched_queue_fn) (struct elevator_queue*, void *);
+typedef void (elevator_active_ioq_set_fn) (struct request_queue*, void *);
+typedef void (elevator_active_ioq_reset_fn) (struct request_queue *);
+typedef void (elevator_arm_slice_timer_fn) (struct request_queue*, void*);
+typedef int (elevator_should_preempt_fn) (struct request_queue*, void*,
+						struct request*);
+typedef int (elevator_update_idle_window_fn) (struct elevator_queue*, void*,
+						struct request*);
+#endif
 
 struct elevator_ops
 {
@@ -56,6 +67,16 @@ struct elevator_ops
 	elevator_init_fn *elevator_init_fn;
 	elevator_exit_fn *elevator_exit_fn;
 	void (*trim)(struct io_context *);
+
+#ifdef CONFIG_ELV_FAIR_QUEUING
+	elevator_free_sched_queue_fn *elevator_free_sched_queue_fn;
+	elevator_active_ioq_set_fn *elevator_active_ioq_set_fn;
+	elevator_active_ioq_reset_fn *elevator_active_ioq_reset_fn;
+
+	elevator_arm_slice_timer_fn *elevator_arm_slice_timer_fn;
+	elevator_should_preempt_fn *elevator_should_preempt_fn;
+	elevator_update_idle_window_fn *elevator_update_idle_window_fn;
+#endif
 };
 
 #define ELV_NAME_MAX	(16)
@@ -76,6 +97,9 @@ struct elevator_type
 	struct elv_fs_entry *elevator_attrs;
 	char elevator_name[ELV_NAME_MAX];
 	struct module *elevator_owner;
+#ifdef CONFIG_ELV_FAIR_QUEUING
+	int elevator_features;
+#endif
 };
 
 /*
@@ -89,6 +113,10 @@ struct elevator_queue
 	struct elevator_type *elevator_type;
 	struct mutex sysfs_lock;
 	struct hlist_head *hash;
+#ifdef CONFIG_ELV_FAIR_QUEUING
+	/* fair queuing data */
+	struct elv_fq_data efqd;
+#endif
 };
 
 /*
@@ -208,5 +236,25 @@ enum {
 	__val;							\
 })
 
+/* iosched can let elevator know their feature set/capability */
+#ifdef CONFIG_ELV_FAIR_QUEUING
+
+/* iosched wants to use fq logic of elevator layer */
+#define	ELV_IOSCHED_NEED_FQ	1
+
+static inline int elv_iosched_fair_queuing_enabled(struct elevator_queue *e)
+{
+	return (e->elevator_type->elevator_features) & ELV_IOSCHED_NEED_FQ;
+}
+
+#else /* ELV_IOSCHED_FAIR_QUEUING */
+
+static inline int elv_iosched_fair_queuing_enabled(struct elevator_queue *e)
+{
+	return 0;
+}
+#endif /* ELV_IOSCHED_FAIR_QUEUING */
+extern void *elv_get_sched_queue(struct request_queue *q, struct request *rq);
+extern void *elv_select_sched_queue(struct request_queue *q, int force);
 #endif /* CONFIG_BLOCK */
 #endif
-- 
1.6.0.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/