Date: Mon, 22 Jun 2009 14:16:12 +0530
From: Balbir Singh <balbir@linux.vnet.ibm.com>
To: Vivek Goyal <vgoyal@redhat.com>
Cc: linux-kernel@vger.kernel.org, containers@lists.linux-foundation.org,
       dm-devel@redhat.com, jens.axboe@oracle.com, nauman@google.com,
       dpshah@google.com, lizf@cn.fujitsu.com, mikew@google.com,
       fchecconi@gmail.com, paolo.valente@unimore.it, ryov@valinux.co.jp,
       fernando@oss.ntt.co.jp, s-uchida@ap.jp.nec.com, taka@valinux.co.jp,
       guijianfeng@cn.fujitsu.com, jmoyer@redhat.com,
       dhaval@linux.vnet.ibm.com, righi.andrea@gmail.com,
       m-ikeda@ds.jp.nec.com, jbaron@redhat.com, agk@redhat.com,
       snitzer@redhat.com, akpm@linux-foundation.org, peterz@infradead.org
Subject: Re: [PATCH 02/20] io-controller: Common flat fair queuing code in
	elevaotor layer
Message-ID: <20090622084612.GD3728@balbir.in.ibm.com>
Reply-To: balbir@linux.vnet.ibm.com
References: <1245443858-8487-1-git-send-email-vgoyal@redhat.com> <1245443858-8487-3-git-send-email-vgoyal@redhat.com>
MIME-Version: 1.0
Content-Type: text/plain; charset=iso-8859-1
Content-Disposition: inline
In-Reply-To: <1245443858-8487-3-git-send-email-vgoyal@redhat.com>
User-Agent: Mutt/1.5.18 (2008-05-17)
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 90260
Lines: 2941

* Vivek Goyal <vgoyal@redhat.com> [2009-06-19 16:37:20]:

> This is common fair queuing code in elevator layer. This is controlled by
> config option CONFIG_ELV_FAIR_QUEUING. This patch initially only introduces
> flat fair queuing support where there is only one group, "root group" and all
> the tasks belong to root group.
> 
> This elevator layer changes are backward compatible. That means any ioscheduler
> using old interfaces will continue to work.
> 
> This code is essentially the CFQ code for fair queuing. The primary difference
> is that flat rounding robin algorithm of CFQ has been replaced with BFQ (WF2Q+).
>

The patch is quite long and to be honest requires a long time to
review, which I don't mind. I suspect my frequently diverted mind is
likely to miss a lot in a big patch like this. Could you consider
splitting this further if possible. I think you'll notice the number
of reviews will also increase.
 
> Signed-off-by: Nauman Rafique <nauman@google.com>
> Signed-off-by: Fabio Checconi <fabio@gandalf.sssup.it>
> Signed-off-by: Paolo Valente <paolo.valente@unimore.it>
> Signed-off-by: Aristeu Rozanski <aris@redhat.com>
> Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
> Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
> ---
>  block/Kconfig.iosched    |   13 +
>  block/Makefile           |    1 +
>  block/elevator-fq.c      | 2015 ++++++++++++++++++++++++++++++++++++++++++++++
>  block/elevator-fq.h      |  473 +++++++++++
>  block/elevator.c         |   46 +-
>  include/linux/blkdev.h   |    5 +
>  include/linux/elevator.h |   51 ++
>  7 files changed, 2593 insertions(+), 11 deletions(-)
>  create mode 100644 block/elevator-fq.c
>  create mode 100644 block/elevator-fq.h
> 
> diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
> index 7e803fc..3398134 100644
> --- a/block/Kconfig.iosched
> +++ b/block/Kconfig.iosched
> @@ -2,6 +2,19 @@ if BLOCK
> 
>  menu "IO Schedulers"
> 
> +config ELV_FAIR_QUEUING
> +	bool "Elevator Fair Queuing Support"
> +	default n
> +	---help---
> +	  Traditionally only cfq had notion of multiple queues and it did
> +	  fair queuing at its own. With the cgroups and need of controlling
> +	  IO, now even the simple io schedulers like noop, deadline, as will
> +	  have one queue per cgroup and will need hierarchical fair queuing.
> +	  Instead of every io scheduler implementing its own fair queuing
> +	  logic, this option enables fair queuing in elevator layer so that
> +	  other ioschedulers can make use of it.
> +	  If unsure, say N.
> +
>  config IOSCHED_NOOP
>  	bool
>  	default y
> diff --git a/block/Makefile b/block/Makefile
> index e9fa4dd..94bfc6e 100644
> --- a/block/Makefile
> +++ b/block/Makefile
> @@ -15,3 +15,4 @@ obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
> 
>  obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
>  obj-$(CONFIG_BLK_DEV_INTEGRITY)	+= blk-integrity.o
> +obj-$(CONFIG_ELV_FAIR_QUEUING)	+= elevator-fq.o
> diff --git a/block/elevator-fq.c b/block/elevator-fq.c
> new file mode 100644
> index 0000000..9357fb0
> --- /dev/null
> +++ b/block/elevator-fq.c
> @@ -0,0 +1,2015 @@
> +/*
> + * BFQ: Hierarchical B-WF2Q+ scheduler.
> + *
> + * Based on ideas and code from CFQ:
> + * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
> + *
> + * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
> + *		      Paolo Valente <paolo.valente@unimore.it>
> + * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
> + * 	              Nauman Rafique <nauman@google.com>
> + */
> +
> +#include <linux/blkdev.h>
> +#include "elevator-fq.h"
> +#include <linux/blktrace_api.h>
> +
> +/* Values taken from cfq */
> +const int elv_slice_sync = HZ / 10;
> +int elv_slice_async = HZ / 25;
> +const int elv_slice_async_rq = 2;
> +int elv_slice_idle = HZ / 125;
> +static struct kmem_cache *elv_ioq_pool;
> +
> +#define ELV_SLICE_SCALE		(5)
> +#define ELV_HW_QUEUE_MIN	(5)
> +#define IO_SERVICE_TREE_INIT   ((struct io_service_tree)		\
> +				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
> +
> +static inline struct io_queue *elv_close_cooperator(struct request_queue *q,
> +					struct io_queue *ioq, int probe);
> +struct io_entity *bfq_lookup_next_entity(struct io_sched_data *sd,
> +						 int extract);
> +
> +static inline int elv_prio_slice(struct elv_fq_data *efqd, int sync,
> +					unsigned short prio)

Why is the return type int and not unsigned int or unsigned long? Can
the return value ever be negative?

> +{
> +	const int base_slice = efqd->elv_slice[sync];
> +
> +	WARN_ON(prio >= IOPRIO_BE_NR);
> +
> +	return base_slice + (base_slice/ELV_SLICE_SCALE * (4 - prio));
> +}
> +
> +static inline int
> +elv_prio_to_slice(struct elv_fq_data *efqd, struct io_queue *ioq)
> +{
> +	return elv_prio_slice(efqd, elv_ioq_sync(ioq), ioq->entity.ioprio);
> +}
> +
> +/* Mainly the BFQ scheduling code Follows */
> +
> +/*
> + * Shift for timestamp calculations.  This actually limits the maximum
> + * service allowed in one timestamp delta (small shift values increase it),
> + * the maximum total weight that can be used for the queues in the system
> + * (big shift values increase it), and the period of virtual time wraparounds.
> + */
> +#define WFQ_SERVICE_SHIFT	22
> +
> +/**
> + * bfq_gt - compare two timestamps.
> + * @a: first ts.
> + * @b: second ts.
> + *
> + * Return @a > @b, dealing with wrapping correctly.
> + */
> +static inline int bfq_gt(bfq_timestamp_t a, bfq_timestamp_t b)
> +{
> +	return (s64)(a - b) > 0;
> +}
> +

a and b are of type u64, but cast to s64 to deal with wrapping?
Correct?

> +/**
> + * bfq_delta - map service into the virtual time domain.
> + * @service: amount of service.
> + * @weight: scale factor.
> + */
> +static inline bfq_timestamp_t bfq_delta(bfq_service_t service,
> +					bfq_weight_t weight)
> +{
> +	bfq_timestamp_t d = (bfq_timestamp_t)service << WFQ_SERVICE_SHIFT;
> +

Why is the case required? Does the compiler complain? service is
already of the correct type.

> +	do_div(d, weight);

On a 64 system both d and weight are 64 bit, but on a 32 bit system
weight is 32 bits. do_div expects a 64 bit dividend and 32 bit divisor
- no?

> +	return d;
> +}
> +
> +/**
> + * bfq_calc_finish - assign the finish time to an entity.
> + * @entity: the entity to act upon.
> + * @service: the service to be charged to the entity.
> + */
> +static inline void bfq_calc_finish(struct io_entity *entity,
> +				   bfq_service_t service)
> +{
> +	BUG_ON(entity->weight == 0);
> +
> +	entity->finish = entity->start + bfq_delta(service, entity->weight);
> +}

Should we BUG_ON (entity->finish == entity->start)? Or is that
expected when the entity has no service time left.

> +
> +static inline struct io_queue *io_entity_to_ioq(struct io_entity *entity)
> +{
> +	struct io_queue *ioq = NULL;
> +
> +	BUG_ON(entity == NULL);
> +	if (entity->my_sched_data == NULL)
> +		ioq = container_of(entity, struct io_queue, entity);
> +	return ioq;
> +}
> +
> +/**
> + * bfq_entity_of - get an entity from a node.
> + * @node: the node field of the entity.
> + *
> + * Convert a node pointer to the relative entity.  This is used only
> + * to simplify the logic of some functions and not as the generic
> + * conversion mechanism because, e.g., in the tree walking functions,
> + * the check for a %NULL value would be redundant.
> + */
> +static inline struct io_entity *bfq_entity_of(struct rb_node *node)
> +{
> +	struct io_entity *entity = NULL;
> +
> +	if (node != NULL)
> +		entity = rb_entry(node, struct io_entity, rb_node);
> +
> +	return entity;
> +}
> +
> +/**
> + * bfq_extract - remove an entity from a tree.
> + * @root: the tree root.
> + * @entity: the entity to remove.
> + */
> +static inline void bfq_extract(struct rb_root *root, struct io_entity *entity)
> +{

Extract is not common terminology, why not use bfq_remove()?

> +	BUG_ON(entity->tree != root);
> +
> +	entity->tree = NULL;
> +	rb_erase(&entity->rb_node, root);

Don't you want to make entity->tree = NULL after rb_erase?

> +}
> +
> +/**
> + * bfq_idle_extract - extract an entity from the idle tree.
> + * @st: the service tree of the owning @entity.
> + * @entity: the entity being removed.
> + */
> +static void bfq_idle_extract(struct io_service_tree *st,
> +				struct io_entity *entity)
> +{
> +	struct rb_node *next;
> +
> +	BUG_ON(entity->tree != &st->idle);
> +
> +	if (entity == st->first_idle) {
> +		next = rb_next(&entity->rb_node);

What happens if next is NULL?

> +		st->first_idle = bfq_entity_of(next);
> +	}
> +
> +	if (entity == st->last_idle) {
> +		next = rb_prev(&entity->rb_node);

What happens if next is NULL?

> +		st->last_idle = bfq_entity_of(next);
> +	}
> +
> +	bfq_extract(&st->idle, entity);
> +}
> +
> +/**
> + * bfq_insert - generic tree insertion.
> + * @root: tree root.
> + * @entity: entity to insert.
> + *
> + * This is used for the idle and the active tree, since they are both
> + * ordered by finish time.
> + */
> +static void bfq_insert(struct rb_root *root, struct io_entity *entity)
> +{
> +	struct io_entity *entry;
> +	struct rb_node **node = &root->rb_node;
> +	struct rb_node *parent = NULL;
> +
> +	BUG_ON(entity->tree != NULL);
> +
> +	while (*node != NULL) {
> +		parent = *node;
> +		entry = rb_entry(parent, struct io_entity, rb_node);
> +
> +		if (bfq_gt(entry->finish, entity->finish))
> +			node = &parent->rb_left;
> +		else
> +			node = &parent->rb_right;
> +	}
> +
> +	rb_link_node(&entity->rb_node, parent, node);
> +	rb_insert_color(&entity->rb_node, root);
> +
> +	entity->tree = root;
> +}
> +
> +/**
> + * bfq_update_min - update the min_start field of a entity.
> + * @entity: the entity to update.
> + * @node: one of its children.
> + *
> + * This function is called when @entity may store an invalid value for
> + * min_start due to updates to the active tree.  The function  assumes
> + * that the subtree rooted at @node (which may be its left or its right
> + * child) has a valid min_start value.
> + */
> +static inline void bfq_update_min(struct io_entity *entity,
> +					struct rb_node *node)
> +{
> +	struct io_entity *child;
> +
> +	if (node != NULL) {
> +		child = rb_entry(node, struct io_entity, rb_node);
> +		if (bfq_gt(entity->min_start, child->min_start))
> +			entity->min_start = child->min_start;
> +	}
> +}

So.. we check to see if child's min_time is lesser than the root
entities or node entities and set it to the minimum of the two?
Can you use min_t here?

> +
> +/**
> + * bfq_update_active_node - recalculate min_start.
> + * @node: the node to update.
> + *
> + * @node may have changed position or one of its children may have moved,
> + * this function updates its min_start value.  The left and right subtrees
> + * are assumed to hold a correct min_start value.
> + */
> +static inline void bfq_update_active_node(struct rb_node *node)
> +{
> +	struct io_entity *entity = rb_entry(node, struct io_entity, rb_node);
> +
> +	entity->min_start = entity->start;
> +	bfq_update_min(entity, node->rb_right);
> +	bfq_update_min(entity, node->rb_left);
> +}

I don't like this every much, we set the min_time twice, this can be
easily optimized to look at both left and right child and pick the
minimum.

> +
> +/**
> + * bfq_update_active_tree - update min_start for the whole active tree.
> + * @node: the starting node.
> + *
> + * @node must be the deepest modified node after an update.  This function
> + * updates its min_start using the values held by its children, assuming
> + * that they did not change, and then updates all the nodes that may have
> + * changed in the path to the root.  The only nodes that may have changed
> + * are the ones in the path or their siblings.
> + */
> +static void bfq_update_active_tree(struct rb_node *node)
> +{
> +	struct rb_node *parent;
> +
> +up:
> +	bfq_update_active_node(node);
> +
> +	parent = rb_parent(node);
> +	if (parent == NULL)
> +		return;
> +
> +	if (node == parent->rb_left && parent->rb_right != NULL)
> +		bfq_update_active_node(parent->rb_right);
> +	else if (parent->rb_left != NULL)
> +		bfq_update_active_node(parent->rb_left);
> +
> +	node = parent;
> +	goto up;
> +}
> +

For these functions, take a look at the walk function in the group
scheduler that does update_shares

> +/**
> + * bfq_active_insert - insert an entity in the active tree of its group/device.
> + * @st: the service tree of the entity.
> + * @entity: the entity being inserted.
> + *
> + * The active tree is ordered by finish time, but an extra key is kept
> + * per each node, containing the minimum value for the start times of
> + * its children (and the node itself), so it's possible to search for
> + * the eligible node with the lowest finish time in logarithmic time.
> + */
> +static void bfq_active_insert(struct io_service_tree *st,
> +					struct io_entity *entity)
> +{
> +	struct rb_node *node = &entity->rb_node;
> +
> +	bfq_insert(&st->active, entity);
> +
> +	if (node->rb_left != NULL)
> +		node = node->rb_left;
> +	else if (node->rb_right != NULL)
> +		node = node->rb_right;
> +
> +	bfq_update_active_tree(node);
> +}
> +
> +/**
> + * bfq_ioprio_to_weight - calc a weight from an ioprio.
> + * @ioprio: the ioprio value to convert.
> + */
> +static bfq_weight_t bfq_ioprio_to_weight(int ioprio)
> +{
> +	WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
> +	return IOPRIO_BE_NR - ioprio;
> +}
> +
> +void bfq_get_entity(struct io_entity *entity)
> +{
> +	struct io_queue *ioq = io_entity_to_ioq(entity);
> +
> +	if (ioq)
> +		elv_get_ioq(ioq);
> +}
> +
> +void bfq_init_entity(struct io_entity *entity, struct io_group *iog)
> +{
> +	entity->ioprio = entity->new_ioprio;
> +	entity->ioprio_class = entity->new_ioprio_class;
> +	entity->sched_data = &iog->sched_data;
> +}
> +
> +/**
> + * bfq_find_deepest - find the deepest node that an extraction can modify.
> + * @node: the node being removed.
> + *
> + * Do the first step of an extraction in an rb tree, looking for the
> + * node that will replace @node, and returning the deepest node that
> + * the following modifications to the tree can touch.  If @node is the
> + * last node in the tree return %NULL.
> + */
> +static struct rb_node *bfq_find_deepest(struct rb_node *node)
> +{
> +	struct rb_node *deepest;
> +
> +	if (node->rb_right == NULL && node->rb_left == NULL)
> +		deepest = rb_parent(node);

Why is the parent the deepest? Shouldn't node be the deepest?

> +	else if (node->rb_right == NULL)
> +		deepest = node->rb_left;
> +	else if (node->rb_left == NULL)
> +		deepest = node->rb_right;
> +	else {
> +		deepest = rb_next(node);
> +		if (deepest->rb_right != NULL)
> +			deepest = deepest->rb_right;
> +		else if (rb_parent(deepest) != node)
> +			deepest = rb_parent(deepest);
> +	}
> +
> +	return deepest;
> +}

The function is not clear, could you please define deepest node
better?

> +
> +/**
> + * bfq_active_extract - remove an entity from the active tree.
> + * @st: the service_tree containing the tree.
> + * @entity: the entity being removed.
> + */
> +static void bfq_active_extract(struct io_service_tree *st,
> +				struct io_entity *entity)
> +{
> +	struct rb_node *node;
> +
> +	node = bfq_find_deepest(&entity->rb_node);
> +	bfq_extract(&st->active, entity);
> +
> +	if (node != NULL)
> +		bfq_update_active_tree(node);
> +}
> +

Just to check my understanding, every time an active node is
added/removed, we update the min_time of the entire tree.

> +/**
> + * bfq_idle_insert - insert an entity into the idle tree.
> + * @st: the service tree containing the tree.
> + * @entity: the entity to insert.
> + */
> +static void bfq_idle_insert(struct io_service_tree *st,
> +					struct io_entity *entity)
> +{
> +	struct io_entity *first_idle = st->first_idle;
> +	struct io_entity *last_idle = st->last_idle;
> +
> +	if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
> +		st->first_idle = entity;
> +	if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
> +		st->last_idle = entity;
> +
> +	bfq_insert(&st->idle, entity);
> +}
> +
> +/**
> + * bfq_forget_entity - remove an entity from the wfq trees.
> + * @st: the service tree.
> + * @entity: the entity being removed.
> + *
> + * Update the device status and forget everything about @entity, putting
> + * the device reference to it, if it is a queue.  Entities belonging to
> + * groups are not refcounted.
> + */
> +static void bfq_forget_entity(struct io_service_tree *st,
> +				struct io_entity *entity)
> +{
> +	struct io_queue *ioq = NULL;
> +
> +	BUG_ON(!entity->on_st);
> +	entity->on_st = 0;
> +	st->wsum -= entity->weight;
> +	ioq = io_entity_to_ioq(entity);
> +	if (!ioq)
> +		return;
> +	elv_put_ioq(ioq);
> +}
> +
> +/**
> + * bfq_put_idle_entity - release the idle tree ref of an entity.
> + * @st: service tree for the entity.
> + * @entity: the entity being released.
> + */
> +void bfq_put_idle_entity(struct io_service_tree *st,
> +				struct io_entity *entity)
> +{
> +	bfq_idle_extract(st, entity);
> +	bfq_forget_entity(st, entity);
> +}
> +
> +/**
> + * bfq_forget_idle - update the idle tree if necessary.
> + * @st: the service tree to act upon.
> + *
> + * To preserve the global O(log N) complexity we only remove one entry here;
> + * as the idle tree will not grow indefinitely this can be done safely.
> + */
> +void bfq_forget_idle(struct io_service_tree *st)
> +{
> +	struct io_entity *first_idle = st->first_idle;
> +	struct io_entity *last_idle = st->last_idle;
> +
> +	if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
> +	    !bfq_gt(last_idle->finish, st->vtime)) {
> +		/*
> +		 * Active tree is empty. Pull back vtime to finish time of
> +		 * last idle entity on idle tree.
> +		 * Rational seems to be that it reduces the possibility of
> +		 * vtime wraparound (bfq_gt(V-F) < 0).
> +		 */
> +		st->vtime = last_idle->finish;
> +	}
> +
> +	if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
> +		bfq_put_idle_entity(st, first_idle);
> +}
> +
> +
> +static struct io_service_tree *
> +__bfq_entity_update_prio(struct io_service_tree *old_st,
> +				struct io_entity *entity)
> +{
> +	struct io_service_tree *new_st = old_st;
> +	struct io_queue *ioq = io_entity_to_ioq(entity);
> +
> +	if (entity->ioprio_changed) {
> +		entity->ioprio = entity->new_ioprio;
> +		entity->ioprio_class = entity->new_ioprio_class;
> +		entity->ioprio_changed = 0;
> +
> +		/*
> +		 * Also update the scaled budget for ioq. Group will get the
> +		 * updated budget once ioq is selected to run next.
> +		 */
> +		if (ioq) {
> +			struct elv_fq_data *efqd = ioq->efqd;
> +			entity->budget = elv_prio_to_slice(efqd, ioq);
> +		}
> +
> +		old_st->wsum -= entity->weight;
> +		entity->weight = bfq_ioprio_to_weight(entity->ioprio);
> +
> +		/*
> +		 * NOTE: here we may be changing the weight too early,
> +		 * this will cause unfairness.  The correct approach
> +		 * would have required additional complexity to defer
> +		 * weight changes to the proper time instants (i.e.,
> +		 * when entity->finish <= old_st->vtime).
> +		 */
> +		new_st = io_entity_service_tree(entity);
> +		new_st->wsum += entity->weight;
> +
> +		if (new_st != old_st)
> +			entity->start = new_st->vtime;
> +	}
> +
> +	return new_st;
> +}
> +
> +/**
> + * __bfq_activate_entity - activate an entity.
> + * @entity: the entity being activated.
> + *
> + * Called whenever an entity is activated, i.e., it is not active and one
> + * of its children receives a new request, or has to be reactivated due to
> + * budget exhaustion.  It uses the current budget of the entity (and the
> + * service received if @entity is active) of the queue to calculate its
> + * timestamps.
> + */
> +static void __bfq_activate_entity(struct io_entity *entity, int add_front)
> +{
> +	struct io_sched_data *sd = entity->sched_data;
> +	struct io_service_tree *st = io_entity_service_tree(entity);
> +
> +	if (entity == sd->active_entity) {
> +		BUG_ON(entity->tree != NULL);
> +		/*
> +		 * If we are requeueing the current entity we have
> +		 * to take care of not charging to it service it has
> +		 * not received.
> +		 */
> +		bfq_calc_finish(entity, entity->service);
> +		entity->start = entity->finish;
> +		sd->active_entity = NULL;
> +	} else if (entity->tree == &st->active) {
> +		/*
> +		 * Requeueing an entity due to a change of some
> +		 * next_active entity below it.  We reuse the old
> +		 * start time.
> +		 */
> +		bfq_active_extract(st, entity);
> +	} else if (entity->tree == &st->idle) {
> +		/*
> +		 * Must be on the idle tree, bfq_idle_extract() will
> +		 * check for that.
> +		 */
> +		bfq_idle_extract(st, entity);
> +		entity->start = bfq_gt(st->vtime, entity->finish) ?
> +				       st->vtime : entity->finish;
> +	} else {
> +		/*
> +		 * The finish time of the entity may be invalid, and
> +		 * it is in the past for sure, otherwise the queue
> +		 * would have been on the idle tree.
> +		 */
> +		entity->start = st->vtime;
> +		st->wsum += entity->weight;
> +		bfq_get_entity(entity);
> +
> +		BUG_ON(entity->on_st);
> +		entity->on_st = 1;
> +	}
> +
> +	st = __bfq_entity_update_prio(st, entity);
> +	/*
> +	 * This is to emulate cfq like functionality where preemption can
> +	 * happen with-in same class, like sync queue preempting async queue
> +	 * May be this is not a very good idea from fairness point of view
> +	 * as preempting queue gains share. Keeping it for now.
> +	 */
> +	if (add_front) {
> +		struct io_entity *next_entity;
> +
> +		/*
> +		 * Determine the entity which will be dispatched next
> +		 * Use sd->next_active once hierarchical patch is applied
> +		 */
> +		next_entity = bfq_lookup_next_entity(sd, 0);
> +
> +		if (next_entity && next_entity != entity) {
> +			struct io_service_tree *new_st;
> +			bfq_timestamp_t delta;
> +
> +			new_st = io_entity_service_tree(next_entity);
> +
> +			/*
> +			 * At this point, both entities should belong to
> +			 * same service tree as cross service tree preemption
> +			 * is automatically taken care by algorithm
> +			 */
> +			BUG_ON(new_st != st);
> +			entity->finish = next_entity->finish - 1;
> +			delta = bfq_delta(entity->budget, entity->weight);
> +			entity->start = entity->finish - delta;
> +			if (bfq_gt(entity->start, st->vtime))
> +				entity->start = st->vtime;
> +		}
> +	} else {
> +		bfq_calc_finish(entity, entity->budget);
> +	}
> +	bfq_active_insert(st, entity);
> +}
> +
> +/**
> + * bfq_activate_entity - activate an entity.
> + * @entity: the entity to activate.
> + */
> +void bfq_activate_entity(struct io_entity *entity, int add_front)
> +{
> +	__bfq_activate_entity(entity, add_front);
> +}
> +
> +/**
> + * __bfq_deactivate_entity - deactivate an entity from its service tree.
> + * @entity: the entity to deactivate.
> + * @requeue: if false, the entity will not be put into the idle tree.
> + *
> + * Deactivate an entity, independently from its previous state.  If the
> + * entity was not on a service tree just return, otherwise if it is on
> + * any scheduler tree, extract it from that tree, and if necessary
> + * and if the caller did not specify @requeue, put it on the idle tree.
> + *
> + */
> +int __bfq_deactivate_entity(struct io_entity *entity, int requeue)
> +{
> +	struct io_sched_data *sd = entity->sched_data;
> +	struct io_service_tree *st = io_entity_service_tree(entity);
> +	int was_active = entity == sd->active_entity;
> +	int ret = 0;
> +
> +	if (!entity->on_st)
> +		return 0;
> +
> +	BUG_ON(was_active && entity->tree != NULL);
> +
> +	if (was_active) {
> +		bfq_calc_finish(entity, entity->service);
> +		sd->active_entity = NULL;
> +	} else if (entity->tree == &st->active)
> +		bfq_active_extract(st, entity);
> +	else if (entity->tree == &st->idle)
> +		bfq_idle_extract(st, entity);
> +	else if (entity->tree != NULL)
> +		BUG();
> +
> +	if (!requeue || !bfq_gt(entity->finish, st->vtime))
> +		bfq_forget_entity(st, entity);
> +	else
> +		bfq_idle_insert(st, entity);
> +
> +	BUG_ON(sd->active_entity == entity);
> +
> +	return ret;
> +}
> +
> +/**
> + * bfq_deactivate_entity - deactivate an entity.
> + * @entity: the entity to deactivate.
> + * @requeue: true if the entity can be put on the idle tree
> + */
> +void bfq_deactivate_entity(struct io_entity *entity, int requeue)
> +{
> +	__bfq_deactivate_entity(entity, requeue);
> +}
> +
> +/**
> + * bfq_update_vtime - update vtime if necessary.
> + * @st: the service tree to act upon.
> + *
> + * If necessary update the service tree vtime to have at least one
> + * eligible entity, skipping to its start time.  Assumes that the
> + * active tree of the device is not empty.
> + *
> + * NOTE: this hierarchical implementation updates vtimes quite often,
> + * we may end up with reactivated tasks getting timestamps after a
> + * vtime skip done because we needed a ->first_active entity on some
> + * intermediate node.
> + */
> +static void bfq_update_vtime(struct io_service_tree *st)
> +{
> +	struct io_entity *entry;
> +	struct rb_node *node = st->active.rb_node;
> +
> +	entry = rb_entry(node, struct io_entity, rb_node);
> +	if (bfq_gt(entry->min_start, st->vtime)) {
> +		st->vtime = entry->min_start;
> +		bfq_forget_idle(st);
> +	}
> +}
> +
> +/**
> + * bfq_first_active - find the eligible entity with the smallest finish time
> + * @st: the service tree to select from.
> + *
> + * This function searches the first schedulable entity, starting from the
> + * root of the tree and going on the left every time on this side there is
> + * a subtree with at least one eligible (start <= vtime) entity.  The path
> + * on the right is followed only if a) the left subtree contains no eligible
> + * entities and b) no eligible entity has been found yet.
> + */
> +static struct io_entity *bfq_first_active_entity(struct io_service_tree *st)
> +{
> +	struct io_entity *entry, *first = NULL;
> +	struct rb_node *node = st->active.rb_node;
> +
> +	while (node != NULL) {
> +		entry = rb_entry(node, struct io_entity, rb_node);
> +left:
> +		if (!bfq_gt(entry->start, st->vtime))
> +			first = entry;
> +
> +		BUG_ON(bfq_gt(entry->min_start, st->vtime));
> +
> +		if (node->rb_left != NULL) {
> +			entry = rb_entry(node->rb_left,
> +					 struct io_entity, rb_node);
> +			if (!bfq_gt(entry->min_start, st->vtime)) {
> +				node = node->rb_left;
> +				goto left;
> +			}
> +		}
> +		if (first != NULL)
> +			break;
> +		node = node->rb_right;

Please help me understand this, we sort the tree by finish time, but
search by vtime, start_time. The worst case could easily be O(N),
right?

> +	}
> +
> +	BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
> +	return first;
> +}
> +
> +/**
> + * __bfq_lookup_next_entity - return the first eligible entity in @st.
> + * @st: the service tree.
> + *
> + * Update the virtual time in @st and return the first eligible entity
> + * it contains.
> + */
> +static struct io_entity *__bfq_lookup_next_entity(struct io_service_tree *st)
> +{
> +	struct io_entity *entity;
> +
> +	if (RB_EMPTY_ROOT(&st->active))
> +		return NULL;
> +
> +	bfq_update_vtime(st);
> +	entity = bfq_first_active_entity(st);
> +	BUG_ON(bfq_gt(entity->start, st->vtime));
> +
> +	return entity;
> +}
> +
> +/**
> + * bfq_lookup_next_entity - return the first eligible entity in @sd.
> + * @sd: the sched_data.
> + * @extract: if true the returned entity will be also extracted from @sd.
> + *
> + * NOTE: since we cache the next_active entity at each level of the
> + * hierarchy, the complexity of the lookup can be decreased with
> + * absolutely no effort just returning the cached next_active value;
> + * we prefer to do full lookups to test the consistency of * the data
> + * structures.
> + */
> +struct io_entity *bfq_lookup_next_entity(struct io_sched_data *sd,
> +						 int extract)
> +{
> +	struct io_service_tree *st = sd->service_tree;
> +	struct io_entity *entity;
> +	int i;
> +
> +	/*
> +	 * We should not call lookup when an entity is active, as doing lookup
> +	 * can result in an erroneous vtime jump.
> +	 */
> +	BUG_ON(sd->active_entity != NULL);
> +
> +	for (i = 0; i < IO_IOPRIO_CLASSES; i++, st++) {
> +		entity = __bfq_lookup_next_entity(st);
> +		if (entity != NULL) {
> +			if (extract) {
> +				bfq_active_extract(st, entity);
> +				sd->active_entity = entity;
> +			}
> +			break;
> +		}
> +	}
> +
> +	return entity;
> +}
> +
> +void entity_served(struct io_entity *entity, bfq_service_t served)
> +{
> +	struct io_service_tree *st;
> +
> +	st = io_entity_service_tree(entity);
> +	entity->service += served;
> +	BUG_ON(st->wsum == 0);
> +	st->vtime += bfq_delta(served, st->wsum);
> +	bfq_forget_idle(st);

Forget idle checks to see if the st->vtime > first_idle->finish, if so
it pushes the first_idle to later, right?

> +}
> +
> +/**
> + * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
> + * @st: the service tree being flushed.
> + */
> +void io_flush_idle_tree(struct io_service_tree *st)
> +{
> +	struct io_entity *entity = st->first_idle;
> +
> +	for (; entity != NULL; entity = st->first_idle)
> +		__bfq_deactivate_entity(entity, 0);
> +}
> +
> +/* Elevator fair queuing function */
> +struct io_queue *rq_ioq(struct request *rq)
> +{
> +	return rq->ioq;
> +}
> +
> +static inline struct io_queue *elv_active_ioq(struct elevator_queue *e)
> +{
> +	return e->efqd.active_queue;
> +}
> +
> +void *elv_active_sched_queue(struct elevator_queue *e)
> +{
> +	return ioq_sched_queue(elv_active_ioq(e));
> +}
> +EXPORT_SYMBOL(elv_active_sched_queue);
> +
> +int elv_nr_busy_ioq(struct elevator_queue *e)
> +{
> +	return e->efqd.busy_queues;
> +}
> +EXPORT_SYMBOL(elv_nr_busy_ioq);
> +
> +int elv_hw_tag(struct elevator_queue *e)
> +{
> +	return e->efqd.hw_tag;
> +}
> +EXPORT_SYMBOL(elv_hw_tag);
> +
> +/* Helper functions for operating on elevator idle slice timer */
> +int elv_mod_idle_slice_timer(struct elevator_queue *eq, unsigned long expires)
> +{
> +	struct elv_fq_data *efqd = &eq->efqd;
> +
> +	return mod_timer(&efqd->idle_slice_timer, expires);
> +}
> +EXPORT_SYMBOL(elv_mod_idle_slice_timer);
> +
> +int elv_del_idle_slice_timer(struct elevator_queue *eq)
> +{
> +	struct elv_fq_data *efqd = &eq->efqd;
> +
> +	return del_timer(&efqd->idle_slice_timer);
> +}
> +EXPORT_SYMBOL(elv_del_idle_slice_timer);
> +
> +unsigned int elv_get_slice_idle(struct elevator_queue *eq)
> +{
> +	return eq->efqd.elv_slice_idle;
> +}
> +EXPORT_SYMBOL(elv_get_slice_idle);
> +
> +void elv_ioq_served(struct io_queue *ioq, bfq_service_t served)
> +{
> +	entity_served(&ioq->entity, served);
> +}
> +
> +/* Tells whether ioq is queued in root group or not */
> +static inline int is_root_group_ioq(struct request_queue *q,
> +					struct io_queue *ioq)
> +{
> +	struct elv_fq_data *efqd = &q->elevator->efqd;
> +
> +	return (ioq->entity.sched_data == &efqd->root_group->sched_data);
> +}
> +
> +/*
> + * sysfs parts below -->
> + */
> +static ssize_t
> +elv_var_show(unsigned int var, char *page)
> +{
> +	return sprintf(page, "%d\n", var);
> +}
> +
> +static ssize_t
> +elv_var_store(unsigned int *var, const char *page, size_t count)
> +{
> +	char *p = (char *) page;
> +
> +	*var = simple_strtoul(p, &p, 10);
> +	return count;
> +}
> +
> +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
> +ssize_t __FUNC(struct elevator_queue *e, char *page)		\
> +{									\
> +	struct elv_fq_data *efqd = &e->efqd;				\
> +	unsigned int __data = __VAR;					\
> +	if (__CONV)							\
> +		__data = jiffies_to_msecs(__data);			\
> +	return elv_var_show(__data, (page));				\
> +}
> +SHOW_FUNCTION(elv_slice_idle_show, efqd->elv_slice_idle, 1);
> +EXPORT_SYMBOL(elv_slice_idle_show);
> +SHOW_FUNCTION(elv_slice_sync_show, efqd->elv_slice[1], 1);
> +EXPORT_SYMBOL(elv_slice_sync_show);
> +SHOW_FUNCTION(elv_slice_async_show, efqd->elv_slice[0], 1);
> +EXPORT_SYMBOL(elv_slice_async_show);
> +#undef SHOW_FUNCTION
> +
> +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
> +ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
> +{									\
> +	struct elv_fq_data *efqd = &e->efqd;				\
> +	unsigned int __data;						\
> +	int ret = elv_var_store(&__data, (page), count);		\
> +	if (__data < (MIN))						\
> +		__data = (MIN);						\
> +	else if (__data > (MAX))					\
> +		__data = (MAX);						\
> +	if (__CONV)							\
> +		*(__PTR) = msecs_to_jiffies(__data);			\
> +	else								\
> +		*(__PTR) = __data;					\
> +	return ret;							\
> +}
> +STORE_FUNCTION(elv_slice_idle_store, &efqd->elv_slice_idle, 0, UINT_MAX, 1);
> +EXPORT_SYMBOL(elv_slice_idle_store);
> +STORE_FUNCTION(elv_slice_sync_store, &efqd->elv_slice[1], 1, UINT_MAX, 1);
> +EXPORT_SYMBOL(elv_slice_sync_store);
> +STORE_FUNCTION(elv_slice_async_store, &efqd->elv_slice[0], 1, UINT_MAX, 1);
> +EXPORT_SYMBOL(elv_slice_async_store);
> +#undef STORE_FUNCTION
> +
> +void elv_schedule_dispatch(struct request_queue *q)
> +{
> +	struct elv_fq_data *efqd = &q->elevator->efqd;
> +
> +	if (elv_nr_busy_ioq(q->elevator)) {
> +		elv_log(efqd, "schedule dispatch");
> +		kblockd_schedule_work(efqd->queue, &efqd->unplug_work);
> +	}
> +}
> +EXPORT_SYMBOL(elv_schedule_dispatch);
> +
> +void elv_kick_queue(struct work_struct *work)
> +{
> +	struct elv_fq_data *efqd =
> +		container_of(work, struct elv_fq_data, unplug_work);
> +	struct request_queue *q = efqd->queue;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(q->queue_lock, flags);
> +	blk_start_queueing(q);
> +	spin_unlock_irqrestore(q->queue_lock, flags);
> +}
> +
> +void elv_shutdown_timer_wq(struct elevator_queue *e)
> +{
> +	del_timer_sync(&e->efqd.idle_slice_timer);
> +	cancel_work_sync(&e->efqd.unplug_work);
> +}
> +EXPORT_SYMBOL(elv_shutdown_timer_wq);
> +
> +void elv_ioq_set_prio_slice(struct request_queue *q, struct io_queue *ioq)
> +{
> +	struct elv_fq_data *efqd = &q->elevator->efqd;
> +
> +	ioq->slice_end = jiffies + ioq->entity.budget;
> +	elv_log_ioq(efqd, ioq, "set_slice=%lu", ioq->entity.budget);
> +}
> +
> +static void elv_ioq_update_io_thinktime(struct io_queue *ioq)
> +{
> +	struct elv_fq_data *efqd = ioq->efqd;
> +	unsigned long elapsed = jiffies - ioq->last_end_request;
> +	unsigned long ttime = min(elapsed, 2UL * efqd->elv_slice_idle);
> +
> +	ioq->ttime_samples = (7*ioq->ttime_samples + 256) / 8;
> +	ioq->ttime_total = (7*ioq->ttime_total + 256*ttime) / 8;
> +	ioq->ttime_mean = (ioq->ttime_total + 128) / ioq->ttime_samples;
> +}

Not sure I understand the magical 7, 8, 2, 128 and 256. Please help me
understand the algorithm.

> +
> +/*
> + * Disable idle window if the process thinks too long.
> + * This idle flag can also be updated by io scheduler.
> + */
> +static void elv_ioq_update_idle_window(struct elevator_queue *eq,
> +				struct io_queue *ioq, struct request *rq)
> +{
> +	int old_idle, enable_idle;
> +	struct elv_fq_data *efqd = ioq->efqd;
> +
> +	/*
> +	 * Don't idle for async or idle io prio class
> +	 */
> +	if (!elv_ioq_sync(ioq) || elv_ioq_class_idle(ioq))
> +		return;
> +
> +	enable_idle = old_idle = elv_ioq_idle_window(ioq);
> +
> +	if (!efqd->elv_slice_idle)
> +		enable_idle = 0;
> +	else if (ioq_sample_valid(ioq->ttime_samples)) {
> +		if (ioq->ttime_mean > efqd->elv_slice_idle)
> +			enable_idle = 0;
> +		else
> +			enable_idle = 1;
> +	}
> +
> +	/*
> +	 * From think time perspective idle should be enabled. Check with
> +	 * io scheduler if it wants to disable idling based on additional
> +	 * considrations like seek pattern.
> +	 */
> +	if (enable_idle) {
> +		if (eq->ops->elevator_update_idle_window_fn)
> +			enable_idle = eq->ops->elevator_update_idle_window_fn(
> +						eq, ioq->sched_queue, rq);
> +		if (!enable_idle)
> +			elv_log_ioq(efqd, ioq, "iosched disabled idle");
> +	}
> +
> +	if (old_idle != enable_idle) {
> +		elv_log_ioq(efqd, ioq, "idle=%d", enable_idle);
> +		if (enable_idle)
> +			elv_mark_ioq_idle_window(ioq);
> +		else
> +			elv_clear_ioq_idle_window(ioq);
> +	}
> +}
> +
> +struct io_queue *elv_alloc_ioq(struct request_queue *q, gfp_t gfp_mask)
> +{
> +	struct io_queue *ioq = NULL;
> +
> +	ioq = kmem_cache_alloc_node(elv_ioq_pool, gfp_mask, q->node);
> +	return ioq;
> +}
> +EXPORT_SYMBOL(elv_alloc_ioq);
> +
> +void elv_free_ioq(struct io_queue *ioq)
> +{
> +	kmem_cache_free(elv_ioq_pool, ioq);
> +}
> +EXPORT_SYMBOL(elv_free_ioq);
> +
> +int elv_init_ioq(struct elevator_queue *eq, struct io_queue *ioq,
> +			void *sched_queue, int ioprio_class, int ioprio,
> +			int is_sync)
> +{
> +	struct elv_fq_data *efqd = &eq->efqd;
> +	struct io_group *iog = io_lookup_io_group_current(efqd->queue);
> +
> +	RB_CLEAR_NODE(&ioq->entity.rb_node);
> +	atomic_set(&ioq->ref, 0);
> +	ioq->efqd = efqd;
> +	elv_ioq_set_ioprio_class(ioq, ioprio_class);
> +	elv_ioq_set_ioprio(ioq, ioprio);
> +	ioq->pid = current->pid;

Is pid used for cgroup association later? I don't see why we save the
pid otherwise? If yes, why not store the cgroup of the current->pid?

> +	ioq->sched_queue = sched_queue;
> +	if (is_sync && !elv_ioq_class_idle(ioq))
> +		elv_mark_ioq_idle_window(ioq);
> +	bfq_init_entity(&ioq->entity, iog);
> +	ioq->entity.budget = elv_prio_to_slice(efqd, ioq);
> +	if (is_sync)
> +		ioq->last_end_request = jiffies;
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL(elv_init_ioq);
> +
> +void elv_put_ioq(struct io_queue *ioq)
> +{
> +	struct elv_fq_data *efqd = ioq->efqd;
> +	struct elevator_queue *e = container_of(efqd, struct elevator_queue,
> +						efqd);
> +
> +	BUG_ON(atomic_read(&ioq->ref) <= 0);
> +	if (!atomic_dec_and_test(&ioq->ref))
> +		return;
> +	BUG_ON(ioq->nr_queued);
> +	BUG_ON(ioq->entity.tree != NULL);
> +	BUG_ON(elv_ioq_busy(ioq));
> +	BUG_ON(efqd->active_queue == ioq);
> +
> +	/* Can be called by outgoing elevator. Don't use q */
> +	BUG_ON(!e->ops->elevator_free_sched_queue_fn);
> +
> +	e->ops->elevator_free_sched_queue_fn(e, ioq->sched_queue);
> +	elv_log_ioq(efqd, ioq, "put_queue");
> +	elv_free_ioq(ioq);
> +}
> +EXPORT_SYMBOL(elv_put_ioq);
> +
> +void elv_release_ioq(struct elevator_queue *e, struct io_queue **ioq_ptr)
> +{
> +	struct io_queue *ioq = *ioq_ptr;
> +
> +	if (ioq != NULL) {
> +		/* Drop the reference taken by the io group */
> +		elv_put_ioq(ioq);
> +		*ioq_ptr = NULL;
> +	}
> +}
> +
> +/*
> + * Normally next io queue to be served is selected from the service tree.
> + * This function allows one to choose a specific io queue to run next
> + * out of order. This is primarily to accomodate the close_cooperator
> + * feature of cfq.
> + *
> + * Currently it is done only for root level as to begin with supporting
> + * close cooperator feature only for root group to make sure default
> + * cfq behavior in flat hierarchy is not changed.
> + */
> +void elv_set_next_ioq(struct request_queue *q, struct io_queue *ioq)
> +{
> +	struct elv_fq_data *efqd = &q->elevator->efqd;
> +	struct io_entity *entity = &ioq->entity;
> +	struct io_sched_data *sd = &efqd->root_group->sched_data;
> +	struct io_service_tree *st = io_entity_service_tree(entity);
> +
> +	BUG_ON(efqd->active_queue != NULL || sd->active_entity != NULL);
> +	BUG_ON(!efqd->busy_queues);
> +	BUG_ON(sd != entity->sched_data);
> +	BUG_ON(!st);
> +
> +	bfq_update_vtime(st);
> +	bfq_active_extract(st, entity);
> +	sd->active_entity = entity;
> +	entity->service = 0;
> +	elv_log_ioq(efqd, ioq, "set_next_ioq");
> +}
> +
> +/* Get next queue for service. */
> +struct io_queue *elv_get_next_ioq(struct request_queue *q, int extract)
> +{
> +	struct elv_fq_data *efqd = &q->elevator->efqd;
> +	struct io_entity *entity = NULL;
> +	struct io_queue *ioq = NULL;
> +	struct io_sched_data *sd;
> +
> +	/*
> +	 * We should not call lookup when an entity is active, as doing
> +	 * lookup can result in an erroneous vtime jump.
> +	 */
> +	BUG_ON(efqd->active_queue != NULL);
> +
> +	if (!efqd->busy_queues)
> +		return NULL;
> +
> +	sd = &efqd->root_group->sched_data;
> +	entity = bfq_lookup_next_entity(sd, 1);
> +
> +	BUG_ON(!entity);
> +	if (extract)
> +		entity->service = 0;
> +	ioq = io_entity_to_ioq(entity);
> +
> +	return ioq;
> +}
> +
> +/*
> + * coop tells that io scheduler selected a queue for us and we did not

coop?

> + * select the next queue based on fairness.
> + */
> +static void __elv_set_active_ioq(struct elv_fq_data *efqd, struct io_queue *ioq,
> +					int coop)
> +{
> +	struct request_queue *q = efqd->queue;
> +
> +	if (ioq) {
> +		elv_log_ioq(efqd, ioq, "set_active, busy=%d",
> +							efqd->busy_queues);
> +		ioq->slice_end = 0;
> +
> +		elv_clear_ioq_wait_request(ioq);
> +		elv_clear_ioq_must_dispatch(ioq);
> +		elv_mark_ioq_slice_new(ioq);
> +
> +		del_timer(&efqd->idle_slice_timer);
> +	}
> +
> +	efqd->active_queue = ioq;
> +
> +	/* Let iosched know if it wants to take some action */
> +	if (ioq) {
> +		if (q->elevator->ops->elevator_active_ioq_set_fn)
> +			q->elevator->ops->elevator_active_ioq_set_fn(q,
> +							ioq->sched_queue, coop);
> +	}
> +}
> +
> +/* Get and set a new active queue for service. */
> +struct io_queue *elv_set_active_ioq(struct request_queue *q,
> +						struct io_queue *ioq)
> +{
> +	struct elv_fq_data *efqd = &q->elevator->efqd;
> +	int coop = 0;
> +
> +	if (!ioq)
> +		ioq = elv_get_next_ioq(q, 1);
> +	else {
> +		elv_set_next_ioq(q, ioq);
> +		/*
> +		 * io scheduler selected the next queue for us. Pass this
> +		 * this info back to io scheudler. cfq currently uses it
> +		 * to reset coop flag on the queue.
> +		 */
> +		coop = 1;
> +	}
> +	__elv_set_active_ioq(efqd, ioq, coop);
> +	return ioq;
> +}
> +
> +void elv_reset_active_ioq(struct elv_fq_data *efqd)
> +{
> +	struct request_queue *q = efqd->queue;
> +	struct io_queue *ioq = elv_active_ioq(efqd->queue->elevator);
> +
> +	if (q->elevator->ops->elevator_active_ioq_reset_fn)
> +		q->elevator->ops->elevator_active_ioq_reset_fn(q,
> +							ioq->sched_queue);
> +	efqd->active_queue = NULL;
> +	del_timer(&efqd->idle_slice_timer);
> +}
> +
> +void elv_activate_ioq(struct io_queue *ioq, int add_front)
> +{
> +	bfq_activate_entity(&ioq->entity, add_front);
> +}
> +
> +void elv_deactivate_ioq(struct elv_fq_data *efqd, struct io_queue *ioq,
> +					int requeue)
> +{
> +	bfq_deactivate_entity(&ioq->entity, requeue);
> +}
> +
> +/* Called when an inactive queue receives a new request. */
> +void elv_add_ioq_busy(struct elv_fq_data *efqd, struct io_queue *ioq)
> +{
> +	BUG_ON(elv_ioq_busy(ioq));
> +	BUG_ON(ioq == efqd->active_queue);
> +	elv_log_ioq(efqd, ioq, "add to busy");
> +	elv_activate_ioq(ioq, 0);
> +	elv_mark_ioq_busy(ioq);
> +	efqd->busy_queues++;
> +	if (elv_ioq_class_rt(ioq)) {
> +		struct io_group *iog = ioq_to_io_group(ioq);
> +		iog->busy_rt_queues++;
> +	}
> +}
> +
> +void elv_del_ioq_busy(struct elevator_queue *e, struct io_queue *ioq,
> +					int requeue)
> +{
> +	struct elv_fq_data *efqd = &e->efqd;
> +
> +	BUG_ON(!elv_ioq_busy(ioq));
> +	BUG_ON(ioq->nr_queued);
> +	elv_log_ioq(efqd, ioq, "del from busy");
> +	elv_clear_ioq_busy(ioq);
> +	BUG_ON(efqd->busy_queues == 0);
> +	efqd->busy_queues--;
> +	if (elv_ioq_class_rt(ioq)) {
> +		struct io_group *iog = ioq_to_io_group(ioq);
> +		iog->busy_rt_queues--;
> +	}
> +
> +	elv_deactivate_ioq(efqd, ioq, requeue);
> +}
> +
> +/*
> + * Do the accounting. Determine how much service (in terms of time slices)
> + * current queue used and adjust the start, finish time of queue and vtime
> + * of the tree accordingly.
> + *
> + * Determining the service used in terms of time is tricky in certain
> + * situations. Especially when underlying device supports command queuing
> + * and requests from multiple queues can be there at same time, then it
> + * is not clear which queue consumed how much of disk time.
> + *
> + * To mitigate this problem, cfq starts the time slice of the queue only
> + * after first request from the queue has completed. This does not work
> + * very well if we expire the queue before we wait for first and more
> + * request to finish from the queue. For seeky queues, we will expire the
> + * queue after dispatching few requests without waiting and start dispatching
> + * from next queue.
> + *
> + * Not sure how to determine the time consumed by queue in such scenarios.
> + * Currently as a crude approximation, we are charging 25% of time slice
> + * for such cases. A better mechanism is needed for accurate accounting.
> + */
> +void __elv_ioq_slice_expired(struct request_queue *q, struct io_queue *ioq)
> +{
> +	struct elv_fq_data *efqd = &q->elevator->efqd;
> +	struct io_entity *entity = &ioq->entity;
> +	long slice_unused = 0, slice_used = 0, slice_overshoot = 0;
> +
> +	assert_spin_locked(q->queue_lock);
> +	elv_log_ioq(efqd, ioq, "slice expired");
> +
> +	if (elv_ioq_wait_request(ioq))
> +		del_timer(&efqd->idle_slice_timer);
> +
> +	elv_clear_ioq_wait_request(ioq);
> +
> +	/*
> +	 * if ioq->slice_end = 0, that means a queue was expired before first
> +	 * reuqest from the queue got completed. Of course we are not planning
> +	 * to idle on the queue otherwise we would not have expired it.
> +	 *
> +	 * Charge for the 25% slice in such cases. This is not the best thing
> +	 * to do but at the same time not very sure what's the next best
> +	 * thing to do.
> +	 *
> +	 * This arises from that fact that we don't have the notion of
> +	 * one queue being operational at one time. io scheduler can dispatch
> +	 * requests from multiple queues in one dispatch round. Ideally for
> +	 * more accurate accounting of exact disk time used by disk, one
> +	 * should dispatch requests from only one queue and wait for all
> +	 * the requests to finish. But this will reduce throughput.
> +	 */
> +	if (!ioq->slice_end)
> +		slice_used = entity->budget/4;
> +	else {
> +		if (time_after(ioq->slice_end, jiffies)) {
> +			slice_unused = ioq->slice_end - jiffies;
> +			if (slice_unused == entity->budget) {
> +				/*
> +				 * queue got expired immediately after
> +				 * completing first request. Charge 25% of
> +				 * slice.
> +				 */
> +				slice_used = entity->budget/4;
> +			} else
> +				slice_used = entity->budget - slice_unused;
> +		} else {
> +			slice_overshoot = jiffies - ioq->slice_end;
> +			slice_used = entity->budget + slice_overshoot;
> +		}
> +	}
> +
> +	elv_log_ioq(efqd, ioq, "sl_end=%lx, jiffies=%lx", ioq->slice_end,
> +			jiffies);
> +	elv_log_ioq(efqd, ioq, "sl_used=%ld, budget=%ld overshoot=%ld",
> +				slice_used, entity->budget, slice_overshoot);
> +	elv_ioq_served(ioq, slice_used);
> +
> +	BUG_ON(ioq != efqd->active_queue);
> +	elv_reset_active_ioq(efqd);
> +
> +	if (!ioq->nr_queued)
> +		elv_del_ioq_busy(q->elevator, ioq, 1);
> +	else
> +		elv_activate_ioq(ioq, 0);
> +}
> +EXPORT_SYMBOL(__elv_ioq_slice_expired);
> +
> +/*
> + *  Expire the ioq.
> + */
> +void elv_ioq_slice_expired(struct request_queue *q)
> +{
> +	struct io_queue *ioq = elv_active_ioq(q->elevator);
> +
> +	if (ioq)
> +		__elv_ioq_slice_expired(q, ioq);
> +}
> +
> +/*
> + * Check if new_cfqq should preempt the currently active queue. Return 0 for
> + * no or if we aren't sure, a 1 will cause a preemption attempt.
> + */
> +int elv_should_preempt(struct request_queue *q, struct io_queue *new_ioq,
> +			struct request *rq)
> +{
> +	struct io_queue *ioq;
> +	struct elevator_queue *eq = q->elevator;
> +	struct io_entity *entity, *new_entity;
> +
> +	ioq = elv_active_ioq(eq);
> +
> +	if (!ioq)
> +		return 0;
> +
> +	entity = &ioq->entity;
> +	new_entity = &new_ioq->entity;
> +
> +	/*
> +	 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
> +	 */
> +
> +	if (new_entity->ioprio_class == IOPRIO_CLASS_RT
> +	    && entity->ioprio_class != IOPRIO_CLASS_RT)
> +		return 1;
> +	/*
> +	 * Allow an BE request to pre-empt an ongoing IDLE clas timeslice.
> +	 */
> +
> +	if (new_entity->ioprio_class == IOPRIO_CLASS_BE
> +	    && entity->ioprio_class == IOPRIO_CLASS_IDLE)
> +		return 1;
> +
> +	/*
> +	 * Check with io scheduler if it has additional criterion based on
> +	 * which it wants to preempt existing queue.
> +	 */
> +	if (eq->ops->elevator_should_preempt_fn)
> +		return eq->ops->elevator_should_preempt_fn(q,
> +						ioq_sched_queue(new_ioq), rq);
> +
> +	return 0;
> +}
> +
> +static void elv_preempt_queue(struct request_queue *q, struct io_queue *ioq)
> +{
> +	elv_log_ioq(&q->elevator->efqd, ioq, "preempt");
> +	elv_ioq_slice_expired(q);
> +
> +	/*
> +	 * Put the new queue at the front of the of the current list,
> +	 * so we know that it will be selected next.
> +	 */
> +
> +	elv_activate_ioq(ioq, 1);
> +	elv_ioq_set_slice_end(ioq, 0);
> +	elv_mark_ioq_slice_new(ioq);
> +}
> +
> +void elv_ioq_request_add(struct request_queue *q, struct request *rq)
> +{
> +	struct elv_fq_data *efqd = &q->elevator->efqd;
> +	struct io_queue *ioq = rq->ioq;
> +
> +	if (!elv_iosched_fair_queuing_enabled(q->elevator))
> +		return;
> +
> +	BUG_ON(!efqd);
> +	BUG_ON(!ioq);
> +	efqd->rq_queued++;
> +	ioq->nr_queued++;
> +
> +	if (!elv_ioq_busy(ioq))
> +		elv_add_ioq_busy(efqd, ioq);
> +
> +	elv_ioq_update_io_thinktime(ioq);
> +	elv_ioq_update_idle_window(q->elevator, ioq, rq);
> +
> +	if (ioq == elv_active_ioq(q->elevator)) {
> +		/*
> +		 * Remember that we saw a request from this process, but
> +		 * don't start queuing just yet. Otherwise we risk seeing lots
> +		 * of tiny requests, because we disrupt the normal plugging
> +		 * and merging. If the request is already larger than a single
> +		 * page, let it rip immediately. For that case we assume that
> +		 * merging is already done. Ditto for a busy system that
> +		 * has other work pending, don't risk delaying until the
> +		 * idle timer unplug to continue working.
> +		 */
> +		if (elv_ioq_wait_request(ioq)) {
> +			if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
> +			    efqd->busy_queues > 1) {
> +				del_timer(&efqd->idle_slice_timer);
> +				blk_start_queueing(q);
> +			}
> +			elv_mark_ioq_must_dispatch(ioq);
> +		}
> +	} else if (elv_should_preempt(q, ioq, rq)) {
> +		/*
> +		 * not the active queue - expire current slice if it is
> +		 * idle and has expired it's mean thinktime or this new queue
> +		 * has some old slice time left and is of higher priority or
> +		 * this new queue is RT and the current one is BE
> +		 */
> +		elv_preempt_queue(q, ioq);
> +		blk_start_queueing(q);
> +	}
> +}
> +
> +void elv_idle_slice_timer(unsigned long data)
> +{
> +	struct elv_fq_data *efqd = (struct elv_fq_data *)data;
> +	struct io_queue *ioq;
> +	unsigned long flags;
> +	struct request_queue *q = efqd->queue;
> +
> +	elv_log(efqd, "idle timer fired");
> +
> +	spin_lock_irqsave(q->queue_lock, flags);
> +
> +	ioq = efqd->active_queue;
> +
> +	if (ioq) {
> +
> +		/*
> +		 * We saw a request before the queue expired, let it through
> +		 */
> +		if (elv_ioq_must_dispatch(ioq))
> +			goto out_kick;
> +
> +		/*
> +		 * expired
> +		 */
> +		if (elv_ioq_slice_used(ioq))
> +			goto expire;
> +
> +		/*
> +		 * only expire and reinvoke request handler, if there are
> +		 * other queues with pending requests
> +		 */
> +		if (!elv_nr_busy_ioq(q->elevator))
> +			goto out_cont;
> +
> +		/*
> +		 * not expired and it has a request pending, let it dispatch
> +		 */
> +		if (ioq->nr_queued)
> +			goto out_kick;
> +	}
> +expire:
> +	elv_ioq_slice_expired(q);
> +out_kick:
> +	elv_schedule_dispatch(q);
> +out_cont:
> +	spin_unlock_irqrestore(q->queue_lock, flags);
> +}
> +
> +void elv_ioq_arm_slice_timer(struct request_queue *q)
> +{
> +	struct elv_fq_data *efqd = &q->elevator->efqd;
> +	struct io_queue *ioq = elv_active_ioq(q->elevator);
> +	unsigned long sl;
> +
> +	BUG_ON(!ioq);
> +
> +	/*
> +	 * SSD device without seek penalty, disable idling. But only do so
> +	 * for devices that support queuing, otherwise we still have a problem
> +	 * with sync vs async workloads.
> +	 */
> +	if (blk_queue_nonrot(q) && efqd->hw_tag)
> +		return;
> +
> +	/*
> +	 * still requests with the driver, don't idle
> +	 */
> +	if (efqd->rq_in_driver)
> +		return;
> +
> +	/*
> +	 * idle is disabled, either manually or by past process history
> +	 */
> +	if (!efqd->elv_slice_idle || !elv_ioq_idle_window(ioq))
> +		return;
> +
> +	/*
> +	 * may be iosched got its own idling logic. In that case io
> +	 * schduler will take care of arming the timer, if need be.
> +	 */
> +	if (q->elevator->ops->elevator_arm_slice_timer_fn) {
> +		q->elevator->ops->elevator_arm_slice_timer_fn(q,
> +						ioq->sched_queue);
> +	} else {
> +		elv_mark_ioq_wait_request(ioq);
> +		sl = efqd->elv_slice_idle;
> +		mod_timer(&efqd->idle_slice_timer, jiffies + sl);
> +		elv_log_ioq(efqd, ioq, "arm idle: %lu", sl);
> +	}
> +}
> +
> +/* Common layer function to select the next queue to dispatch from */
> +void *elv_fq_select_ioq(struct request_queue *q, int force)
> +{
> +	struct elv_fq_data *efqd = &q->elevator->efqd;
> +	struct io_queue *new_ioq = NULL, *ioq = elv_active_ioq(q->elevator);
> +	struct io_group *iog;
> +
> +	if (!elv_nr_busy_ioq(q->elevator))
> +		return NULL;
> +
> +	if (ioq == NULL)
> +		goto new_queue;
> +
> +	/*
> +	 * Force dispatch. Continue to dispatch from current queue as long
> +	 * as it has requests.
> +	 */
> +	if (unlikely(force)) {
> +		if (ioq->nr_queued)
> +			goto keep_queue;
> +		else
> +			goto expire;
> +	}
> +
> +	/*
> +	 * The active queue has run out of time, expire it and select new.
> +	 */
> +	if (elv_ioq_slice_used(ioq) && !elv_ioq_must_dispatch(ioq))
> +		goto expire;
> +
> +	/*
> +	 * If we have a RT cfqq waiting, then we pre-empt the current non-rt
> +	 * cfqq.
> +	 */
> +	iog = ioq_to_io_group(ioq);
> +
> +	if (!elv_ioq_class_rt(ioq) && iog->busy_rt_queues) {
> +		/*
> +		 * We simulate this as cfqq timed out so that it gets to bank
> +		 * the remaining of its time slice.
> +		 */
> +		elv_log_ioq(efqd, ioq, "preempt");
> +		goto expire;
> +	}
> +
> +	/*
> +	 * The active queue has requests and isn't expired, allow it to
> +	 * dispatch.
> +	 */
> +
> +	if (ioq->nr_queued)
> +		goto keep_queue;
> +
> +	/*
> +	 * If another queue has a request waiting within our mean seek
> +	 * distance, let it run.  The expire code will check for close
> +	 * cooperators and put the close queue at the front of the service
> +	 * tree.
> +	 */
> +	new_ioq = elv_close_cooperator(q, ioq, 0);
> +	if (new_ioq)
> +		goto expire;
> +
> +	/*
> +	 * No requests pending. If the active queue still has requests in
> +	 * flight or is idling for a new request, allow either of these
> +	 * conditions to happen (or time out) before selecting a new queue.
> +	 */
> +
> +	if (timer_pending(&efqd->idle_slice_timer) ||
> +	    (elv_ioq_nr_dispatched(ioq) && elv_ioq_idle_window(ioq))) {
> +		ioq = NULL;
> +		goto keep_queue;
> +	}
> +
> +expire:
> +	elv_ioq_slice_expired(q);
> +new_queue:
> +	ioq = elv_set_active_ioq(q, new_ioq);
> +keep_queue:
> +	return ioq;
> +}
> +
> +/* A request got removed from io_queue. Do the accounting */
> +void elv_ioq_request_removed(struct elevator_queue *e, struct request *rq)
> +{
> +	struct io_queue *ioq;
> +	struct elv_fq_data *efqd;
> +
> +	if (!elv_iosched_fair_queuing_enabled(e))
> +		return;
> +
> +	ioq = rq->ioq;
> +	BUG_ON(!ioq);
> +	ioq->nr_queued--;
> +
> +	efqd = ioq->efqd;
> +	BUG_ON(!efqd);
> +	efqd->rq_queued--;
> +
> +	if (elv_ioq_busy(ioq) && (elv_active_ioq(e) != ioq) && !ioq->nr_queued)
> +		elv_del_ioq_busy(e, ioq, 1);
> +}
> +
> +/* A request got dispatched. Do the accounting. */
> +void elv_fq_dispatched_request(struct elevator_queue *e, struct request *rq)
> +{
> +	struct io_queue *ioq = rq->ioq;
> +
> +	if (!elv_iosched_fair_queuing_enabled(e))
> +		return;
> +
> +	BUG_ON(!ioq);
> +	elv_ioq_request_dispatched(ioq);
> +	elv_ioq_request_removed(e, rq);
> +	elv_clear_ioq_must_dispatch(ioq);
> +}
> +
> +void elv_fq_activate_rq(struct request_queue *q, struct request *rq)
> +{
> +	struct elv_fq_data *efqd = &q->elevator->efqd;
> +
> +	if (!elv_iosched_fair_queuing_enabled(q->elevator))
> +		return;
> +
> +	efqd->rq_in_driver++;
> +	elv_log_ioq(efqd, rq_ioq(rq), "activate rq, drv=%d",
> +						efqd->rq_in_driver);
> +}
> +
> +void elv_fq_deactivate_rq(struct request_queue *q, struct request *rq)
> +{
> +	struct elv_fq_data *efqd = &q->elevator->efqd;
> +
> +	if (!elv_iosched_fair_queuing_enabled(q->elevator))
> +		return;
> +
> +	WARN_ON(!efqd->rq_in_driver);
> +	efqd->rq_in_driver--;
> +	elv_log_ioq(efqd, rq_ioq(rq), "deactivate rq, drv=%d",
> +						efqd->rq_in_driver);
> +}
> +
> +/*
> + * Update hw_tag based on peak queue depth over 50 samples under
> + * sufficient load.
> + */
> +static void elv_update_hw_tag(struct elv_fq_data *efqd)
> +{
> +	if (efqd->rq_in_driver > efqd->rq_in_driver_peak)
> +		efqd->rq_in_driver_peak = efqd->rq_in_driver;
> +
> +	if (efqd->rq_queued <= ELV_HW_QUEUE_MIN &&
> +	    efqd->rq_in_driver <= ELV_HW_QUEUE_MIN)
> +		return;
> +
> +	if (efqd->hw_tag_samples++ < 50)
> +		return;
> +
> +	if (efqd->rq_in_driver_peak >= ELV_HW_QUEUE_MIN)
> +		efqd->hw_tag = 1;
> +	else
> +		efqd->hw_tag = 0;
> +
> +	efqd->hw_tag_samples = 0;
> +	efqd->rq_in_driver_peak = 0;
> +}
> +
> +/*
> + * If ioscheduler has functionality of keeping track of close cooperator, check
> + * with it if it has got a closely co-operating queue.
> + */
> +static inline struct io_queue *elv_close_cooperator(struct request_queue *q,
> +					struct io_queue *ioq, int probe)
> +{
> +	struct elevator_queue *e = q->elevator;
> +	struct io_queue *new_ioq = NULL;
> +
> +	/*
> +	 * Currently this feature is supported only for flat hierarchy or
> +	 * root group queues so that default cfq behavior is not changed.
> +	 */
> +	if (!is_root_group_ioq(q, ioq))
> +		return NULL;
> +
> +	if (q->elevator->ops->elevator_close_cooperator_fn)
> +		new_ioq = e->ops->elevator_close_cooperator_fn(q,
> +						ioq->sched_queue, probe);
> +
> +	/* Only select co-operating queue if it belongs to root group */
> +	if (new_ioq && !is_root_group_ioq(q, new_ioq))
> +		return NULL;
> +
> +	return new_ioq;
> +}
> +
> +/* A request got completed from io_queue. Do the accounting. */
> +void elv_ioq_completed_request(struct request_queue *q, struct request *rq)
> +{
> +	const int sync = rq_is_sync(rq);
> +	struct io_queue *ioq;
> +	struct elv_fq_data *efqd = &q->elevator->efqd;
> +
> +	if (!elv_iosched_fair_queuing_enabled(q->elevator))
> +		return;
> +
> +	ioq = rq->ioq;
> +
> +	elv_log_ioq(efqd, ioq, "complete");
> +
> +	elv_update_hw_tag(efqd);
> +
> +	WARN_ON(!efqd->rq_in_driver);
> +	WARN_ON(!ioq->dispatched);
> +	efqd->rq_in_driver--;
> +	ioq->dispatched--;
> +
> +	if (sync)
> +		ioq->last_end_request = jiffies;
> +
> +	/*
> +	 * If this is the active queue, check if it needs to be expired,
> +	 * or if we want to idle in case it has no pending requests.
> +	 */
> +
> +	if (elv_active_ioq(q->elevator) == ioq) {
> +		if (elv_ioq_slice_new(ioq)) {
> +			elv_ioq_set_prio_slice(q, ioq);
> +			elv_clear_ioq_slice_new(ioq);
> +		}
> +		/*
> +		 * If there are no requests waiting in this queue, and
> +		 * there are other queues ready to issue requests, AND
> +		 * those other queues are issuing requests within our
> +		 * mean seek distance, give them a chance to run instead
> +		 * of idling.
> +		 */
> +		if (elv_ioq_slice_used(ioq) || elv_ioq_class_idle(ioq))
> +			elv_ioq_slice_expired(q);
> +		else if (!ioq->nr_queued && !elv_close_cooperator(q, ioq, 1)
> +			 && sync && !rq_noidle(rq))
> +			elv_ioq_arm_slice_timer(q);
> +	}
> +
> +	if (!efqd->rq_in_driver)
> +		elv_schedule_dispatch(q);
> +}
> +
> +struct io_group *io_lookup_io_group_current(struct request_queue *q)
> +{
> +	struct elv_fq_data *efqd = &q->elevator->efqd;
> +
> +	return efqd->root_group;
> +}
> +EXPORT_SYMBOL(io_lookup_io_group_current);
> +
> +void *io_group_async_queue_prio(struct io_group *iog, int ioprio_class,
> +					int ioprio)
> +{
> +	struct io_queue *ioq = NULL;
> +
> +	switch (ioprio_class) {
> +	case IOPRIO_CLASS_RT:
> +		ioq = iog->async_queue[0][ioprio];
> +		break;
> +	case IOPRIO_CLASS_BE:
> +		ioq = iog->async_queue[1][ioprio];
> +		break;
> +	case IOPRIO_CLASS_IDLE:
> +		ioq = iog->async_idle_queue;
> +		break;
> +	default:
> +		BUG();
> +	}
> +
> +	if (ioq)
> +		return ioq->sched_queue;
> +	return NULL;
> +}
> +EXPORT_SYMBOL(io_group_async_queue_prio);
> +
> +void io_group_set_async_queue(struct io_group *iog, int ioprio_class,
> +					int ioprio, struct io_queue *ioq)
> +{
> +	switch (ioprio_class) {
> +	case IOPRIO_CLASS_RT:
> +		iog->async_queue[0][ioprio] = ioq;
> +		break;
> +	case IOPRIO_CLASS_BE:
> +		iog->async_queue[1][ioprio] = ioq;
> +		break;
> +	case IOPRIO_CLASS_IDLE:
> +		iog->async_idle_queue = ioq;
> +		break;
> +	default:
> +		BUG();
> +	}
> +
> +	/*
> +	 * Take the group reference and pin the queue. Group exit will
> +	 * clean it up
> +	 */
> +	elv_get_ioq(ioq);
> +}
> +EXPORT_SYMBOL(io_group_set_async_queue);
> +
> +/*
> + * Release all the io group references to its async queues.
> + */
> +void io_put_io_group_queues(struct elevator_queue *e, struct io_group *iog)
> +{
> +	int i, j;
> +
> +	for (i = 0; i < 2; i++)
> +		for (j = 0; j < IOPRIO_BE_NR; j++)
> +			elv_release_ioq(e, &iog->async_queue[i][j]);
> +
> +	/* Free up async idle queue */
> +	elv_release_ioq(e, &iog->async_idle_queue);
> +}
> +
> +struct io_group *io_alloc_root_group(struct request_queue *q,
> +					struct elevator_queue *e, void *key)
> +{
> +	struct io_group *iog;
> +	int i;
> +
> +	iog = kmalloc_node(sizeof(*iog), GFP_KERNEL | __GFP_ZERO, q->node);
> +	if (iog == NULL)
> +		return NULL;
> +
> +	for (i = 0; i < IO_IOPRIO_CLASSES; i++)
> +		iog->sched_data.service_tree[i] = IO_SERVICE_TREE_INIT;
> +
> +	return iog;
> +}
> +
> +void io_free_root_group(struct elevator_queue *e)
> +{
> +	struct io_group *iog = e->efqd.root_group;
> +	struct io_service_tree *st;
> +	int i;
> +
> +	for (i = 0; i < IO_IOPRIO_CLASSES; i++) {
> +		st = iog->sched_data.service_tree + i;
> +		io_flush_idle_tree(st);
> +	}
> +
> +	io_put_io_group_queues(e, iog);
> +	kfree(iog);
> +}
> +
> +static void elv_slab_kill(void)
> +{
> +	/*
> +	 * Caller already ensured that pending RCU callbacks are completed,
> +	 * so we should have no busy allocations at this point.
> +	 */
> +	if (elv_ioq_pool)
> +		kmem_cache_destroy(elv_ioq_pool);
> +}
> +
> +static int __init elv_slab_setup(void)
> +{
> +	elv_ioq_pool = KMEM_CACHE(io_queue, 0);
> +	if (!elv_ioq_pool)
> +		goto fail;
> +
> +	return 0;
> +fail:
> +	elv_slab_kill();
> +	return -ENOMEM;
> +}
> +
> +/* Initialize fair queueing data associated with elevator */
> +int elv_init_fq_data(struct request_queue *q, struct elevator_queue *e)
> +{
> +	struct io_group *iog;
> +	struct elv_fq_data *efqd = &e->efqd;
> +
> +	if (!elv_iosched_fair_queuing_enabled(e))
> +		return 0;
> +
> +	iog = io_alloc_root_group(q, e, efqd);
> +	if (iog == NULL)
> +		return 1;
> +
> +	efqd->root_group = iog;
> +	efqd->queue = q;
> +
> +	init_timer(&efqd->idle_slice_timer);
> +	efqd->idle_slice_timer.function = elv_idle_slice_timer;
> +	efqd->idle_slice_timer.data = (unsigned long) efqd;
> +
> +	INIT_WORK(&efqd->unplug_work, elv_kick_queue);
> +
> +	efqd->elv_slice[0] = elv_slice_async;
> +	efqd->elv_slice[1] = elv_slice_sync;
> +	efqd->elv_slice_idle = elv_slice_idle;
> +	efqd->hw_tag = 1;
> +
> +	return 0;
> +}
> +
> +/*
> + * elv_exit_fq_data is called before we call elevator_exit_fn. Before
> + * we ask elevator to cleanup its queues, we do the cleanup here so
> + * that all the group and idle tree references to ioq are dropped. Later
> + * during elevator cleanup, ioc reference will be dropped which will lead
> + * to removal of ioscheduler queue as well as associated ioq object.
> + */
> +void elv_exit_fq_data(struct elevator_queue *e)
> +{
> +	struct elv_fq_data *efqd = &e->efqd;
> +
> +	if (!elv_iosched_fair_queuing_enabled(e))
> +		return;
> +
> +	elv_shutdown_timer_wq(e);
> +
> +	BUG_ON(timer_pending(&efqd->idle_slice_timer));
> +	io_free_root_group(e);
> +}
> +
> +/*
> + * This is called after the io scheduler has cleaned up its data structres.
> + * I don't think that this function is required. Right now just keeping it
> + * because cfq cleans up timer and work queue again after freeing up
> + * io contexts. To me io scheduler has already been drained out, and all
> + * the active queue have already been expired so time and work queue should
> + * not been activated during cleanup process.
> + *
> + * Keeping it here for the time being. Will get rid of it later.
> + */
> +void elv_exit_fq_data_post(struct elevator_queue *e)
> +{
> +	struct elv_fq_data *efqd = &e->efqd;
> +
> +	if (!elv_iosched_fair_queuing_enabled(e))
> +		return;
> +
> +	elv_shutdown_timer_wq(e);
> +	BUG_ON(timer_pending(&efqd->idle_slice_timer));
> +}
> +
> +
> +static int __init elv_fq_init(void)
> +{
> +	if (elv_slab_setup())
> +		return -ENOMEM;
> +
> +	/* could be 0 on HZ < 1000 setups */
> +
> +	if (!elv_slice_async)
> +		elv_slice_async = 1;
> +
> +	if (!elv_slice_idle)
> +		elv_slice_idle = 1;
> +
> +	return 0;
> +}
> +
> +module_init(elv_fq_init);
> diff --git a/block/elevator-fq.h b/block/elevator-fq.h
> new file mode 100644
> index 0000000..5b6c1cc
> --- /dev/null
> +++ b/block/elevator-fq.h
> @@ -0,0 +1,473 @@
> +/*
> + * BFQ: data structures and common functions prototypes.
> + *
> + * Based on ideas and code from CFQ:
> + * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
> + *
> + * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
> + *		      Paolo Valente <paolo.valente@unimore.it>
> + * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
> + * 	              Nauman Rafique <nauman@google.com>
> + */
> +
> +#include <linux/blkdev.h>
> +
> +#ifndef _BFQ_SCHED_H
> +#define _BFQ_SCHED_H
> +
> +#define IO_IOPRIO_CLASSES	3
> +
> +typedef u64 bfq_timestamp_t;
> +typedef unsigned long bfq_weight_t;
> +typedef unsigned long bfq_service_t;

Does this abstraction really provide any benefit? Why not directly use
the standard C types, make the code easier to read.

> +struct io_entity;
> +struct io_queue;
> +
> +#ifdef CONFIG_ELV_FAIR_QUEUING
> +
> +#define ELV_ATTR(name) \
> +	__ATTR(name, S_IRUGO|S_IWUSR, elv_##name##_show, elv_##name##_store)
> +
> +/**
> + * struct bfq_service_tree - per ioprio_class service tree.

Comment is old, does not reflect the newer name

> + * @active: tree for active entities (i.e., those backlogged).
> + * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
> + * @first_idle: idle entity with minimum F_i.
> + * @last_idle: idle entity with maximum F_i.
> + * @vtime: scheduler virtual time.
> + * @wsum: scheduler weight sum; active and idle entities contribute to it.
> + *
> + * Each service tree represents a B-WF2Q+ scheduler on its own.  Each
> + * ioprio_class has its own independent scheduler, and so its own
> + * bfq_service_tree.  All the fields are protected by the queue lock
> + * of the containing efqd.
> + */
> +struct io_service_tree {
> +	struct rb_root active;
> +	struct rb_root idle;
> +
> +	struct io_entity *first_idle;
> +	struct io_entity *last_idle;
> +
> +	bfq_timestamp_t vtime;
> +	bfq_weight_t wsum;
> +};
> +
> +/**
> + * struct bfq_sched_data - multi-class scheduler.

Again the naming convention is broken, you need to change several
bfq's to io's :)

> + * @active_entity: entity under service.
> + * @next_active: head-of-the-line entity in the scheduler.
> + * @service_tree: array of service trees, one per ioprio_class.
> + *
> + * bfq_sched_data is the basic scheduler queue.  It supports three
> + * ioprio_classes, and can be used either as a toplevel queue or as
> + * an intermediate queue on a hierarchical setup.
> + * @next_active points to the active entity of the sched_data service
> + * trees that will be scheduled next.
> + *
> + * The supported ioprio_classes are the same as in CFQ, in descending
> + * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
> + * Requests from higher priority queues are served before all the
> + * requests from lower priority queues; among requests of the same
> + * queue requests are served according to B-WF2Q+.
> + * All the fields are protected by the queue lock of the containing bfqd.
> + */
> +struct io_sched_data {
> +	struct io_entity *active_entity;
> +	struct io_service_tree service_tree[IO_IOPRIO_CLASSES];
> +};
> +
> +/**
> + * struct bfq_entity - schedulable entity.
> + * @rb_node: service_tree member.
> + * @on_st: flag, true if the entity is on a tree (either the active or
> + *         the idle one of its service_tree).
> + * @finish: B-WF2Q+ finish timestamp (aka F_i).
> + * @start: B-WF2Q+ start timestamp (aka S_i).

Could you mention what key is used in the rb_tree? start, finish
sounds like a range, so my suspicion is that start is used.

> + * @tree: tree the entity is enqueued into; %NULL if not on a tree.
> + * @min_start: minimum start time of the (active) subtree rooted at
> + *             this entity; used for O(log N) lookups into active trees.

Used for O(log N) makes no sense to me, RBTree has a worst case
lookup time of O(log N), but what is the comment saying?

> + * @service: service received during the last round of service.
> + * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
> + * @weight: weight of the queue, calculated as IOPRIO_BE_NR - @ioprio.
> + * @parent: parent entity, for hierarchical scheduling.
> + * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
> + *                 associated scheduler queue, %NULL on leaf nodes.
> + * @sched_data: the scheduler queue this entity belongs to.
> + * @ioprio: the ioprio in use.
> + * @new_ioprio: when an ioprio change is requested, the new ioprio value
> + * @ioprio_class: the ioprio_class in use.
> + * @new_ioprio_class: when an ioprio_class change is requested, the new
> + *                    ioprio_class value.
> + * @ioprio_changed: flag, true when the user requested an ioprio or
> + *                  ioprio_class change.
> + *
> + * A bfq_entity is used to represent either a bfq_queue (leaf node in the
> + * cgroup hierarchy) or a bfq_group into the upper level scheduler.  Each
> + * entity belongs to the sched_data of the parent group in the cgroup
> + * hierarchy.  Non-leaf entities have also their own sched_data, stored
> + * in @my_sched_data.
> + *
> + * Each entity stores independently its priority values; this would allow
> + * different weights on different devices, but this functionality is not
> + * exported to userspace by now.  Priorities are updated lazily, first
> + * storing the new values into the new_* fields, then setting the
> + * @ioprio_changed flag.  As soon as there is a transition in the entity
> + * state that allows the priority update to take place the effective and
> + * the requested priority values are synchronized.
> + *
> + * The weight value is calculated from the ioprio to export the same
> + * interface as CFQ.  When dealing with ``well-behaved'' queues (i.e.,
> + * queues that do not spend too much time to consume their budget and
> + * have true sequential behavior, and when there are no external factors
> + * breaking anticipation) the relative weights at each level of the
> + * cgroups hierarchy should be guaranteed.
> + * All the fields are protected by the queue lock of the containing bfqd.
> + */
> +struct io_entity {
> +	struct rb_node rb_node;
> +
> +	int on_st;
> +
> +	bfq_timestamp_t finish;
> +	bfq_timestamp_t start;
> +
> +	struct rb_root *tree;
> +
> +	bfq_timestamp_t min_start;
> +
> +	bfq_service_t service, budget;
> +	bfq_weight_t weight;
> +
> +	struct io_entity *parent;
> +
> +	struct io_sched_data *my_sched_data;
> +	struct io_sched_data *sched_data;
> +
> +	unsigned short ioprio, new_ioprio;
> +	unsigned short ioprio_class, new_ioprio_class;
> +
> +	int ioprio_changed;
> +};
> +
> +/*
> + * A common structure embedded by every io scheduler into their respective
> + * queue structure.
> + */
> +struct io_queue {
> +	struct io_entity entity;

So the io_queue has an abstract entity called io_entity that contains
it QoS parameters? Correct?

> +	atomic_t ref;
> +	unsigned int flags;
> +
> +	/* Pointer to generic elevator data structure */
> +	struct elv_fq_data *efqd;
> +	pid_t pid;

Why do we store the pid?

> +
> +	/* Number of requests queued on this io queue */
> +	unsigned long nr_queued;
> +
> +	/* Requests dispatched from this queue */
> +	int dispatched;
> +
> +	/* Keep a track of think time of processes in this queue */
> +	unsigned long last_end_request;
> +	unsigned long ttime_total;
> +	unsigned long ttime_samples;
> +	unsigned long ttime_mean;
> +
> +	unsigned long slice_end;
> +
> +	/* Pointer to io scheduler's queue */
> +	void *sched_queue;
> +};
> +
> +struct io_group {
> +	struct io_sched_data sched_data;
> +
> +	/* async_queue and idle_queue are used only for cfq */
> +	struct io_queue *async_queue[2][IOPRIO_BE_NR];

Again the 2 is confusing

> +	struct io_queue *async_idle_queue;
> +
> +	/*
> +	 * Used to track any pending rt requests so we can pre-empt current
> +	 * non-RT cfqq in service when this value is non-zero.
> +	 */
> +	unsigned int busy_rt_queues;
> +};
> +
> +struct elv_fq_data {

What does fq stand for?

> +	struct io_group *root_group;
> +
> +	struct request_queue *queue;
> +	unsigned int busy_queues;
> +
> +	/* Number of requests queued */
> +	int rq_queued;
> +
> +	/* Pointer to the ioscheduler queue being served */
> +	void *active_queue;
> +
> +	int rq_in_driver;
> +	int hw_tag;
> +	int hw_tag_samples;
> +	int rq_in_driver_peak;

Some comments of _in_driver and _in_driver_peak would be nice.

> +
> +	/*
> +	 * elevator fair queuing layer has the capability to provide idling
> +	 * for ensuring fairness for processes doing dependent reads.
> +	 * This might be needed to ensure fairness among two processes doing
> +	 * synchronous reads in two different cgroups. noop and deadline don't
> +	 * have any notion of anticipation/idling. As of now, these are the
> +	 * users of this functionality.
> +	 */
> +	unsigned int elv_slice_idle;
> +	struct timer_list idle_slice_timer;
> +	struct work_struct unplug_work;
> +
> +	unsigned int elv_slice[2];

Why [2] makes the code hearder to read

> +};
> +
> +extern int elv_slice_idle;
> +extern int elv_slice_async;
> +
> +/* Logging facilities. */
> +#define elv_log_ioq(efqd, ioq, fmt, args...) \
> +	blk_add_trace_msg((efqd)->queue, "elv%d%c " fmt, (ioq)->pid,	\
> +				elv_ioq_sync(ioq) ? 'S' : 'A', ##args)
> +
> +#define elv_log(efqd, fmt, args...) \
> +	blk_add_trace_msg((efqd)->queue, "elv " fmt, ##args)
> +
> +#define ioq_sample_valid(samples)   ((samples) > 80)
> +
> +/* Some shared queue flag manipulation functions among elevators */
> +
> +enum elv_queue_state_flags {
> +	ELV_QUEUE_FLAG_busy = 0,          /* has requests or is under service */
> +	ELV_QUEUE_FLAG_sync,              /* synchronous queue */
> +	ELV_QUEUE_FLAG_idle_window,	  /* elevator slice idling enabled */
> +	ELV_QUEUE_FLAG_wait_request,	  /* waiting for a request */
> +	ELV_QUEUE_FLAG_must_dispatch,	  /* must be allowed a dispatch */
> +	ELV_QUEUE_FLAG_slice_new,	  /* no requests dispatched in slice */
> +	ELV_QUEUE_FLAG_NR,
> +};
> +
> +#define ELV_IO_QUEUE_FLAG_FNS(name)					\
> +static inline void elv_mark_ioq_##name(struct io_queue *ioq)		\
> +{                                                                       \
> +	(ioq)->flags |= (1 << ELV_QUEUE_FLAG_##name);			\
> +}                                                                       \
> +static inline void elv_clear_ioq_##name(struct io_queue *ioq)		\
> +{                                                                       \
> +	(ioq)->flags &= ~(1 << ELV_QUEUE_FLAG_##name);			\
> +}                                                                       \
> +static inline int elv_ioq_##name(struct io_queue *ioq)         		\
> +{                                                                       \
> +	return ((ioq)->flags & (1 << ELV_QUEUE_FLAG_##name)) != 0;	\
> +}
> +
> +ELV_IO_QUEUE_FLAG_FNS(busy)
> +ELV_IO_QUEUE_FLAG_FNS(sync)
> +ELV_IO_QUEUE_FLAG_FNS(wait_request)
> +ELV_IO_QUEUE_FLAG_FNS(must_dispatch)
> +ELV_IO_QUEUE_FLAG_FNS(idle_window)
> +ELV_IO_QUEUE_FLAG_FNS(slice_new)
> +
> +static inline struct io_service_tree *
> +io_entity_service_tree(struct io_entity *entity)
> +{
> +	struct io_sched_data *sched_data = entity->sched_data;
> +	unsigned int idx = entity->ioprio_class - 1;
> +
> +	BUG_ON(idx >= IO_IOPRIO_CLASSES);
> +	BUG_ON(sched_data == NULL);
> +
> +	return sched_data->service_tree + idx;
> +}
> +
> +/* A request got dispatched from the io_queue. Do the accounting. */
> +static inline void elv_ioq_request_dispatched(struct io_queue *ioq)
> +{
> +	ioq->dispatched++;
> +}
> +
> +static inline int elv_ioq_slice_used(struct io_queue *ioq)
> +{
> +	if (elv_ioq_slice_new(ioq))
> +		return 0;
> +	if (time_before(jiffies, ioq->slice_end))
> +		return 0;
> +
> +	return 1;
> +}
> +
> +/* How many request are currently dispatched from the queue */
> +static inline int elv_ioq_nr_dispatched(struct io_queue *ioq)
> +{
> +	return ioq->dispatched;
> +}
> +
> +/* How many request are currently queued in the queue */
> +static inline int elv_ioq_nr_queued(struct io_queue *ioq)
> +{
> +	return ioq->nr_queued;
> +}
> +
> +static inline void elv_get_ioq(struct io_queue *ioq)
> +{
> +	atomic_inc(&ioq->ref);
> +}
> +
> +static inline void elv_ioq_set_slice_end(struct io_queue *ioq,
> +						unsigned long slice_end)
> +{
> +	ioq->slice_end = slice_end;
> +}
> +
> +static inline int elv_ioq_class_idle(struct io_queue *ioq)
> +{
> +	return ioq->entity.ioprio_class == IOPRIO_CLASS_IDLE;
> +}
> +
> +static inline int elv_ioq_class_rt(struct io_queue *ioq)
> +{
> +	return ioq->entity.ioprio_class == IOPRIO_CLASS_RT;
> +}
> +
> +static inline int elv_ioq_ioprio_class(struct io_queue *ioq)
> +{
> +	return ioq->entity.new_ioprio_class;
> +}
> +
> +static inline int elv_ioq_ioprio(struct io_queue *ioq)
> +{
> +	return ioq->entity.new_ioprio;
> +}
> +
> +static inline void elv_ioq_set_ioprio_class(struct io_queue *ioq,
> +						int ioprio_class)
> +{
> +	ioq->entity.new_ioprio_class = ioprio_class;
> +	ioq->entity.ioprio_changed = 1;
> +}
> +
> +static inline void elv_ioq_set_ioprio(struct io_queue *ioq, int ioprio)
> +{
> +	ioq->entity.new_ioprio = ioprio;
> +	ioq->entity.ioprio_changed = 1;
> +}
> +
> +static inline void *ioq_sched_queue(struct io_queue *ioq)
> +{
> +	if (ioq)
> +		return ioq->sched_queue;
> +	return NULL;
> +}
> +
> +static inline struct io_group *ioq_to_io_group(struct io_queue *ioq)
> +{
> +	return container_of(ioq->entity.sched_data, struct io_group,
> +						sched_data);
> +}
> +
> +extern ssize_t elv_slice_idle_show(struct elevator_queue *q, char *name);
> +extern ssize_t elv_slice_idle_store(struct elevator_queue *q, const char *name,
> +						size_t count);
> +extern ssize_t elv_slice_sync_show(struct elevator_queue *q, char *name);
> +extern ssize_t elv_slice_sync_store(struct elevator_queue *q, const char *name,
> +						size_t count);
> +extern ssize_t elv_slice_async_show(struct elevator_queue *q, char *name);
> +extern ssize_t elv_slice_async_store(struct elevator_queue *q, const char *name,
> +						size_t count);
> +
> +/* Functions used by elevator.c */
> +extern int elv_init_fq_data(struct request_queue *q, struct elevator_queue *e);
> +extern void elv_exit_fq_data(struct elevator_queue *e);
> +extern void elv_exit_fq_data_post(struct elevator_queue *e);
> +
> +extern void elv_ioq_request_add(struct request_queue *q, struct request *rq);
> +extern void elv_ioq_request_removed(struct elevator_queue *e,
> +					struct request *rq);
> +extern void elv_fq_dispatched_request(struct elevator_queue *e,
> +					struct request *rq);
> +
> +extern void elv_fq_activate_rq(struct request_queue *q, struct request *rq);
> +extern void elv_fq_deactivate_rq(struct request_queue *q, struct request *rq);
> +
> +extern void elv_ioq_completed_request(struct request_queue *q,
> +				struct request *rq);
> +
> +extern void *elv_fq_select_ioq(struct request_queue *q, int force);
> +extern struct io_queue *rq_ioq(struct request *rq);
> +
> +/* Functions used by io schedulers */
> +extern void elv_put_ioq(struct io_queue *ioq);
> +extern void __elv_ioq_slice_expired(struct request_queue *q,
> +					struct io_queue *ioq);
> +extern int elv_init_ioq(struct elevator_queue *eq, struct io_queue *ioq,
> +		void *sched_queue, int ioprio_class, int ioprio, int is_sync);
> +extern void elv_schedule_dispatch(struct request_queue *q);
> +extern int elv_hw_tag(struct elevator_queue *e);
> +extern void *elv_active_sched_queue(struct elevator_queue *e);
> +extern int elv_mod_idle_slice_timer(struct elevator_queue *eq,
> +					unsigned long expires);
> +extern int elv_del_idle_slice_timer(struct elevator_queue *eq);
> +extern unsigned int elv_get_slice_idle(struct elevator_queue *eq);
> +extern void *io_group_async_queue_prio(struct io_group *iog, int ioprio_class,
> +					int ioprio);
> +extern void io_group_set_async_queue(struct io_group *iog, int ioprio_class,
> +					int ioprio, struct io_queue *ioq);
> +extern struct io_group *io_lookup_io_group_current(struct request_queue *q);
> +extern int elv_nr_busy_ioq(struct elevator_queue *e);
> +extern struct io_queue *elv_alloc_ioq(struct request_queue *q, gfp_t gfp_mask);
> +extern void elv_free_ioq(struct io_queue *ioq);
> +
> +#else /* CONFIG_ELV_FAIR_QUEUING */
> +
> +static inline int elv_init_fq_data(struct request_queue *q,
> +					struct elevator_queue *e)
> +{
> +	return 0;
> +}
> +
> +static inline void elv_exit_fq_data(struct elevator_queue *e) {}
> +static inline void elv_exit_fq_data_post(struct elevator_queue *e) {}
> +
> +static inline void elv_fq_activate_rq(struct request_queue *q,
> +					struct request *rq)
> +{
> +}
> +
> +static inline void elv_fq_deactivate_rq(struct request_queue *q,
> +					struct request *rq)
> +{
> +}
> +
> +static inline void elv_fq_dispatched_request(struct elevator_queue *e,
> +						struct request *rq)
> +{
> +}
> +
> +static inline void elv_ioq_request_removed(struct elevator_queue *e,
> +						struct request *rq)
> +{
> +}
> +
> +static inline void elv_ioq_request_add(struct request_queue *q,
> +					struct request *rq)
> +{
> +}
> +
> +static inline void elv_ioq_completed_request(struct request_queue *q,
> +						struct request *rq)
> +{
> +}
> +
> +static inline void *ioq_sched_queue(struct io_queue *ioq) { return NULL; }
> +static inline struct io_queue *rq_ioq(struct request *rq) { return NULL; }
> +static inline void *elv_fq_select_ioq(struct request_queue *q, int force)
> +{
> +	return NULL;
> +}
> +#endif /* CONFIG_ELV_FAIR_QUEUING */
> +#endif /* _BFQ_SCHED_H */
> diff --git a/block/elevator.c b/block/elevator.c
> index 7073a90..c2f07f5 100644
> --- a/block/elevator.c
> +++ b/block/elevator.c
> @@ -231,6 +231,9 @@ static struct elevator_queue *elevator_alloc(struct request_queue *q,
>  	for (i = 0; i < ELV_HASH_ENTRIES; i++)
>  		INIT_HLIST_HEAD(&eq->hash[i]);
> 
> +	if (elv_init_fq_data(q, eq))
> +		goto err;
> +
>  	return eq;
>  err:
>  	kfree(eq);
> @@ -301,9 +304,11 @@ EXPORT_SYMBOL(elevator_init);
>  void elevator_exit(struct elevator_queue *e)
>  {
>  	mutex_lock(&e->sysfs_lock);
> +	elv_exit_fq_data(e);
>  	if (e->ops->elevator_exit_fn)
>  		e->ops->elevator_exit_fn(e);
>  	e->ops = NULL;
> +	elv_exit_fq_data_post(e);
>  	mutex_unlock(&e->sysfs_lock);
> 
>  	kobject_put(&e->kobj);
> @@ -314,6 +319,8 @@ static void elv_activate_rq(struct request_queue *q, struct request *rq)
>  {
>  	struct elevator_queue *e = q->elevator;
> 
> +	elv_fq_activate_rq(q, rq);
> +
>  	if (e->ops->elevator_activate_req_fn)
>  		e->ops->elevator_activate_req_fn(q, rq);
>  }
> @@ -322,6 +329,8 @@ static void elv_deactivate_rq(struct request_queue *q, struct request *rq)
>  {
>  	struct elevator_queue *e = q->elevator;
> 
> +	elv_fq_deactivate_rq(q, rq);
> +
>  	if (e->ops->elevator_deactivate_req_fn)
>  		e->ops->elevator_deactivate_req_fn(q, rq);
>  }
> @@ -446,6 +455,7 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
>  	elv_rqhash_del(q, rq);
> 
>  	q->nr_sorted--;
> +	elv_fq_dispatched_request(q->elevator, rq);
> 
>  	boundary = q->end_sector;
>  	stop_flags = REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_STARTED;
> @@ -486,6 +496,7 @@ void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
>  	elv_rqhash_del(q, rq);
> 
>  	q->nr_sorted--;
> +	elv_fq_dispatched_request(q->elevator, rq);
> 
>  	q->end_sector = rq_end_sector(rq);
>  	q->boundary_rq = rq;
> @@ -553,6 +564,7 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
>  	elv_rqhash_del(q, next);
> 
>  	q->nr_sorted--;
> +	elv_ioq_request_removed(e, next);
>  	q->last_merge = rq;
>  }
> 
> @@ -657,12 +669,8 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
>  				q->last_merge = rq;
>  		}
> 
> -		/*
> -		 * Some ioscheds (cfq) run q->request_fn directly, so
> -		 * rq cannot be accessed after calling
> -		 * elevator_add_req_fn.
> -		 */
>  		q->elevator->ops->elevator_add_req_fn(q, rq);
> +		elv_ioq_request_add(q, rq);
>  		break;
> 
>  	case ELEVATOR_INSERT_REQUEUE:
> @@ -872,13 +880,12 @@ void elv_dequeue_request(struct request_queue *q, struct request *rq)
> 
>  int elv_queue_empty(struct request_queue *q)
>  {
> -	struct elevator_queue *e = q->elevator;
> -
>  	if (!list_empty(&q->queue_head))
>  		return 0;
> 
> -	if (e->ops->elevator_queue_empty_fn)
> -		return e->ops->elevator_queue_empty_fn(q);
> +	/* Hopefully nr_sorted works and no need to call queue_empty_fn */
> +	if (q->nr_sorted)
> +		return 0;
> 
>  	return 1;
>  }
> @@ -953,8 +960,11 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
>  	 */
>  	if (blk_account_rq(rq)) {
>  		q->in_flight--;
> -		if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn)
> -			e->ops->elevator_completed_req_fn(q, rq);
> +		if (blk_sorted_rq(rq)) {
> +			if (e->ops->elevator_completed_req_fn)
> +				e->ops->elevator_completed_req_fn(q, rq);
> +			elv_ioq_completed_request(q, rq);
> +		}
>  	}
> 
>  	/*
> @@ -1242,3 +1252,17 @@ struct request *elv_rb_latter_request(struct request_queue *q,
>  	return NULL;
>  }
>  EXPORT_SYMBOL(elv_rb_latter_request);
> +
> +/* Get the io scheduler queue pointer. For cfq, it is stored in rq->ioq*/
> +void *elv_get_sched_queue(struct request_queue *q, struct request *rq)
> +{
> +	return ioq_sched_queue(rq_ioq(rq));
> +}
> +EXPORT_SYMBOL(elv_get_sched_queue);
> +
> +/* Select an ioscheduler queue to dispatch request from. */
> +void *elv_select_sched_queue(struct request_queue *q, int force)
> +{
> +	return ioq_sched_queue(elv_fq_select_ioq(q, force));
> +}
> +EXPORT_SYMBOL(elv_select_sched_queue);
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index b4f71f1..96a94c9 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -245,6 +245,11 @@ struct request {
> 
>  	/* for bidi */
>  	struct request *next_rq;
> +
> +#ifdef CONFIG_ELV_FAIR_QUEUING
> +	/* io queue request belongs to */
> +	struct io_queue *ioq;
> +#endif
>  };
> 
>  static inline unsigned short req_get_ioprio(struct request *req)
> diff --git a/include/linux/elevator.h b/include/linux/elevator.h
> index c59b769..679c149 100644
> --- a/include/linux/elevator.h
> +++ b/include/linux/elevator.h
> @@ -2,6 +2,7 @@
>  #define _LINUX_ELEVATOR_H
> 
>  #include <linux/percpu.h>
> +#include "../../block/elevator-fq.h"
> 
>  #ifdef CONFIG_BLOCK
> 
> @@ -29,6 +30,18 @@ typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct reques
> 
>  typedef void *(elevator_init_fn) (struct request_queue *);
>  typedef void (elevator_exit_fn) (struct elevator_queue *);
> +#ifdef CONFIG_ELV_FAIR_QUEUING
> +typedef void (elevator_free_sched_queue_fn) (struct elevator_queue*, void *);
> +typedef void (elevator_active_ioq_set_fn) (struct request_queue*, void *, int);
> +typedef void (elevator_active_ioq_reset_fn) (struct request_queue *, void*);
> +typedef void (elevator_arm_slice_timer_fn) (struct request_queue*, void*);
> +typedef int (elevator_should_preempt_fn) (struct request_queue*, void*,
> +						struct request*);
> +typedef int (elevator_update_idle_window_fn) (struct elevator_queue*, void*,
> +						struct request*);
> +typedef struct io_queue* (elevator_close_cooperator_fn) (struct request_queue*,
> +						void*, int probe);
> +#endif
> 
>  struct elevator_ops
>  {
> @@ -56,6 +69,17 @@ struct elevator_ops
>  	elevator_init_fn *elevator_init_fn;
>  	elevator_exit_fn *elevator_exit_fn;
>  	void (*trim)(struct io_context *);
> +
> +#ifdef CONFIG_ELV_FAIR_QUEUING
> +	elevator_free_sched_queue_fn *elevator_free_sched_queue_fn;
> +	elevator_active_ioq_set_fn *elevator_active_ioq_set_fn;
> +	elevator_active_ioq_reset_fn *elevator_active_ioq_reset_fn;
> +
> +	elevator_arm_slice_timer_fn *elevator_arm_slice_timer_fn;
> +	elevator_should_preempt_fn *elevator_should_preempt_fn;
> +	elevator_update_idle_window_fn *elevator_update_idle_window_fn;
> +	elevator_close_cooperator_fn *elevator_close_cooperator_fn;
> +#endif
>  };
> 
>  #define ELV_NAME_MAX	(16)
> @@ -76,6 +100,9 @@ struct elevator_type
>  	struct elv_fs_entry *elevator_attrs;
>  	char elevator_name[ELV_NAME_MAX];
>  	struct module *elevator_owner;
> +#ifdef CONFIG_ELV_FAIR_QUEUING
> +	int elevator_features;
> +#endif
>  };
> 
>  /*
> @@ -89,6 +116,10 @@ struct elevator_queue
>  	struct elevator_type *elevator_type;
>  	struct mutex sysfs_lock;
>  	struct hlist_head *hash;
> +#ifdef CONFIG_ELV_FAIR_QUEUING
> +	/* fair queuing data */
> +	struct elv_fq_data efqd;
> +#endif
>  };
> 
>  /*
> @@ -209,5 +240,25 @@ enum {
>  	__val;							\
>  })
> 
> +/* iosched can let elevator know their feature set/capability */
> +#ifdef CONFIG_ELV_FAIR_QUEUING
> +
> +/* iosched wants to use fq logic of elevator layer */
> +#define	ELV_IOSCHED_NEED_FQ	1
> +
> +static inline int elv_iosched_fair_queuing_enabled(struct elevator_queue *e)
> +{
> +	return (e->elevator_type->elevator_features) & ELV_IOSCHED_NEED_FQ;
> +}
> +
> +#else /* ELV_IOSCHED_FAIR_QUEUING */
> +
> +static inline int elv_iosched_fair_queuing_enabled(struct elevator_queue *e)
> +{
> +	return 0;
> +}
> +#endif /* ELV_IOSCHED_FAIR_QUEUING */
> +extern void *elv_get_sched_queue(struct request_queue *q, struct request *rq);
> +extern void *elv_select_sched_queue(struct request_queue *q, int force);
>  #endif /* CONFIG_BLOCK */
>  #endif
> -- 
> 1.6.0.6
> 

-- 
	Balbir
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/