From: Jeff Moyer Subject: [PATCH 1/6] block: Implement a blk_yield function to voluntarily give up the I/O scheduler. Date: Fri, 2 Jul 2010 15:58:14 -0400 Message-ID: <1278100699-24132-2-git-send-email-jmoyer@redhat.com> References: <1278100699-24132-1-git-send-email-jmoyer@redhat.com> Cc: axboe@kernel.dk, linux-kernel@vger.kernel.org, vgoyal@redhat.com, tao.ma@oracle.com, Jeff Moyer To: linux-ext4@vger.kernel.org Return-path: In-Reply-To: <1278100699-24132-1-git-send-email-jmoyer@redhat.com> Sender: linux-kernel-owner@vger.kernel.org List-Id: linux-ext4.vger.kernel.org This patch implements a blk_yield function to allow a process to voluntarily give up its I/O scheduler time slice. This is desirable for those processes which know that they will be blocked on I/O from another process, such as the file system journal thread. The yield call works by causing the target process to issue I/O in the context of the cfqq of the calling process. Following patches will put calls to blk_yield into jbd and jbd2. Signed-off-by: Jeff Moyer --- block/blk-core.c | 24 +++++++ block/blk-ioc.c | 1 + block/blk-settings.c | 6 ++ block/cfq-iosched.c | 147 +++++++++++++++++++++++++++++++++++++++++++- block/elevator.c | 15 +++++ include/linux/blkdev.h | 4 + include/linux/elevator.h | 3 + include/linux/iocontext.h | 3 + 8 files changed, 199 insertions(+), 4 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index f84cce4..e9530eb 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -324,6 +324,29 @@ void blk_unplug(struct request_queue *q) } EXPORT_SYMBOL(blk_unplug); +static void generic_yield_iosched(struct request_queue *q, + struct task_struct *tsk) +{ + elv_yield(q, tsk); +} + +/** + * blk_yield - voluntarily give up the I/O scheduler time slice + * @q: request_queue to which we're doing I/O + * @tsk: task to which we're yielding the I/O scheduler + * + * This function should be called by code which knows that it is waiting + * on another thread to perform I/O in order for it to make progress. 
By + * yielding the I/O scheduler, a potentially significant idling window can + * be bypassed, resulting in better latency and throughput. + */ +void blk_yield(struct request_queue *q, struct task_struct *tsk) +{ + if (q->yield_fn) + q->yield_fn(q, tsk); +} +EXPORT_SYMBOL(blk_yield); + /** * blk_start_queue - restart a previously stopped queue * @q: The &struct request_queue in question @@ -609,6 +632,7 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, q->request_fn = rfn; q->prep_rq_fn = NULL; q->unplug_fn = generic_unplug_device; + q->yield_fn = generic_yield_iosched; q->queue_flags = QUEUE_FLAG_DEFAULT; q->queue_lock = lock; diff --git a/block/blk-ioc.c b/block/blk-ioc.c index d22c4c5..3a7b507 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -97,6 +97,7 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ret->cic_list); ret->ioc_data = NULL; + ret->on_behalf_of = NULL; } return ret; diff --git a/block/blk-settings.c b/block/blk-settings.c index f5ed5a1..1353767 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -171,6 +171,12 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) } EXPORT_SYMBOL(blk_queue_make_request); +void blk_queue_yield_set(struct request_queue *q, yield_fn *yield) +{ + q->yield_fn = yield; +} +EXPORT_SYMBOL_GPL(blk_queue_yield_set); + /** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index dab836e..00b14d4 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -87,9 +87,19 @@ struct cfq_rb_root { unsigned total_weight; u64 min_vdisktime; struct rb_node *active; + /* + * The following two fields are used only for the SYNC_NOIDLE + * service tree. 
Taken together, they are used to determine + * whether or not there is currently a dependent reader doing + * I/O on this service tree. last_expiry records the last time + * that a queue was expired in this service tree, and last_pid + * tells which cfqq->pid it was that was expired. + */ + unsigned long last_expiry; + pid_t last_pid; }; #define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ - .count = 0, .min_vdisktime = 0, } + .count = 0, .min_vdisktime = 0, .last_pid = (pid_t)-1, } /* * Per process-grouping structure @@ -147,6 +157,7 @@ struct cfq_queue { struct cfq_queue *new_cfqq; struct cfq_group *cfqg; struct cfq_group *orig_cfqg; + struct io_context *yield_to, *yield_from; }; /* @@ -318,6 +329,7 @@ enum cfqq_state_flags { CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */ CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ + CFQ_CFQQ_FLAG_yield, /* Allow another cfqq to run */ }; #define CFQ_CFQQ_FNS(name) \ @@ -347,6 +359,7 @@ CFQ_CFQQ_FNS(coop); CFQ_CFQQ_FNS(split_coop); CFQ_CFQQ_FNS(deep); CFQ_CFQQ_FNS(wait_busy); +CFQ_CFQQ_FNS(yield); #undef CFQ_CFQQ_FNS #ifdef CONFIG_CFQ_GROUP_IOSCHED @@ -1594,6 +1607,9 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, cfq_mark_cfqq_slice_new(cfqq); cfq_del_timer(cfqd, cfqq); + + if (cfqq->yield_to) + cfqq->yield_to->on_behalf_of = cfqq->yield_from; } cfqd->active_queue = cfqq; @@ -1614,6 +1630,18 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_clear_cfqq_wait_request(cfqq); cfq_clear_cfqq_wait_busy(cfqq); + if (cfq_cfqq_yield(cfqq)) { + if (cfqq->yield_to) { + cfqq->yield_to->on_behalf_of = NULL; + put_io_context(cfqq->yield_to); + cfqq->yield_to = cfqq->yield_from = NULL; + } + cfq_clear_cfqq_yield(cfqq); + } else { + cfqq->service_tree->last_expiry = jiffies; + cfqq->service_tree->last_pid = cfqq->pid; + } + /* * If this cfqq is shared between multiple processes, check to * make sure that 
those processes are still issuing I/Os within @@ -2118,7 +2146,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) slice = max(slice, 2 * cfqd->cfq_slice_idle); slice = max_t(unsigned, slice, CFQ_MIN_TT); - cfq_log(cfqd, "workload slice:%d", slice); + cfq_log(cfqd, "workload:%d slice:%d", cfqd->serving_type, slice); cfqd->workload_expires = jiffies + slice; cfqd->noidle_tree_requires_idle = false; } @@ -2241,6 +2269,96 @@ keep_queue: return cfqq; } +static int expiry_data_valid(struct cfq_rb_root *service_tree) +{ + return (service_tree->last_pid != (pid_t)-1 && + service_tree->last_expiry != 0UL); +} + +static bool cfq_should_yield_now(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + if (cfqq != cfqd->active_queue) + return false; + + if (cfqd->serving_type != SYNC_NOIDLE_WORKLOAD) + return true; + + /* + * Sync-noidle workload: do not yield while a dependent reader is + * active, i.e. another pid's queue expired here under slice_idle ago. + */ + if (expiry_data_valid(cfqq->service_tree) && + time_before(jiffies, cfqq->service_tree->last_expiry + + cfqd->cfq_slice_idle) && + cfqq->service_tree->last_pid != cfqq->pid) + return false; + + return true; +} + +/* + * Give up current's sync cfqq time slice. With @tsk set, I/O issued by + * @tsk is queued on this cfqq (the journal code uses this while it waits + * on the jbd[2] thread); with @tsk NULL the slice is expired or flagged. + */ +static void cfq_yield(struct request_queue *q, struct task_struct *tsk) +{ + struct cfq_data *cfqd = q->elevator->elevator_data; + struct cfq_io_context *cic; + struct cfq_queue *cfqq; + struct io_context *ioc; + + cic = cfq_cic_lookup(cfqd, current->io_context); + if (!cic) + return; + + spin_lock_irq(q->queue_lock); + + cfqq = cic_to_cfqq(cic, 1); + if (!cfqq) + goto out_unlock; + + if (tsk) { + task_lock(tsk); + ioc = tsk->io_context; + /* + * If the task hasn't yet performed any I/O, then it + * will have no io_context. 
We can't create one for + * another task, so just don't yield the queue in this + * corner case. + */ + if (!ioc) { + task_unlock(tsk); + goto out_unlock; + } + atomic_long_inc(&ioc->refcount); + task_unlock(tsk); + + /* Release the target of a still-pending earlier yield. */ + if (cfqq->yield_to) { + cfqq->yield_to->on_behalf_of = NULL; + put_io_context(cfqq->yield_to); + } + cfqq->yield_to = ioc; + cfqq->yield_from = current->io_context; + } else { + if (cfq_should_yield_now(cfqd, cfqq)) { + __cfq_slice_expired(cfqd, cfqq, 0); + cfq_schedule_dispatch(cfqd); + } else + cfq_mark_cfqq_yield(cfqq); + goto out_unlock; + } + + cfq_log_cfqq(cfqd, cfqq, "yielding queue to %d", tsk->pid); + cfq_mark_cfqq_yield(cfqq); + if (cfqd->active_queue == cfqq) + cfqq->yield_to->on_behalf_of = cfqq->yield_from; + +out_unlock: + spin_unlock_irq(q->queue_lock); +} + static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) { int dispatched = 0; @@ -3010,6 +3128,13 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) if (!ioc) return NULL; + if (ioc->on_behalf_of) { + struct io_context *old_ioc = ioc; + ioc = ioc->on_behalf_of; + put_io_context(old_ioc); + atomic_long_inc(&ioc->refcount); + } + cic = cfq_cic_lookup(cfqd, ioc); if (cic) goto out; @@ -3319,6 +3444,9 @@ static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) if (cfqq->cfqg->nr_cfqq > 1) return false; + if (cfq_cfqq_yield(cfqq)) + return false; + if (cfq_slice_used(cfqq)) return true; @@ -3401,7 +3529,10 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) cfq_slice_expired(cfqd, 1); else if (sync && cfqq_empty && - !cfq_close_cooperator(cfqd, cfqq)) { + !cfq_close_cooperator(cfqd, cfqq) && + (!cfq_cfqq_yield(cfqq) || + cfqq->yield_to)) { + cfqd->noidle_tree_requires_idle |= !rq_noidle(rq); /* * Idling is enabled for SYNC_WORKLOAD. 
@@ -3548,7 +3679,7 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_io_context *cic; const int rw = rq_data_dir(rq); - const bool is_sync = rq_is_sync(rq); + bool is_sync = rq_is_sync(rq); struct cfq_queue *cfqq; unsigned long flags; @@ -3561,6 +3692,13 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) if (!cic) goto queue_fail; + /* + * If another process called blk_yield specifying us as the target, + * then we issue I/O via their sync cfqq. + */ + if (current->io_context->on_behalf_of) + is_sync = 1; + new_queue: cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq || cfqq == &cfqd->oom_cfqq) { @@ -3973,6 +4111,7 @@ static struct elevator_type iosched_cfq = { .elevator_deactivate_req_fn = cfq_deactivate_request, .elevator_queue_empty_fn = cfq_queue_empty, .elevator_completed_req_fn = cfq_completed_request, + .elevator_yield_fn = cfq_yield, .elevator_former_req_fn = elv_rb_former_request, .elevator_latter_req_fn = elv_rb_latter_request, .elevator_set_req_fn = cfq_set_request, diff --git a/block/elevator.c b/block/elevator.c index 923a913..aa3c326 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -866,6 +866,21 @@ void elv_completed_request(struct request_queue *q, struct request *rq) } } +/** + * elv_yield() - explicitly give up the I/O scheduler + * @q: request_queue for the device to which we're doing I/O + * @tsk: task_struct of the process to which we're yielding + * + * Calls the I/O scheduler's elevator_yield_fn hook; a no-op if there is none. 
+ */ +void elv_yield(struct request_queue *q, struct task_struct *tsk) +{ + struct elevator_queue *e = q->elevator; + + if (e && e->ops->elevator_yield_fn) + e->ops->elevator_yield_fn(q, tsk); +} + #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) static ssize_t diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 09a8402..ef2d10c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -263,6 +263,7 @@ struct request_pm_state typedef void (request_fn_proc) (struct request_queue *q); typedef int (make_request_fn) (struct request_queue *q, struct bio *bio); +typedef void (yield_fn) (struct request_queue *q, struct task_struct *tsk); typedef int (prep_rq_fn) (struct request_queue *, struct request *); typedef void (unplug_fn) (struct request_queue *); @@ -345,6 +346,7 @@ struct request_queue request_fn_proc *request_fn; make_request_fn *make_request_fn; + yield_fn *yield_fn; prep_rq_fn *prep_rq_fn; unplug_fn *unplug_fn; merge_bvec_fn *merge_bvec_fn; @@ -837,6 +839,7 @@ extern int blk_execute_rq(struct request_queue *, struct gendisk *, extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); extern void blk_unplug(struct request_queue *q); +extern void blk_yield(struct request_queue *q, struct task_struct *tsk); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { @@ -929,6 +932,7 @@ extern struct request_queue *blk_init_allocated_queue(struct request_queue *, request_fn_proc *, spinlock_t *); extern void blk_cleanup_queue(struct request_queue *); extern void blk_queue_make_request(struct request_queue *, make_request_fn *); +extern void blk_queue_yield_set(struct request_queue *, yield_fn *); extern void blk_queue_bounce_limit(struct request_queue *, u64); extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_segments(struct request_queue *, unsigned short); diff --git 
a/include/linux/elevator.h b/include/linux/elevator.h index 2c958f4..a68b5b1 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -23,6 +23,7 @@ typedef void (elevator_add_req_fn) (struct request_queue *, struct request *); typedef int (elevator_queue_empty_fn) (struct request_queue *); typedef struct request *(elevator_request_list_fn) (struct request_queue *, struct request *); typedef void (elevator_completed_req_fn) (struct request_queue *, struct request *); +typedef void (elevator_yield_fn) (struct request_queue *, struct task_struct *tsk); typedef int (elevator_may_queue_fn) (struct request_queue *, int); typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, gfp_t); @@ -48,6 +49,7 @@ struct elevator_ops elevator_queue_empty_fn *elevator_queue_empty_fn; elevator_completed_req_fn *elevator_completed_req_fn; + elevator_yield_fn *elevator_yield_fn; elevator_request_list_fn *elevator_former_req_fn; elevator_request_list_fn *elevator_latter_req_fn; @@ -111,6 +113,7 @@ extern void elv_bio_merged(struct request_queue *q, struct request *, struct bio *); extern void elv_requeue_request(struct request_queue *, struct request *); extern int elv_queue_empty(struct request_queue *); +extern void elv_yield(struct request_queue *, struct task_struct *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); extern int elv_register_queue(struct request_queue *q); diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 64d5291..1e3e578 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -54,6 +54,9 @@ struct io_context { struct radix_tree_root radix_root; struct hlist_head cic_list; void *ioc_data; + /* set when another process has yielded its I/O scheduler slice to + * this process */ + struct io_context *on_behalf_of; }; static inline struct io_context *ioc_task_link(struct 
io_context *ioc) -- 1.6.5.2