Return-Path: 
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S1755966Ab2EJDLr (ORCPT );
        Wed, 9 May 2012 23:11:47 -0400
Received: from mail-qc0-f174.google.com ([209.85.216.174]:38393 "EHLO
        mail-qc0-f174.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S1755911Ab2EJDLn (ORCPT );
        Wed, 9 May 2012 23:11:43 -0400
Date: Wed, 9 May 2012 23:11:37 -0400
From: Kent Overstreet 
To: linux-bcache@vger.kernel.org, linux-kernel@vger.kernel.org, dm-devel@redhat.com
Cc: tejun@google.com, agk@redhat.com
Subject: [Bcache v13 15/16] bcache: Writeback
Message-ID: <743f8ef404023c6745f46a27ce91a5c3d9be97b1.1336619038.git.koverstreet@google.com>
References: 
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: 
User-Agent: Mutt/1.5.21 (2010-09-15)
Sender: linux-kernel-owner@vger.kernel.org
List-ID: 
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Length: 13714
Lines: 538

Signed-off-by: Kent Overstreet 
---
 drivers/block/bcache/writeback.c |  518 ++++++++++++++++++++++++++++++++++++++
 1 files changed, 518 insertions(+), 0 deletions(-)
 create mode 100644 drivers/block/bcache/writeback.c

diff --git a/drivers/block/bcache/writeback.c b/drivers/block/bcache/writeback.c
new file mode 100644
index 0000000..cfcfe52
--- /dev/null
+++ b/drivers/block/bcache/writeback.c
@@ -0,0 +1,518 @@
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+
+static struct workqueue_struct *dirty_wq;
+
+static void read_dirty(struct cached_dev *);
+
+/* Background writeback */
+
+static void dirty_init(struct dirty *w)
+{
+        struct bio *bio = &w->io->bio;
+
+        bio_init(bio);
+        bio_get(bio);
+        if (!w->io->d->writeback_percent)
+                bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+
+        bio->bi_size = KEY_SIZE(&w->key) << 9;
+        bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS);
+        bio->bi_private = w;
+        bio_map(bio, NULL);
+}
+
+static int dirty_cmp(struct dirty *r, struct dirty *l)
+{
+        /* Overlapping keys must compare equal */
+        if (KEY_START(&r->key) >= l->key.key)
+                return 1;
+        if (KEY_START(&l->key) >= r->key.key)
+                return -1;
+        return 0;
+}
+
+static int btree_refill_dirty_leaf(struct btree *b, struct btree_op *op,
+                                   struct cached_dev *dc)
+{
+        struct dirty *w;
+        struct btree_iter iter;
+        btree_iter_init(b, &iter, &KEY(op->d->id, dc->last_found, 0));
+
+        /* To protect rb tree access vs. read_dirty() */
+        spin_lock(&dc->dirty_lock);
+
+        while (!array_freelist_empty(&dc->dirty_freelist)) {
+                struct bkey *k = btree_iter_next(&iter);
+                if (!k || KEY_DEV(k) != op->d->id)
+                        break;
+
+                if (ptr_bad(b, k))
+                        continue;
+
+                if (KEY_DIRTY(k)) {
+                        w = array_alloc(&dc->dirty_freelist);
+
+                        dc->last_found = k->key;
+                        pr_debug("%s", pkey(k));
+                        w->io = NULL;
+                        bkey_copy(&w->key, k);
+                        SET_KEY_DIRTY(&w->key, false);
+
+                        if (RB_INSERT(&dc->dirty, w, node, dirty_cmp))
+                                array_free(&dc->dirty_freelist, w);
+                }
+        }
+
+        spin_unlock(&dc->dirty_lock);
+
+        return 0;
+}
+
+static int btree_refill_dirty(struct btree *b, struct btree_op *op,
+                              struct cached_dev *dc)
+{
+        int r;
+        struct btree_iter iter;
+        btree_iter_init(b, &iter, &KEY(op->d->id, dc->last_found, 0));
+
+        if (!b->level)
+                return btree_refill_dirty_leaf(b, op, dc);
+
+        while (!array_freelist_empty(&dc->dirty_freelist)) {
+                struct bkey *k = btree_iter_next(&iter);
+                if (!k)
+                        break;
+
+                if (ptr_bad(b, k))
+                        continue;
+
+                r = btree(refill_dirty, k, b, op, dc);
+                if (r) {
+                        char buf[BDEVNAME_SIZE];
+                        bdevname(dc->bdev, buf);
+
+                        printk(KERN_WARNING "Error trying to read the btree "
+                               "for background writeback on %s: "
+                               "dirty data may have been lost!\n", buf);
+                }
+
+                if (KEY_DEV(k) != op->d->id)
+                        break;
+
+                cond_resched();
+        }
+
+        return 0;
+}
+
+static void refill_dirty(struct work_struct *work)
+{
+        struct cached_dev *dc = container_of(to_delayed_work(work),
+                                             struct cached_dev, refill_dirty);
+        uint64_t start;
+
+        struct btree_op op;
+        btree_op_init_stack(&op);
+        op.d = &dc->disk;
+
+        if (!atomic_read(&dc->disk.detaching) &&
+            !dc->writeback_running)
+                return;
+
+        down_write(&dc->writeback_lock);
+        start = dc->last_found;
+
+        if (!atomic_read(&dc->has_dirty)) {
+                SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
+                write_bdev_super(dc, NULL);
+                up_write(&dc->writeback_lock);
+                return;
+        }
+
+        btree_root(refill_dirty, dc->disk.c, &op, dc);
+        closure_sync(&op.cl);
+
+        pr_debug("found %s keys on %i from %llu to %llu, %i%% used",
+                 RB_EMPTY_ROOT(&dc->dirty) ? "no" :
+                 array_freelist_empty(&dc->dirty_freelist) ?
"some" : "a few", + dc->disk.id, start, (uint64_t) dc->last_found, + dc->disk.c->gc_stats.in_use); + + /* Got to the end of the btree */ + if (!array_freelist_empty(&dc->dirty_freelist)) + dc->last_found = 0; + + /* Searched the entire btree - delay for awhile */ + if (!array_freelist_empty(&dc->dirty_freelist) && !start) + queue_delayed_work(dirty_wq, &dc->refill_dirty, + dc->writeback_delay * HZ); + + spin_lock(&dc->dirty_lock); + + if (!RB_EMPTY_ROOT(&dc->dirty)) { + struct dirty *w; + w = RB_FIRST(&dc->dirty, struct dirty, node); + dc->writeback_start = KEY_START(&w->key); + + w = RB_LAST(&dc->dirty, struct dirty, node); + dc->writeback_end = w->key.key; + } else { + dc->writeback_start = 0; + dc->writeback_end = 0; + + if (!start) { + atomic_set(&dc->has_dirty, 0); + cached_dev_put(dc); + } + } + + up_write(&dc->writeback_lock); + + dc->next_writeback_io = local_clock(); + read_dirty(dc); +} + +bool bcache_in_writeback(struct cached_dev *dc, sector_t offset, unsigned len) +{ + struct dirty *w, s; + s.key = KEY(dc->disk.id, offset + len, len); + + if (offset >= dc->writeback_end || + offset + len <= dc->writeback_start) + return false; + + spin_lock(&dc->dirty_lock); + w = RB_SEARCH(&dc->dirty, s, node, dirty_cmp); + if (w && !w->io) { + rb_erase(&w->node, &dc->dirty); + array_free(&dc->dirty_freelist, w); + w = NULL; + } + + spin_unlock(&dc->dirty_lock); + return w != NULL; +} + +void bcache_writeback_queue(struct cached_dev *d) +{ + queue_delayed_work(dirty_wq, &d->refill_dirty, 0); +} + +void bcache_writeback_add(struct cached_dev *d, unsigned sectors) +{ + atomic_long_add(sectors, &d->disk.sectors_dirty); + + if (!atomic_read(&d->has_dirty) && + !atomic_xchg(&d->has_dirty, 1)) { + if (BDEV_STATE(&d->sb) != BDEV_STATE_DIRTY) { + SET_BDEV_STATE(&d->sb, BDEV_STATE_DIRTY); + /* XXX: should do this synchronously */ + write_bdev_super(d, NULL); + } + + atomic_inc(&d->count); + queue_delayed_work(dirty_wq, &d->refill_dirty, + d->writeback_delay * HZ); + + if (d->writeback_percent) + schedule_delayed_work(&d->writeback_rate_update, + d->writeback_rate_update_seconds * HZ); + } +} + +static void __update_writeback_rate(struct cached_dev *dc) +{ + struct cache_set *c = dc->disk.c; + uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size; + uint64_t cache_dirty_target = + div_u64(cache_sectors * dc->writeback_percent, 100); + + int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), + c->cached_dev_sectors); + + /* PD controller */ + + int change = 0; + int64_t error; + int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty); + int64_t derivative = dirty - dc->disk.sectors_dirty_last; + + dc->disk.sectors_dirty_last = dirty; + + derivative *= dc->writeback_rate_d_term; + derivative = clamp(derivative, -dirty, dirty); + + derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, + dc->writeback_rate_d_smooth, 0); + + /* Avoid divide by zero */ + if (!target) + goto out; + + error = div64_s64((dirty + derivative - target) << 8, target); + + change = div_s64((dc->writeback_rate * error) >> 8, + dc->writeback_rate_p_term_inverse); + + /* Don't increase writeback rate if the device isn't keeping up */ + if (change > 0 && + time_after64(local_clock(), + dc->next_writeback_io + 10 * NSEC_PER_MSEC)) + change = 0; + + dc->writeback_rate = clamp_t(int64_t, dc->writeback_rate + change, + 1, NSEC_PER_MSEC); +out: + dc->writeback_rate_derivative = derivative; + dc->writeback_rate_change = change; + dc->writeback_rate_target = target; + + 
+        schedule_delayed_work(&dc->writeback_rate_update,
+                              dc->writeback_rate_update_seconds * HZ);
+}
+
+static void update_writeback_rate(struct work_struct *work)
+{
+        struct cached_dev *dc = container_of(to_delayed_work(work),
+                                             struct cached_dev,
+                                             writeback_rate_update);
+
+        down_read(&dc->writeback_lock);
+
+        if (atomic_read(&dc->has_dirty) &&
+            dc->writeback_percent)
+                __update_writeback_rate(dc);
+
+        up_read(&dc->writeback_lock);
+}
+
+static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
+{
+        uint64_t now = local_clock();
+
+        if (atomic_read(&dc->disk.detaching) ||
+            !dc->writeback_percent)
+                return 0;
+
+        /* writeback_rate = sectors per 10 ms */
+        dc->next_writeback_io += div_u64(sectors * 10000000ULL,
+                                         dc->writeback_rate);
+
+        return time_after64(dc->next_writeback_io, now)
+                ? div_u64(dc->next_writeback_io - now, NSEC_PER_SEC / HZ)
+                : 0;
+}
+
+/* Background writeback - IO loop */
+
+static void write_dirty_finish(struct closure *cl)
+{
+        struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+        struct dirty *w = io->bio.bi_private;
+        struct cached_dev *dc = io->d;
+        struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt);
+
+        while (bv-- != w->io->bio.bi_io_vec)
+                __free_page(bv->bv_page);
+
+        closure_debug_destroy(cl);
+        kfree(io);
+
+        /* This is kind of a dumb way of signalling errors. */
+        if (!KEY_DIRTY(&w->key)) {
+                struct btree_op op;
+                btree_op_init_stack(&op);
+
+                op.type = BTREE_REPLACE;
+                bkey_copy(&op.replace, &w->key);
+                SET_KEY_DIRTY(&op.replace, true);
+
+                keylist_add(&op.keys, &w->key);
+
+                for (unsigned i = 0; i < KEY_PTRS(&w->key); i++)
+                        atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
+
+                pr_debug("clearing %s", pkey(&w->key));
+                bcache_btree_insert(&op, dc->disk.c);
+                closure_sync(&op.cl);
+
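+                /*
+                 * The BTREE_REPLACE insert only goes through if the original
+                 * dirty key is still present; a collision indicates the key
+                 * changed while the write was in flight, so it is not marked
+                 * clean and the pass is counted as failed.
+                 */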
+                atomic_long_inc(op.insert_collision
+                                ? &dc->disk.c->writeback_keys_failed
+                                : &dc->disk.c->writeback_keys_done);
+        }
+
+        spin_lock(&dc->dirty_lock);
+        rb_erase(&w->node, &dc->dirty);
+        array_free(&dc->dirty_freelist, w);
+        atomic_dec_bug(&dc->in_flight);
+
+        read_dirty(dc);
+}
+
+static void dirty_endio(struct bio *bio, int error)
+{
+        struct dirty *w = bio->bi_private;
+
+        if (error)
+                SET_KEY_DIRTY(&w->key, true);
+
+        bio_put(bio);
+        closure_put(&w->io->cl);
+}
+
+static void write_dirty(struct closure *cl)
+{
+        struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+        struct dirty *w = io->bio.bi_private;
+
+        dirty_init(w);
+        io->bio.bi_rw = WRITE|REQ_UNPLUG;
+        io->bio.bi_sector = KEY_START(&w->key);
+        io->bio.bi_bdev = io->d->bdev;
+        io->bio.bi_end_io = dirty_endio;
+
+        trace_bcache_write_dirty(&w->io->bio);
+        closure_bio_submit(&w->io->bio, cl, io->d->disk.bio_split);
+
+        continue_at(&io->cl, write_dirty_finish, dirty_wq);
+}
+
+static void read_dirty_endio(struct bio *bio, int error)
+{
+        struct dirty *w = bio->bi_private;
+
+        count_io_errors(PTR_CACHE(w->io->d->disk.c, &w->key, 0),
+                        error, "reading dirty data from cache");
+
+        dirty_endio(bio, error);
+}
+
+static void read_dirty(struct cached_dev *dc)
+{
+        unsigned delay = writeback_delay(dc, 0);
+        struct dirty *w;
+        struct dirty_io *io;
+
+        /* XXX: if we error, background writeback could stall indefinitely */
+
+        while (1) {
+                w = RB_FIRST(&dc->dirty, struct dirty, node);
+
+                while (w && w->io)
+                        w = RB_NEXT(w, node);
+
+                if (!w)
+                        break;
+
+                BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
+
+                if (delay > 0 &&
+                    (KEY_START(&w->key) != dc->last_read ||
+                     jiffies_to_msecs(delay) > 50)) {
+                        queue_delayed_work(dirty_wq, &dc->read_dirty, delay);
+                        break;
+                }
+
+                dc->last_read = w->key.key;
+                w->io = ERR_PTR(-EINTR);
+                spin_unlock(&dc->dirty_lock);
+
+                io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
+                             * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+                             GFP_KERNEL);
+                if (!io)
+                        goto err;
+
+                w->io = io;
+                w->io->d = dc;
+
+                dirty_init(w);
+                w->io->bio.bi_sector = PTR_OFFSET(&w->key, 0);
+                w->io->bio.bi_bdev = PTR_CACHE(dc->disk.c,
+                                               &w->key, 0)->bdev;
+                w->io->bio.bi_rw = READ|REQ_UNPLUG;
+                w->io->bio.bi_end_io = read_dirty_endio;
+
+                if (bio_alloc_pages(&w->io->bio, GFP_KERNEL))
+                        goto err;
+
+                pr_debug("%s", pkey(&w->key));
+
+                closure_init(&w->io->cl, NULL);
+                set_closure_fn(&w->io->cl, write_dirty, dirty_wq);
+                closure_set_stopped(&w->io->cl);
+
+                trace_bcache_read_dirty(&w->io->bio);
+                closure_bio_submit_put(&w->io->bio, &w->io->cl,
+                                       dc->disk.bio_split);
+
+                delay = writeback_delay(dc, KEY_SIZE(&w->key));
+
+                if (atomic_inc_return(&dc->in_flight) >= 128)
+                        return;
+
+                spin_lock(&dc->dirty_lock);
+        }
+
+        if (0) {
+err:            spin_lock(&dc->dirty_lock);
+                if (!IS_ERR_OR_NULL(w->io))
+                        kfree(w->io);
+                rb_erase(&w->node, &dc->dirty);
+                array_free(&dc->dirty_freelist, w);
+        }
+
+        if (RB_EMPTY_ROOT(&dc->dirty))
+                queue_delayed_work(dirty_wq, &dc->refill_dirty, 0);
+
+        spin_unlock(&dc->dirty_lock);
+}
+
+static void read_dirty_work(struct work_struct *work)
+{
+        struct cached_dev *dc = container_of(to_delayed_work(work),
+                                             struct cached_dev, read_dirty);
+
+        spin_lock(&dc->dirty_lock);
+        read_dirty(dc);
+}
+
+void bcache_writeback_init_cached_dev(struct cached_dev *d)
+{
+        INIT_DELAYED_WORK(&d->refill_dirty, refill_dirty);
+        INIT_DELAYED_WORK(&d->read_dirty, read_dirty_work);
+        init_rwsem(&d->writeback_lock);
+        array_allocator_init(&d->dirty_freelist);
+
+        d->dirty = RB_ROOT;
+        d->writeback_metadata = true;
+        d->writeback_running = true;
+        d->writeback_delay = 30;
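+        /*
+         * writeback_rate is in sectors per 10ms (see writeback_delay()), so
+         * the default of 1024 works out to roughly 50MB/sec.
+         */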
+        d->writeback_rate = 1024;
+
+        d->writeback_rate_update_seconds = 30;
+        d->writeback_rate_d_term = 16;
+        d->writeback_rate_p_term_inverse = 64;
+        d->writeback_rate_d_smooth = 8;
+
+        INIT_DELAYED_WORK(&d->writeback_rate_update, update_writeback_rate);
+        schedule_delayed_work(&d->writeback_rate_update,
+                              d->writeback_rate_update_seconds * HZ);
+}
+
+void bcache_writeback_exit(void)
+{
+        if (dirty_wq)
+                destroy_workqueue(dirty_wq);
+}
+
+int __init bcache_writeback_init(void)
+{
+        dirty_wq = create_singlethread_workqueue("bcache_writeback");
+        if (!dirty_wq)
+                return -ENOMEM;
+
+        return 0;
+}
-- 
1.7.9.rc2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/