From: Thomas Schoebel-Theuer <tst@schoebel-theuer.de>
To: linux-kernel@vger.kernel.org
Subject: [PATCH 34/50] mars: add new file drivers/block/mars/xio_bricks/xio_if.c
Date: Tue, 1 Jul 2014 23:47:14 +0200
Message-Id: <1404251250-22992-35-git-send-email-tst@schoebel-theuer.de>
X-Mailer: git-send-email 2.0.0
In-Reply-To: <1404251250-22992-1-git-send-email-tst@schoebel-theuer.de>
References: <1404251250-22992-1-git-send-email-tst@schoebel-theuer.de>

Signed-off-by: Thomas Schoebel-Theuer <tst@schoebel-theuer.de>
---
 drivers/block/mars/xio_bricks/xio_if.c | 1037 ++++++++++++++++++++++++++++++++
 1 file changed, 1037 insertions(+)
 create mode 100644 drivers/block/mars/xio_bricks/xio_if.c

diff --git a/drivers/block/mars/xio_bricks/xio_if.c b/drivers/block/mars/xio_bricks/xio_if.c
new file mode 100644
index 0000000..2c2fa42
--- /dev/null
+++ b/drivers/block/mars/xio_bricks/xio_if.c
@@ -0,0 +1,1037 @@
+/* (c) 2010 Thomas Schoebel-Theuer / 1&1 Internet AG */
+
+/* Interface to a Linux device.
+ * 1 Input, 0 Outputs.
+ */
+
+#define REQUEST_MERGING
+#define ALWAYS_UNPLUG true
+#define PREFETCH_LEN PAGE_SIZE
+
+/* low-level device parameters */
+#define USE_MAX_SECTORS (XIO_MAX_SEGMENT_SIZE >> 9)
+#define USE_MAX_PHYS_SEGMENTS (XIO_MAX_SEGMENT_SIZE >> 9)
+#define USE_MAX_SEGMENT_SIZE XIO_MAX_SEGMENT_SIZE
+#define USE_LOGICAL_BLOCK_SIZE 512
+#define USE_SEGMENT_BOUNDARY (PAGE_SIZE-1)
+
+#define USE_CONGESTED_FN
+#define USE_MERGE_BVEC
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#ifndef XIO_MAJOR /* remove this later: fallback to old prepatch */
+#define XIO_MAJOR MARS_MAJOR
+#endif
+
+/************************ global tuning ***********************/
+
+int if_throttle_start_size = 0; /* in kb */
+EXPORT_SYMBOL_GPL(if_throttle_start_size);
+
+struct xio_limiter if_throttle = {
+	.lim_max_rate = 5000,
+};
+EXPORT_SYMBOL_GPL(if_throttle);
+
+/************************ own type definitions ***********************/
+
+#include
+
+#define IF_HASH_MAX (PAGE_SIZE / sizeof(struct if_hash_anchor))
+#define IF_HASH_CHUNK (PAGE_SIZE * 32)
+
+struct if_hash_anchor {
+	spinlock_t hash_lock;
+	struct list_head hash_anchor;
+};
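+
+/* Worked example (assuming 4 KiB pages and a 24-byte if_hash_anchor,
+ * as is typical on 64-bit builds): IF_HASH_MAX = 4096 / 24 = 170 buckets
+ * and IF_HASH_CHUNK = 128 KiB, so a request at byte position pos falls
+ * into bucket (pos / 131072) % 170 -- e.g. pos = 1 MiB maps to bucket 8.
+ */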
+
+/************************ own static definitions ***********************/
+
+/* TODO: check bounds, ensure that free minor numbers are recycled */
+static int device_minor;
+
+/*************** object * aspect constructors * destructors **************/
+
+/************************ linux operations ***********************/
+
+#ifdef part_stat_lock
+static
+void _if_start_io_acct(struct if_input *input, struct bio_wrapper *biow)
+{
+	struct bio *bio = biow->bio;
+	const int rw = bio_data_dir(bio);
+	const int cpu = part_stat_lock();
+
+	(void)cpu;
+	part_round_stats(cpu, &input->disk->part0);
+	part_stat_inc(cpu, &input->disk->part0, ios[rw]);
+	part_stat_add(cpu, &input->disk->part0, sectors[rw], bio->bi_iter.bi_size >> 9);
+	part_inc_in_flight(&input->disk->part0, rw);
+	part_stat_unlock();
+	biow->start_time = jiffies;
+}
+
+static
+void _if_end_io_acct(struct if_input *input, struct bio_wrapper *biow)
+{
+	unsigned long duration = jiffies - biow->start_time;
+	struct bio *bio = biow->bio;
+	const int rw = bio_data_dir(bio);
+	const int cpu = part_stat_lock();
+
+	(void)cpu;
+	part_stat_add(cpu, &input->disk->part0, ticks[rw], duration);
+	part_round_stats(cpu, &input->disk->part0);
+	part_dec_in_flight(&input->disk->part0, rw);
+	part_stat_unlock();
+}
+
+#else /* part_stat_lock */
+#define _if_start_io_acct(...) do {} while (0)
+#define _if_end_io_acct(...) do {} while (0)
+#endif
+
+/* callback
+ */
+static
+void if_endio(struct generic_callback *cb)
+{
+	struct if_aio_aspect *aio_a = cb->cb_private;
+	struct if_input *input;
+	int k;
+	int rw;
+	int error;
+
+	LAST_CALLBACK(cb);
+	if (unlikely(!aio_a || !aio_a->object)) {
+		XIO_FAT("aio_a = %p aio = %p, something is very wrong here!\n",
+			aio_a, aio_a ? aio_a->object : NULL);
+		goto out_return;
+	}
+	input = aio_a->input;
+	CHECK_PTR(input, err);
+
+	rw = aio_a->object->io_rw;
+
+	for (k = 0; k < aio_a->bio_count; k++) {
+		struct bio_wrapper *biow;
+		struct bio *bio;
+
+		biow = aio_a->orig_biow[k];
+		aio_a->orig_biow[k] = NULL;
+		CHECK_PTR(biow, err);
+
+		CHECK_ATOMIC(&biow->bi_comp_cnt, 1);
+		if (!atomic_dec_and_test(&biow->bi_comp_cnt))
+			continue;
+
+		bio = biow->bio;
+		CHECK_PTR_NULL(bio, err);
+
+		_if_end_io_acct(input, biow);
+
+		error = CALLBACK_ERROR(aio_a->object);
+		if (unlikely(error < 0)) {
+			int bi_size = bio->bi_iter.bi_size;
+
+			XIO_ERR("NYI: error=%d RETRY LOGIC %u\n", error, bi_size);
+		} else { /* bio conventions are slightly different... */
+			error = 0;
+			bio->bi_iter.bi_size = 0;
+		}
+		bio_endio(bio, error);
+		bio_put(bio);
+		brick_mem_free(biow);
+	}
+	atomic_dec(&input->flying_count);
+	if (rw)
+		atomic_dec(&input->write_flying_count);
+	else
+		atomic_dec(&input->read_flying_count);
+	goto out_return;
+err:
+	XIO_FAT("error in callback, giving up\n");
+out_return:;
+}
+
+/* Kick off plugged aios
+ */
+static
+void _if_unplug(struct if_input *input)
+{
+	/* struct if_brick *brick = input->brick; */
+	LIST_HEAD(tmp_list);
+
+#ifdef CONFIG_MARS_DEBUG
+	might_sleep();
+#endif
+
+	down(&input->kick_sem);
+	spin_lock(&input->req_lock);
+	if (!list_empty(&input->plug_anchor)) {
+		/* move over the whole list */
+		list_replace_init(&input->plug_anchor, &tmp_list);
+		atomic_set(&input->plugged_count, 0);
+	}
+	spin_unlock(&input->req_lock);
+	up(&input->kick_sem);
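+
+	/* All plugged aios were spliced to the local tmp_list while
+	 * holding req_lock; the submission loop below therefore runs
+	 * without the lock held, so the aio_io() calls may sleep.
+	 */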
+	while (!list_empty(&tmp_list)) {
+		struct if_aio_aspect *aio_a;
+		struct aio_object *aio;
+		int hash_index;
+
+		aio_a = container_of(tmp_list.next, struct if_aio_aspect, plug_head);
+		list_del_init(&aio_a->plug_head);
+
+		hash_index = aio_a->hash_index;
+		spin_lock(&input->hash_table[hash_index].hash_lock);
+		list_del_init(&aio_a->hash_head);
+		spin_unlock(&input->hash_table[hash_index].hash_lock);
+
+		aio = aio_a->object;
+
+		if (unlikely(aio_a->current_len > aio_a->max_len))
+			XIO_ERR("request len %d > %d\n", aio_a->current_len, aio_a->max_len);
+		aio->io_len = aio_a->current_len;
+
+		atomic_inc(&input->flying_count);
+		atomic_inc(&input->total_fire_count);
+		if (aio->io_rw)
+			atomic_inc(&input->write_flying_count);
+		else
+			atomic_inc(&input->read_flying_count);
+		if (aio->io_skip_sync)
+			atomic_inc(&input->total_skip_sync_count);
+
+		GENERIC_INPUT_CALL(input, aio_io, aio);
+		GENERIC_INPUT_CALL(input, aio_put, aio);
+	}
+}
+
+/* accept a linux bio, convert to aio and call buf_io() on it.
+ */
+static
+#ifdef BIO_CPU_AFFINE
+int
+#else
+void
+#endif
+if_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct if_input *input = q->queuedata;
+	struct if_brick *brick = input->brick;
+
+	/* Original flags of the source bio
+	 */
+	const int rw = bio_data_dir(bio);
+	const int sectors = bio_sectors(bio);
+
+/* adapt to different kernel versions (TBD: improve) */
+#if defined(BIO_RW_RQ_MASK) || defined(BIO_FLUSH)
+	const bool ahead = bio_rw_flagged(bio, BIO_RW_AHEAD) && rw == READ;
+	const bool barrier = bio_rw_flagged(bio, BIO_RW_BARRIER);
+	const bool syncio = bio_rw_flagged(bio, BIO_RW_SYNCIO);
+	const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG);
+	const bool meta = bio_rw_flagged(bio, BIO_RW_META);
+	const bool discard = bio_rw_flagged(bio, BIO_RW_DISCARD);
+	const bool noidle = bio_rw_flagged(bio, BIO_RW_NOIDLE);
+
+#elif defined(REQ_FLUSH) && defined(REQ_SYNC)
+#define _flagged(x) (bio->bi_rw & (x))
+	const bool ahead = _flagged(REQ_RAHEAD) && rw == READ;
+	const bool barrier = _flagged(REQ_FLUSH);
+	const bool syncio = _flagged(REQ_SYNC);
+	const bool unplug = false;
+	const bool meta = _flagged(REQ_META);
+	const bool discard = _flagged(REQ_DISCARD);
+	const bool noidle = _flagged(REQ_THROTTLED);
+
+#else
+#error Cannot decode the bio flags
+#endif
+	const int prio = bio_prio(bio);
+
+	/* Transform into XIO flags
+	 */
+	const int io_prio =
+		(prio == IOPRIO_CLASS_RT || (meta | syncio)) ?
+		XIO_PRIO_HIGH :
+		(prio == IOPRIO_CLASS_IDLE) ?
+		XIO_PRIO_LOW :
+		XIO_PRIO_NORMAL;
+	const bool do_unplug = ALWAYS_UNPLUG | unplug | noidle;
+	const bool do_skip_sync = brick->skip_sync && !(barrier | syncio);
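+
+	/* Reading of the mapping above: any meta or sync request is
+	 * promoted to XIO_PRIO_HIGH regardless of its ioprio class, an
+	 * otherwise unflagged IOPRIO_CLASS_IDLE request is demoted to
+	 * XIO_PRIO_LOW, and everything else maps to XIO_PRIO_NORMAL.
+	 */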
+
+	struct bio_wrapper *biow;
+	struct aio_object *aio = NULL;
+	struct if_aio_aspect *aio_a;
+
+	struct bio_vec bvec;
+	struct bvec_iter i;
+
+	loff_t pos = ((loff_t)bio->bi_iter.bi_sector) << 9; /* TODO: make dynamic */
+	int total_len = bio->bi_iter.bi_size;
+
+	bool assigned = false;
+	int error = -ENOSYS;
+
+	bind_to_channel(brick->say_channel, current);
+
+	might_sleep();
+
+	if (unlikely(!sectors)) {
+		_if_unplug(input);
+		/* THINK: usually this happens only at write barriers.
+		 * We have no "barrier" operation in XIO, since
+		 * callback semantics should always denote
+		 * "writethrough accomplished".
+		 * In case of exceptional semantics, we need to do
+		 * something here. For now, we do just nothing.
+		 */
+		bio_endio(bio, 0);
+		error = 0;
+		goto done;
+	}
+
+	/* throttling of too big write requests */
+	if (rw && if_throttle_start_size > 0) {
+		int kb = (total_len + 512) / 1024;
+
+		if (kb >= if_throttle_start_size)
+			xio_limit_sleep(&if_throttle, kb);
+	}
+
+	(void)ahead; /* shut up gcc */
+	if (unlikely(discard)) { /* NYI */
+		bio_endio(bio, 0);
+		error = 0;
+		goto done;
+	}
+
+	biow = brick_mem_alloc(sizeof(struct bio_wrapper));
+	biow->bio = bio;
+	atomic_set(&biow->bi_comp_cnt, 0);
+
+	if (rw)
+		atomic_inc(&input->total_write_count);
+	else
+		atomic_inc(&input->total_read_count);
+	_if_start_io_acct(input, biow);
+
+	/* Get a reference to the bio.
+	 * Will be released after bio_endio().
+	 */
+	atomic_inc(&bio->bi_cnt);
+
+	/* FIXME: THIS IS PROVISIONARY (use event instead)
+	 */
+	while (unlikely(!brick->power.on_led))
+		brick_msleep(100);
+
+	down(&input->kick_sem);
+
+	bio_for_each_segment(bvec, bio, i) {
+		struct page *page = bvec.bv_page;
+		int bv_len = bvec.bv_len;
+		int offset = bvec.bv_offset;
+
+		void *data;
+
+#ifdef ARCH_HAS_KMAP
+#error FIXME: the current infrastructure cannot deal with HIGHMEM / kmap()
+#endif
+		data = page_address(page);
+		error = -EINVAL;
+		if (unlikely(!data))
+			break;
+
+		data += offset;
+
+		while (bv_len > 0) {
+			struct list_head *tmp;
+			int hash_index;
+			int this_len = 0;
+
+			aio = NULL;
+			aio_a = NULL;
+
+			hash_index = (pos / IF_HASH_CHUNK) % IF_HASH_MAX;
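+
+			/* Merge candidates below must target the same page
+			 * and IO direction, have a free orig_biow[] slot,
+			 * keep current_len within max_len, and be
+			 * byte-contiguous in io_data.
+			 */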
+#ifdef REQUEST_MERGING
+			spin_lock(&input->hash_table[hash_index].hash_lock);
+			for (tmp = input->hash_table[hash_index].hash_anchor.next;
+			     tmp != &input->hash_table[hash_index].hash_anchor;
+			     tmp = tmp->next) {
+				struct if_aio_aspect *tmp_a;
+				struct aio_object *tmp_aio;
+				int i;
+
+				tmp_a = container_of(tmp, struct if_aio_aspect, hash_head);
+				tmp_aio = tmp_a->object;
+				if (tmp_a->orig_page != page ||
+				    tmp_aio->io_rw != rw ||
+				    tmp_a->bio_count >= MAX_BIO ||
+				    tmp_a->current_len + bv_len > tmp_a->max_len)
+					continue;
+
+				if (tmp_aio->io_data + tmp_a->current_len == data)
+					goto merge_end;
+				continue;
+
+merge_end:
+				tmp_a->current_len += bv_len;
+				aio = tmp_aio;
+				aio_a = tmp_a;
+				this_len = bv_len;
+				if (!do_skip_sync)
+					aio->io_skip_sync = false;
+
+				for (i = 0; i < aio_a->bio_count; i++) {
+					if (aio_a->orig_biow[i]->bio == bio)
+						goto unlock;
+				}
+
+				CHECK_ATOMIC(&biow->bi_comp_cnt, 0);
+				atomic_inc(&biow->bi_comp_cnt);
+				aio_a->orig_biow[aio_a->bio_count++] = biow;
+				assigned = true;
+				goto unlock;
+			} /* foreach hash collision list member */
+
+unlock:
+			spin_unlock(&input->hash_table[hash_index].hash_lock);
+#endif
+			if (!aio) {
+				int prefetch_len;
+
+				error = -ENOMEM;
+				aio = if_alloc_aio(brick);
+				aio_a = if_aio_get_aspect(brick, aio);
+				if (unlikely(!aio_a)) {
+					up(&input->kick_sem);
+					goto err;
+				}
+
+#ifdef PREFETCH_LEN
+				prefetch_len = PREFETCH_LEN - offset;
+/**/
+				if (prefetch_len > total_len)
+					prefetch_len = total_len;
+				if (pos + prefetch_len > brick->dev_size)
+					prefetch_len = brick->dev_size - pos;
+				if (prefetch_len < bv_len)
+					prefetch_len = bv_len;
+#else
+				prefetch_len = bv_len;
+#endif
+
+				SETUP_CALLBACK(aio, if_endio, aio_a);
+
+				aio_a->input = input;
+				aio->io_rw = aio->io_may_write = rw;
+				aio->io_pos = pos;
+				aio->io_len = prefetch_len;
+				aio->io_data = data; /* direct IO */
+				aio->io_prio = io_prio;
+				aio_a->orig_page = page;
+
+				error = GENERIC_INPUT_CALL(input, aio_get, aio);
+				if (unlikely(error < 0)) {
+					up(&input->kick_sem);
+					goto err;
+				}
+
+				this_len = aio->io_len; /* now may be shorter than originally requested. */
+				aio_a->max_len = this_len;
+				if (this_len > bv_len)
+					this_len = bv_len;
+				aio_a->current_len = this_len;
+				if (rw)
+					atomic_inc(&input->total_aio_write_count);
+				else
+					atomic_inc(&input->total_aio_read_count);
+				CHECK_ATOMIC(&biow->bi_comp_cnt, 0);
+				atomic_inc(&biow->bi_comp_cnt);
+				aio_a->orig_biow[0] = biow;
+				aio_a->bio_count = 1;
+				assigned = true;
+
+				/* When a bio with multiple biovecs is split into
+				 * multiple aios, only the last one should be
+				 * working in synchronous writethrough mode.
+				 */
+				aio->io_skip_sync = true;
+				if (!do_skip_sync && i.bi_idx + 1 >= bio->bi_iter.bi_idx)
+					aio->io_skip_sync = false;
+
+				atomic_inc(&input->plugged_count);
+
+				aio_a->hash_index = hash_index;
+				spin_lock(&input->hash_table[hash_index].hash_lock);
+				list_add_tail(&aio_a->hash_head, &input->hash_table[hash_index].hash_anchor);
+				spin_unlock(&input->hash_table[hash_index].hash_lock);
+
+				spin_lock(&input->req_lock);
+				list_add_tail(&aio_a->plug_head, &input->plug_anchor);
+				spin_unlock(&input->req_lock);
+			} /* !aio */
+
+			pos += this_len;
+			data += this_len;
+			bv_len -= this_len;
+			total_len -= this_len;
+		} /* while bv_len > 0 */
+	} /* foreach bvec */
+
+	up(&input->kick_sem);
+
+	if (likely(!total_len))
+		error = 0;
+	else
+		XIO_ERR("bad rest len = %d\n", total_len);
+err:
+	if (error < 0) {
+		XIO_ERR("cannot submit request from bio, status=%d\n", error);
+		if (!assigned)
+			bio_endio(bio, error);
+	}
+
+	if (do_unplug ||
+	    (brick && brick->max_plugged > 0 && atomic_read(&input->plugged_count) > brick->max_plugged)) {
+		_if_unplug(input);
+	}
+
+done:
+	remove_binding_from(brick->say_channel, current);
+
+#ifdef BIO_CPU_AFFINE
+	return error;
+#else
+	goto out_return;
+#endif
+out_return:;
+}
+
+#ifndef BLK_MAX_REQUEST_COUNT
+/* static */
+void if_unplug(struct request_queue *q)
+{
+	struct if_input *input = q->queuedata;
+	int was_plugged;
+
+	spin_lock_irq(q->queue_lock);
+	was_plugged = blk_remove_plug(q);
+	spin_unlock_irq(q->queue_lock);
+
+	(void)was_plugged;
+	_if_unplug(input);
+}
+#endif
+
+/* static */
+int xio_congested(void *data, int bdi_bits)
+{
+	struct if_input *input = data;
+	int ret = 0;
+
+	if (bdi_bits & (1 << BDI_sync_congested) &&
+	    atomic_read(&input->read_flying_count) > 0) {
+		ret |= (1 << BDI_sync_congested);
+	}
+	if (bdi_bits & (1 << BDI_async_congested) &&
+	    atomic_read(&input->write_flying_count) > 0) {
+		ret |= (1 << BDI_async_congested);
+	}
+	return ret;
+}
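+
+/* The historical merge_bvec_fn contract: return the number of bytes of
+ * the proposed bio_vec that may still be appended to the bio described
+ * by bvm.  An empty bio may always accept the whole segment; afterwards
+ * growth is capped at 128 bytes per call here.
+ */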
+static
+int xio_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
+{
+	unsigned int bio_size = bvm->bi_size;
+
+	if (!bio_size)
+		return bvec->bv_len;
+	return 128;
+}
+
+static
+loff_t if_get_capacity(struct if_brick *brick)
+{
+	/* Don't read always, read only when unknown.
+	 * brick->dev_size may be different from underlying sizes,
+	 * e.g. when the size symlink indicates a logically smaller
+	 * device than physically.
+	 */
+	if (brick->dev_size <= 0) {
+		struct xio_info info = {};
+		struct if_input *input = brick->inputs[0];
+		int status;
+
+		status = GENERIC_INPUT_CALL(input, xio_get_info, &info);
+		if (unlikely(status < 0)) {
+			XIO_ERR("cannot get device info, status=%d\n", status);
+			return 0;
+		}
+		XIO_INF("determined default capacity: %lld bytes\n", info.current_size);
+		brick->dev_size = info.current_size;
+	}
+	return brick->dev_size;
+}
+
+static
+void if_set_capacity(struct if_input *input, loff_t capacity)
+{
+	CHECK_PTR(input->disk, done);
+	CHECK_PTR(input->disk->disk_name, done);
+	XIO_INF("new capacity of '%s': %lld bytes\n", input->disk->disk_name, capacity);
+	input->capacity = capacity;
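+	/* set_capacity() counts 512-byte sectors, hence capacity >> 9:
+	 * e.g. a 10 GiB device (10737418240 bytes) becomes 20971520 sectors.
+	 */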
+	set_capacity(input->disk, capacity >> 9);
+	if (likely(input->bdev && input->bdev->bd_inode))
+		i_size_write(input->bdev->bd_inode, capacity);
+done:;
+}
+
+static const struct block_device_operations if_blkdev_ops;
+
+static int if_switch(struct if_brick *brick)
+{
+	struct if_input *input = brick->inputs[0];
+	struct request_queue *q;
+	struct gendisk *disk;
+	int minor;
+	int status = 0;
+
+	down(&brick->switch_sem);
+
+	/* brick is in operation */
+	if (brick->power.button && brick->power.on_led) {
+		loff_t capacity;
+
+		capacity = if_get_capacity(brick);
+		if (capacity > 0 && capacity != input->capacity) {
+			XIO_INF("changing capacity from %lld to %lld\n",
+				(long long)input->capacity,
+				(long long)capacity);
+			if_set_capacity(input, capacity);
+		}
+	}
+
+	/* brick should be switched on */
+	if (brick->power.button && brick->power.off_led) {
+		loff_t capacity;
+
+		xio_set_power_off_led((void *)brick, false);
+		brick->say_channel = get_binding(current);
+
+		status = -ENOMEM;
+		q = blk_alloc_queue(GFP_BRICK);
+		if (!q) {
+			XIO_ERR("cannot allocate device request queue\n");
+			goto is_down;
+		}
+		q->queuedata = input;
+		input->q = q;
+
+		disk = alloc_disk(1);
+		if (!disk) {
+			XIO_ERR("cannot allocate gendisk\n");
+			goto is_down;
+		}
+
+		minor = device_minor++; /* TODO: protect against races (e.g. atomic_t) */
+		set_disk_ro(disk, true);
+
+		disk->queue = q;
+		disk->major = XIO_MAJOR; /* TODO: make this dynamic for >256 devices */
+		disk->first_minor = minor;
+		disk->fops = &if_blkdev_ops;
+		snprintf(disk->disk_name, sizeof(disk->disk_name), "%s", brick->brick_name);
+		disk->private_data = input;
+		input->disk = disk;
+		capacity = if_get_capacity(brick);
+		XIO_DBG("created device name %s, capacity=%lld\n", disk->disk_name, capacity);
+		if_set_capacity(input, capacity);
+
+		blk_queue_make_request(q, if_make_request);
+#ifdef USE_MAX_SECTORS
+#ifdef MAX_SEGMENT_SIZE
+		XIO_DBG("blk_queue_max_sectors()\n");
+		blk_queue_max_sectors(q, USE_MAX_SECTORS);
+#else
+		XIO_DBG("blk_queue_max_hw_sectors()\n");
+		blk_queue_max_hw_sectors(q, USE_MAX_SECTORS);
+#endif
+#endif
+#ifdef USE_MAX_PHYS_SEGMENTS
+#ifdef MAX_SEGMENT_SIZE
+		XIO_DBG("blk_queue_max_phys_segments()\n");
+		blk_queue_max_phys_segments(q, USE_MAX_PHYS_SEGMENTS);
+#else
+		XIO_DBG("blk_queue_max_segments()\n");
+		blk_queue_max_segments(q, USE_MAX_PHYS_SEGMENTS);
+#endif
+#endif
+#ifdef USE_MAX_HW_SEGMENTS
+		XIO_DBG("blk_queue_max_hw_segments()\n");
+		blk_queue_max_hw_segments(q, USE_MAX_HW_SEGMENTS);
+#endif
+#ifdef USE_MAX_SEGMENT_SIZE
+		XIO_DBG("blk_queue_max_segment_size()\n");
+		blk_queue_max_segment_size(q, USE_MAX_SEGMENT_SIZE);
+#endif
+#ifdef USE_LOGICAL_BLOCK_SIZE
+		XIO_DBG("blk_queue_logical_block_size()\n");
+		blk_queue_logical_block_size(q, USE_LOGICAL_BLOCK_SIZE);
+#endif
+#ifdef USE_SEGMENT_BOUNDARY
+		XIO_DBG("blk_queue_segment_boundary()\n");
+		blk_queue_segment_boundary(q, USE_SEGMENT_BOUNDARY);
+#endif
+#ifdef QUEUE_ORDERED_DRAIN
+		XIO_DBG("blk_queue_ordered()\n");
+		blk_queue_ordered(q, QUEUE_ORDERED_DRAIN, NULL);
+#endif
+		XIO_DBG("blk_queue_bounce_limit()\n");
+		blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
+#ifndef BLK_MAX_REQUEST_COUNT
+		XIO_DBG("unplug_fn\n");
+		q->unplug_fn = if_unplug;
+#endif
+		XIO_DBG("queue_lock\n");
+		q->queue_lock = &input->req_lock; /* needed! */
+
+		input->bdev = bdget(MKDEV(disk->major, minor));
+		/* we have no partitions. we contain only ourselves. */
+		input->bdev->bd_contains = input->bdev;
+
+#ifdef USE_CONGESTED_FN
+		XIO_DBG("congested_fn\n");
+		q->backing_dev_info.congested_fn = xio_congested;
+		q->backing_dev_info.congested_data = input;
+#endif
+#ifdef USE_MERGE_BVEC
+		XIO_DBG("blk_queue_merge_bvec()\n");
+		blk_queue_merge_bvec(q, xio_merge_bvec);
+#endif
+
+		/* point of no return */
+		XIO_DBG("add_disk()\n");
+		add_disk(disk);
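+		/* The gendisk was created read-only above; only now, after
+		 * add_disk() and full queue setup, is it flipped to
+		 * read-write.
+		 */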
+		set_disk_ro(disk, false);
+
+		/* report success */
+		xio_set_power_on_led((void *)brick, true);
+		status = 0;
+	}
+
+	/* brick should be switched off */
+	if (!brick->power.button && !brick->power.off_led) {
+		int flying;
+
+		xio_set_power_on_led((void *)brick, false);
+		disk = input->disk;
+		if (!disk)
+			goto is_down;
+
+		if (atomic_read(&brick->open_count) > 0) {
+			XIO_INF("device '%s' is open %d times, cannot shutdown\n",
+				disk->disk_name,
+				atomic_read(&brick->open_count));
+			status = -EBUSY;
+			goto done; /* don't indicate "off" status */
+		}
+		flying = atomic_read(&input->flying_count);
+		if (flying > 0) {
+			XIO_INF("device '%s' has %d flying requests, cannot shutdown\n", disk->disk_name, flying);
+			status = -EBUSY;
+			goto done; /* don't indicate "off" status */
+		}
+		if (input->bdev) {
+			XIO_DBG("calling bdput()\n");
+			bdput(input->bdev);
+			input->bdev = NULL;
+		}
+		XIO_DBG("calling del_gendisk()\n");
+		del_gendisk(input->disk);
+		XIO_DBG("calling put_disk()\n");
+		put_disk(input->disk);
+		input->disk = NULL;
+		status = 0;
+is_down:
+		xio_set_power_off_led((void *)brick, true);
+	}
+
+done:
+	up(&brick->switch_sem);
+	return status;
+}
+
+/*************** interface to the outer world (kernel) **************/
+
+static int if_open(struct block_device *bdev, fmode_t mode)
+{
+	struct if_input *input;
+	struct if_brick *brick;
+
+	if (unlikely(!bdev || !bdev->bd_disk)) {
+		XIO_ERR("----------------------- INVAL ------------------------------\n");
+		return -EINVAL;
+	}
+
+	input = bdev->bd_disk->private_data;
+
+	if (unlikely(!input || !input->brick)) {
+		XIO_ERR("----------------------- BAD IF SETUP ------------------------------\n");
+		return -EINVAL;
+	}
+	brick = input->brick;
+
+	down(&brick->switch_sem);
+
+	if (unlikely(!brick->power.on_led)) {
+		XIO_INF("----------------------- BUSY %d ------------------------------\n",
+			atomic_read(&brick->open_count));
+		up(&brick->switch_sem);
+		return -EBUSY;
+	}
+
+	atomic_inc(&brick->open_count);
+
+	XIO_INF("----------------------- OPEN %d ------------------------------\n", atomic_read(&brick->open_count));
+
+	up(&brick->switch_sem);
+	return 0;
+}
+
+static
+void
+if_release(struct gendisk *gd, fmode_t mode)
+{
+	struct if_input *input = gd->private_data;
+	struct if_brick *brick = input->brick;
+	int nr;
+
+	XIO_INF("----------------------- CLOSE %d ------------------------------\n", atomic_read(&brick->open_count));
+
+	if (atomic_dec_and_test(&brick->open_count)) {
+		while ((nr = atomic_read(&input->flying_count)) > 0) {
+			XIO_INF("%d IO requests not yet completed\n", nr);
+			brick_msleep(1000);
+		}
+
+		XIO_DBG("status button=%d on_led=%d off_led=%d\n",
+			brick->power.button,
+			brick->power.on_led,
+			brick->power.off_led);
+		local_trigger();
+	}
+}
+
+static const struct block_device_operations if_blkdev_ops = {
+	.owner = THIS_MODULE,
+	.open = if_open,
+	.release = if_release,
+};
+
+/*************** informational * statistics **************/
+
+static
+char *if_statistics(struct if_brick *brick, int verbose)
+{
+	struct if_input *input = brick->inputs[0];
+	char *res = brick_string_alloc(512);
+	int tmp0 = atomic_read(&input->total_reada_count);
+	int tmp1 = atomic_read(&input->total_read_count);
+	int tmp2 = atomic_read(&input->total_aio_read_count);
+	int tmp3 = atomic_read(&input->total_write_count);
+	int tmp4 = atomic_read(&input->total_aio_write_count);
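+
+	/* The percentages below are integer ratios guarded against
+	 * division by zero: e.g. reads = 200 with aio_reads = 50
+	 * reports 25%.
+	 */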
+	snprintf(res, 512,
+		 "total reada = %d reads = %d aio_reads = %d (%d%%) writes = %d aio_writes = %d (%d%%) empty = %d fired = %d skip_sync = %d | plugged = %d flying = %d (reads = %d writes = %d)\n",
+		 tmp0,
+		 tmp1,
+		 tmp2,
+		 tmp1 ? tmp2 * 100 / tmp1 : 0,
+		 tmp3,
+		 tmp4,
+		 tmp3 ? tmp4 * 100 / tmp3 : 0,
+		 atomic_read(&input->total_empty_count),
+		 atomic_read(&input->total_fire_count),
+		 atomic_read(&input->total_skip_sync_count),
+		 atomic_read(&input->plugged_count),
+		 atomic_read(&input->flying_count),
+		 atomic_read(&input->read_flying_count),
+		 atomic_read(&input->write_flying_count));
+	return res;
+}
+
+static
+void if_reset_statistics(struct if_brick *brick)
+{
+	struct if_input *input = brick->inputs[0];
+
+	atomic_set(&input->total_read_count, 0);
+	atomic_set(&input->total_write_count, 0);
+	atomic_set(&input->total_empty_count, 0);
+	atomic_set(&input->total_fire_count, 0);
+	atomic_set(&input->total_skip_sync_count, 0);
+	atomic_set(&input->total_aio_read_count, 0);
+	atomic_set(&input->total_aio_write_count, 0);
+}
+
+/***************** own brick * input * output operations *****************/
+
+/* none */
+
+/*************** object * aspect constructors * destructors **************/
+
+static int if_aio_aspect_init_fn(struct generic_aspect *_ini)
+{
+	struct if_aio_aspect *ini = (void *)_ini;
+
+	INIT_LIST_HEAD(&ini->plug_head);
+	INIT_LIST_HEAD(&ini->hash_head);
+	return 0;
+}
+
+static void if_aio_aspect_exit_fn(struct generic_aspect *_ini)
+{
+	struct if_aio_aspect *ini = (void *)_ini;
+
+	CHECK_HEAD_EMPTY(&ini->plug_head);
+	CHECK_HEAD_EMPTY(&ini->hash_head);
+}
+
+XIO_MAKE_STATICS(if);
+
+/*********************** constructors * destructors ***********************/
+
+static int if_brick_construct(struct if_brick *brick)
+{
+	sema_init(&brick->switch_sem, 1);
+	atomic_set(&brick->open_count, 0);
+	return 0;
+}
+
+static int if_brick_destruct(struct if_brick *brick)
+{
+	return 0;
+}
+
+static int if_input_construct(struct if_input *input)
+{
+	int i;
+
+	input->hash_table = brick_block_alloc(0, PAGE_SIZE);
+	for (i = 0; i < IF_HASH_MAX; i++) {
+		spin_lock_init(&input->hash_table[i].hash_lock);
+		INIT_LIST_HEAD(&input->hash_table[i].hash_anchor);
+	}
+	INIT_LIST_HEAD(&input->plug_anchor);
+	sema_init(&input->kick_sem, 1);
+	spin_lock_init(&input->req_lock);
+	atomic_set(&input->flying_count, 0);
+	atomic_set(&input->read_flying_count, 0);
+	atomic_set(&input->write_flying_count, 0);
+	atomic_set(&input->plugged_count, 0);
+	return 0;
+}
+
+static int if_input_destruct(struct if_input *input)
+{
+	int i;
+
+	for (i = 0; i < IF_HASH_MAX; i++)
+		CHECK_HEAD_EMPTY(&input->hash_table[i].hash_anchor);
+	CHECK_HEAD_EMPTY(&input->plug_anchor);
+	brick_block_free(input->hash_table, PAGE_SIZE);
+	return 0;
+}
+
+static int if_output_construct(struct if_output *output)
+{
+	return 0;
+}
+
+/************************ static structs ***********************/
+
+static struct if_brick_ops if_brick_ops = {
+	.brick_switch = if_switch,
+	.brick_statistics = if_statistics,
+	.reset_statistics = if_reset_statistics,
+};
+
+static struct if_output_ops if_output_ops;
+
+const struct if_input_type if_input_type = {
+	.type_name = "if_input",
+	.input_size = sizeof(struct if_input),
+	.input_construct = &if_input_construct,
+	.input_destruct = &if_input_destruct,
+};
+
+static const struct if_input_type *if_input_types[] = {
+	&if_input_type,
+};
+
+const struct if_output_type if_output_type = {
+	.type_name = "if_output",
+	.output_size = sizeof(struct if_output),
+	.master_ops = &if_output_ops,
+	.output_construct = &if_output_construct,
+};
+
+static const struct if_output_type *if_output_types[] = {
+	&if_output_type,
+};
+
+const struct if_brick_type if_brick_type = {
+	.type_name = "if_brick",
+	.brick_size = sizeof(struct if_brick),
+	.max_inputs = 1,
+	.max_outputs = 0,
+	.master_ops = &if_brick_ops,
+	.aspect_types = if_aspect_types,
+	.default_input_types = if_input_types,
+	.default_output_types = if_output_types,
+	.brick_construct = &if_brick_construct,
+	.brick_destruct = &if_brick_destruct,
+};
+EXPORT_SYMBOL_GPL(if_brick_type);
+
+/***************** module init stuff ************************/
+
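+/* Teardown mirrors bring-up in reverse order: the brick type is
+ * unregistered before the "xio" block major is released.
+ */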
+void exit_xio_if(void)
+{
+	int status;
+
+	XIO_INF("exit_if()\n");
+	status = if_unregister_brick_type();
+	unregister_blkdev(XIO_MAJOR, "xio");
+}
+
+int __init init_xio_if(void)
+{
+	int status;
+
+	(void)if_aspect_types; /* not used, shut up gcc */
+
+	XIO_INF("init_if()\n");
+	status = register_blkdev(XIO_MAJOR, "xio");
+	if (status)
+		return status;
+	status = if_register_brick_type();
+	if (status)
+		goto err_device;
+	return status;
+err_device:
+	XIO_ERR("init_if() status=%d\n", status);
+	exit_xio_if();
+	return status;
+}
-- 
2.0.0