Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1757520AbZAMXLg (ORCPT ); Tue, 13 Jan 2009 18:11:36 -0500 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1756249AbZAMXKi (ORCPT ); Tue, 13 Jan 2009 18:10:38 -0500 Received: from relanium.yandex.ru ([77.88.58.132]:6941 "EHLO relanium.yandex.ru" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1755284AbZAMXKe (ORCPT ); Tue, 13 Jan 2009 18:10:34 -0500 X-Greylist: delayed 349 seconds by postgrey-1.27 at vger.kernel.org; Tue, 13 Jan 2009 18:10:26 EST From: Evgeniy Polyakov To: Greg KH Cc: linux-kernel@vger.kernel.org, dst@ioremap.net, Evgeniy Polyakov Subject: [5/7] dst: transactions. Date: Wed, 14 Jan 2009 02:05:31 +0300 Message-Id: <1231887933-17843-6-git-send-email-zbr@ioremap.net> X-Mailer: git-send-email 1.5.6.3 In-Reply-To: <1231887933-17843-5-git-send-email-zbr@ioremap.net> References: <1231887933-17843-1-git-send-email-zbr@ioremap.net> <1231887933-17843-2-git-send-email-zbr@ioremap.net> <1231887933-17843-3-git-send-email-zbr@ioremap.net> <1231887933-17843-4-git-send-email-zbr@ioremap.net> <1231887933-17843-5-git-send-email-zbr@ioremap.net> X-Antivirus: Dr.Web (R) for Mail Servers on relanium.yandex.ru host X-Antivirus-Code: 100000 Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 9931 Lines: 370 DST uses transaction model, when each store has to be explicitly acked from the remote node to be considered as successfully written. There may be lots of in-flight transactions. When remote host does not ack the transaction it will be resent predefined number of times with specified timeouts between them. All those parameters are configurable. Transactions are marked as failed after all resends completed unsuccessfully, having long enough resend timeout and/or large number of resends allows not to return error to the higher (FS usually) layer in case of short network problems or remote node outages. In case of network RAID setup this means that storage will not degrade until transactions are marked as failed, and thus will not force checksum recalculation and data rebuild. In case of connection failure DST will try to reconnect to the remote node automatically. DST sends ping commands at idle time to detect if remote node is alive. Because of transactional model it is possible to use zero-copy sending without worry of data corruption (which in turn could be detected by the strong checksums though). Transactions are handled in this patch: allocation/freeing/completion, scanning for stall and to-be-resent transactions and overall management of the storing tree. Signed-off-by: Evgeniy Polyakov diff --git a/drivers/staging/dst/trans.c b/drivers/staging/dst/trans.c new file mode 100644 index 0000000..557d372 --- /dev/null +++ b/drivers/staging/dst/trans.c @@ -0,0 +1,335 @@ +/* + * 2007+ Copyright (c) Evgeniy Polyakov + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include + +/* + * Transaction memory pool size. + */ +static int dst_mempool_num = 32; +module_param(dst_mempool_num, int, 0644); + +/* + * Transaction tree management. + */ +static inline int dst_trans_cmp(dst_gen_t gen, dst_gen_t new) +{ + if (gen < new) + return 1; + if (gen > new) + return -1; + return 0; +} + +struct dst_trans *dst_trans_search(struct dst_node *node, dst_gen_t gen) +{ + struct rb_root *root = &node->trans_root; + struct rb_node *n = root->rb_node; + struct dst_trans *t, *ret = NULL; + int cmp; + + while (n) { + t = rb_entry(n, struct dst_trans, trans_entry); + + cmp = dst_trans_cmp(t->gen, gen); + if (cmp < 0) + n = n->rb_left; + else if (cmp > 0) + n = n->rb_right; + else { + ret = t; + break; + } + } + + dprintk("%s: %s transaction: id: %llu.\n", __func__, + (ret)?"found":"not found", gen); + + return ret; +} + +static int dst_trans_insert(struct dst_trans *new) +{ + struct rb_root *root = &new->n->trans_root; + struct rb_node **n = &root->rb_node, *parent = NULL; + struct dst_trans *ret = NULL, *t; + int cmp; + + while (*n) { + parent = *n; + + t = rb_entry(parent, struct dst_trans, trans_entry); + + cmp = dst_trans_cmp(t->gen, new->gen); + if (cmp < 0) + n = &parent->rb_left; + else if (cmp > 0) + n = &parent->rb_right; + else { + ret = t; + break; + } + } + + new->send_time = jiffies; + if (ret) { + printk("%s: exist: old: gen: %llu, bio: %llu/%u, send_time: %lu, " + "new: gen: %llu, bio: %llu/%u, send_time: %lu.\n", + __func__, + ret->gen, (u64)ret->bio->bi_sector, + ret->bio->bi_size, ret->send_time, + new->gen, (u64)new->bio->bi_sector, + new->bio->bi_size, new->send_time); + return -EEXIST; + } + + rb_link_node(&new->trans_entry, parent, n); + rb_insert_color(&new->trans_entry, root); + + dprintk("%s: inserted: gen: %llu, bio: %llu/%u, send_time: %lu.\n", + __func__, new->gen, (u64)new->bio->bi_sector, + new->bio->bi_size, new->send_time); + + return 0; +} + +int dst_trans_remove_nolock(struct dst_trans *t) +{ + struct dst_node *n = t->n; + + if (t->trans_entry.rb_parent_color) { + rb_erase(&t->trans_entry, &n->trans_root); + t->trans_entry.rb_parent_color = 0; + } + return 0; +} + +int dst_trans_remove(struct dst_trans *t) +{ + int ret; + struct dst_node *n = t->n; + + mutex_lock(&n->trans_lock); + ret = dst_trans_remove_nolock(t); + mutex_unlock(&n->trans_lock); + + return ret; +} + +/* + * When transaction is completed and there are no more users, + * we complete appriate block IO request with given error status. + */ +void dst_trans_put(struct dst_trans *t) +{ + if (atomic_dec_and_test(&t->refcnt)) { + struct bio *bio = t->bio; + + dprintk("%s: completed t: %p, gen: %llu, bio: %p.\n", + __func__, t, t->gen, bio); + + bio_endio(bio, t->error); + bio_put(bio); + + dst_node_put(t->n); + mempool_free(t, t->n->trans_pool); + } +} + +/* + * Process given block IO request: allocate transaction, insert it into the tree + * and send/schedule crypto processing. + */ +int dst_process_bio(struct dst_node *n, struct bio *bio) +{ + struct dst_trans *t; + int err = -ENOMEM; + + t = mempool_alloc(n->trans_pool, GFP_NOFS); + if (!t) + goto err_out_exit; + + t->n = dst_node_get(n); + t->bio = bio; + t->error = 0; + t->retries = 0; + atomic_set(&t->refcnt, 1); + t->gen = atomic_long_inc_return(&n->gen); + + t->enc = bio_data_dir(bio); + dst_bio_to_cmd(bio, &t->cmd, DST_IO, t->gen); + + mutex_lock(&n->trans_lock); + err = dst_trans_insert(t); + mutex_unlock(&n->trans_lock); + if (err) + goto err_out_free; + + dprintk("%s: gen: %llu, bio: %llu/%u, dir/enc: %d, need_crypto: %d.\n", + __func__, t->gen, (u64)bio->bi_sector, + bio->bi_size, t->enc, dst_need_crypto(n)); + + if (dst_need_crypto(n) && t->enc) + dst_trans_crypto(t); + else + dst_trans_send(t); + + return 0; + +err_out_free: + dst_node_put(n); + mempool_free(t, n->trans_pool); +err_out_exit: + bio_endio(bio, err); + bio_put(bio); + return err; +} + +/* + * Scan for timeout/stale transactions. + * Each transaction is being resent multiple times before error completion. + */ +static void dst_trans_scan(struct work_struct *work) +{ + struct dst_node *n = container_of(work, struct dst_node, trans_work.work); + struct rb_node *rb_node; + struct dst_trans *t; + unsigned long timeout = n->trans_scan_timeout; + int num = 10 * n->trans_max_retries; + + mutex_lock(&n->trans_lock); + + for (rb_node = rb_first(&n->trans_root); rb_node; ) { + t = rb_entry(rb_node, struct dst_trans, trans_entry); + + if (timeout && time_after(t->send_time + timeout, jiffies) + && t->retries == 0) + break; +#if 0 + dprintk("%s: t: %p, gen: %llu, n: %s, retries: %u, max: %u.\n", + __func__, t, t->gen, n->name, + t->retries, n->trans_max_retries); +#endif + if (--num == 0) + break; + + dst_trans_get(t); + + rb_node = rb_next(rb_node); + + if (timeout && (++t->retries < n->trans_max_retries)) { + dst_trans_send(t); + } else { + t->error = -ETIMEDOUT; + dst_trans_remove_nolock(t); + dst_trans_put(t); + } + + dst_trans_put(t); + } + + mutex_unlock(&n->trans_lock); + + /* + * If no timeout specified then system is in the middle of exiting process, + * so no need to reschedule scanning process again. + */ + if (timeout) { + if (!num) + timeout = HZ; + schedule_delayed_work(&n->trans_work, timeout); + } +} + +/* + * Flush all transactions and mark them as timed out. + * Destroy transaction pools. + */ +void dst_node_trans_exit(struct dst_node *n) +{ + struct dst_trans *t; + struct rb_node *rb_node; + + if (!n->trans_cache) + return; + + dprintk("%s: n: %p, cancelling the work.\n", __func__, n); + cancel_delayed_work_sync(&n->trans_work); + flush_scheduled_work(); + dprintk("%s: n: %p, work has been cancelled.\n", __func__, n); + + for (rb_node = rb_first(&n->trans_root); rb_node; ) { + t = rb_entry(rb_node, struct dst_trans, trans_entry); + + dprintk("%s: t: %p, gen: %llu, n: %s.\n", + __func__, t, t->gen, n->name); + + rb_node = rb_next(rb_node); + + t->error = -ETIMEDOUT; + dst_trans_remove_nolock(t); + dst_trans_put(t); + } + + mempool_destroy(n->trans_pool); + kmem_cache_destroy(n->trans_cache); +} + +/* + * Initialize transaction storage for given node. + * Transaction stores not only control information, + * but also network command and crypto data (if needed) + * to reduce number of allocations. Thus transaction size + * differs from node to node. + */ +int dst_node_trans_init(struct dst_node *n, unsigned int size) +{ + /* + * We need this, since node with given name can be dropped from the + * hash table, but be still alive, so subsequent creation of the node + * with the same name may collide with existing cache name. + */ + + snprintf(n->cache_name, sizeof(n->cache_name), "%s-%p", n->name, n); + + n->trans_cache = kmem_cache_create(n->cache_name, + size + n->crypto.crypto_attached_size, + 0, 0, NULL); + if (!n->trans_cache) + goto err_out_exit; + + n->trans_pool = mempool_create_slab_pool(dst_mempool_num, n->trans_cache); + if (!n->trans_pool) + goto err_out_cache_destroy; + + mutex_init(&n->trans_lock); + n->trans_root = RB_ROOT; + + INIT_DELAYED_WORK(&n->trans_work, dst_trans_scan); + schedule_delayed_work(&n->trans_work, n->trans_scan_timeout); + + dprintk("%s: n: %p, size: %u, crypto: %u.\n", + __func__, n, size, n->crypto.crypto_attached_size); + + return 0; + +err_out_cache_destroy: + kmem_cache_destroy(n->trans_cache); +err_out_exit: + return -ENOMEM; +} -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/