The core dlm functions. Processes dlm_lock() and dlm_unlock() requests.
Creates lockspaces which give applications separate contexts/namespaces in
which to do their locking. Manages locks on resources' grant/convert/wait
queues. Sends and receives high level locking operations between nodes.
Delivers completion and blocking callbacks (ast's) to lock holders.
Manages the distributed directory that tracks the current master node for
each resource.
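
A minimal usage sketch (illustrative only, not part of this patch): it
assumes a lockspace handle "ls" obtained from the lockspace API and a
caller-defined completion callback my_ast().

    struct dlm_lksb lksb;
    int error;

    /* request an EX lock on the 4-byte resource name "res1"; my_ast()
       runs when the operation completes and lksb.sb_status holds the
       result.  The lksb must remain valid until my_ast() has run. */
    error = dlm_lock(ls, DLM_LOCK_EX, &lksb, 0, "res1", 4, 0,
                     my_ast, &lksb, NULL, NULL);

    /* release it later using the lock id returned in the lksb */
    error = dlm_unlock(ls, lksb.sb_lkid, 0, &lksb, &lksb);
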
Signed-Off-By: Dave Teigland <[email protected]>
Signed-Off-By: Patrick Caulfield <[email protected]>
---
drivers/dlm/lock.c | 3546 +++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 3546 insertions(+)
--- a/drivers/dlm/lock.c 1970-01-01 07:30:00.000000000 +0730
+++ b/drivers/dlm/lock.c 2005-04-25 22:52:03.924821624 +0800
@@ -0,0 +1,3546 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "memory.h"
+#include "lowcomms.h"
+#include "requestqueue.h"
+#include "util.h"
+#include "dir.h"
+#include "member.h"
+#include "lockspace.h"
+#include "ast.h"
+#include "lock.h"
+#include "rcom.h"
+#include "recover.h"
+#include "lvb_table.h"
+
+/* Central locking logic has four stages:
+
+ dlm_lock()
+ dlm_unlock()
+
+ request_lock(ls, lkb)
+ convert_lock(ls, lkb)
+ unlock_lock(ls, lkb)
+ cancel_lock(ls, lkb)
+
+ _request_lock(r, lkb)
+ _convert_lock(r, lkb)
+ _unlock_lock(r, lkb)
+ _cancel_lock(r, lkb)
+
+ do_request(r, lkb)
+ do_convert(r, lkb)
+ do_unlock(r, lkb)
+ do_cancel(r, lkb)
+
+
+ Stage 1 (lock, unlock) is mainly about checking input args and
+ splitting into one of the four main operations:
+
+ dlm_lock = request_lock
+ dlm_lock+CONVERT = convert_lock
+ dlm_unlock = unlock_lock
+ dlm_unlock+CANCEL = cancel_lock
+
+ Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is
+ provided to the next stage.
+
+ Stage 3, _xxxx_lock(), determines if the operation is local or remote.
+ When remote, it calls send_xxxx(), when local it calls do_xxxx().
+
+ Stage 4, do_xxxx(), is the guts of the operation. It manipulates the
+ given rsb and lkb and queues callbacks.
+
+
+ For remote operations, the send_xxxx() results in the corresponding
+ do_xxxx() function being executed on the remote node. The connecting
+ send/receive calls on local (L) and remote (R) nodes:
+
+ L: send_xxxx() -> R: receive_xxxx()
+ R: do_xxxx()
+ L: receive_xxxx_reply() <- R: send_xxxx_reply()
+*/
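+
+/* As an illustrative trace of the stages above, a dlm_unlock() call with
+   the CANCEL flag on a remotely mastered rsb runs:
+
+   L: dlm_unlock+CANCEL -> cancel_lock -> _cancel_lock -> send_cancel
+   R: receive_cancel -> do_cancel -> send_cancel_reply
+   L: receive_cancel_reply
+*/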
+
+static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
+ int len, struct dlm_args *args);
+static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_args *args);
+static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_args *args);
+static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_args *args);
+
+static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+
+static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
+
+static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
+static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_remove(struct dlm_rsb *r);
+
+
+/*
+ * Lock compatibility matrix - thanks Steve
+ * UN = Unlocked state. Not really a state, used as a flag
+ * PD = Padding. Used to make the matrix a nice power of two in size
+ * Other states are the same as the VMS DLM.
+ * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
+ */
+
+const int __dlm_compat_matrix[8][8] = {
+ /* UN NL CR CW PR PW EX PD */
+ {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
+ {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
+ {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
+ {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
+ {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
+ {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
+ {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
+ {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
+};
+
+#define modes_compat(gr, rq) \
+ __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
+
+int dlm_modes_compat(int mode1, int mode2)
+{
+ return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
+}
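+
+/* A few worked lookups (illustrative, read straight from the matrix above):
+   dlm_modes_compat(DLM_LOCK_PR, DLM_LOCK_PR) == 1   shared readers coexist
+   dlm_modes_compat(DLM_LOCK_CR, DLM_LOCK_PW) == 1   CR tolerates a PW writer
+   dlm_modes_compat(DLM_LOCK_EX, DLM_LOCK_PR) == 0   EX is compatible with
+   nothing but NL */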
+
+/*
+ * Compatibility matrix for conversions with QUECVT set.
+ * Granted mode is the row; requested mode is the column.
+ * Usage: matrix[grmode+1][rqmode+1]
+ */
+
+const int __quecvt_compat_matrix[8][8] = {
+ /* UN NL CR CW PR PW EX PD */
+ {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
+ {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
+ {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
+ {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
+ {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
+ {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
+ {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
+ {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
+};
+
+void dlm_print_lkb(struct dlm_lkb *lkb)
+{
+ printk("lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
+ " status %d rqmode %d grmode %d wait_type %d ast_type %d\n",
+ lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
+ lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
+ lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);
+}
+
+void dlm_print_rsb(struct dlm_rsb *r)
+{
+ printk("rsb: nodeid %d flags %lx trial %x name %s\n",
+ r->res_nodeid, r->res_flags, r->res_trial_lkid, r->res_name);
+}
+
+/* Threads cannot use the lockspace while it's being recovered */
+
+static void lock_recovery(struct dlm_ls *ls)
+{
+ down_read(&ls->ls_in_recovery);
+}
+
+static void unlock_recovery(struct dlm_ls *ls)
+{
+ up_read(&ls->ls_in_recovery);
+}
+
+static int lock_recovery_try(struct dlm_ls *ls)
+{
+ return down_read_trylock(&ls->ls_in_recovery);
+}
+
+static int can_be_queued(struct dlm_lkb *lkb)
+{
+ return (!(lkb->lkb_exflags & DLM_LKF_NOQUEUE));
+}
+
+static int force_blocking_asts(struct dlm_lkb *lkb)
+{
+ return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST);
+}
+
+static int is_demoted(struct dlm_lkb *lkb)
+{
+ return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
+}
+
+static int is_remote(struct dlm_rsb *r)
+{
+ DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
+ return r->res_nodeid ? TRUE : FALSE;
+}
+
+static int is_master(struct dlm_rsb *r)
+{
+ return r->res_nodeid ? FALSE : TRUE;
+}
+
+int dlm_is_master(struct dlm_rsb *r)
+{
+ return r->res_nodeid ? FALSE : TRUE;
+}
+
+static int is_process_copy(struct dlm_lkb *lkb)
+{
+ return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY));
+}
+
+static int is_master_copy(struct dlm_lkb *lkb)
+{
+ if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+ DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb););
+ return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? TRUE : FALSE;
+}
+
+static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+ if (is_master_copy(lkb))
+ return;
+
+ DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
+
+ lkb->lkb_lksb->sb_status = rv;
+ lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
+
+ dlm_add_ast(lkb, AST_COMP);
+}
+
+static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
+{
+ if (is_master_copy(lkb))
+ send_bast(r, lkb, rqmode);
+ else {
+ lkb->lkb_bastmode = rqmode;
+ dlm_add_ast(lkb, AST_BAST);
+ }
+}
+
+static int dir_remove(struct dlm_rsb *r)
+{
+ int to_nodeid = dlm_dir_nodeid(r);
+
+ if (to_nodeid != dlm_our_nodeid())
+ send_remove(r);
+ else
+ dlm_dir_remove_entry(r->res_ls, to_nodeid,
+ r->res_name, r->res_length);
+ return 0;
+}
+
+
+/*
+ * Basic operations on rsb's and lkb's
+ */
+
+static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
+{
+ struct dlm_rsb *r;
+
+ r = allocate_rsb(ls, len);
+ if (!r)
+ return NULL;
+
+ r->res_ls = ls;
+ r->res_length = len;
+ memcpy(r->res_name, name, len);
+ init_MUTEX(&r->res_sem);
+
+ INIT_LIST_HEAD(&r->res_lookup);
+ INIT_LIST_HEAD(&r->res_grantqueue);
+ INIT_LIST_HEAD(&r->res_convertqueue);
+ INIT_LIST_HEAD(&r->res_waitqueue);
+ INIT_LIST_HEAD(&r->res_root_list);
+ INIT_LIST_HEAD(&r->res_recover_list);
+
+ return r;
+}
+
+static int search_rsb_list(struct list_head *head, char *name, int len,
+ unsigned int flags, struct dlm_rsb **r_ret)
+{
+ struct dlm_rsb *r;
+ int error = 0;
+
+ list_for_each_entry(r, head, res_hashchain) {
+ if (len == r->res_length && !memcmp(name, r->res_name, len))
+ goto found;
+ }
+ return -ENOENT;
+
+ found:
+ if (r->res_nodeid && (flags & R_MASTER))
+ error = -ENOTBLK;
+ *r_ret = r;
+ return error;
+}
+
+static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
+ unsigned int flags, struct dlm_rsb **r_ret)
+{
+ struct dlm_rsb *r;
+ int error;
+
+ error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
+ if (!error) {
+ kref_get(&r->res_ref);
+ goto out;
+ }
+ error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
+ if (!error) {
+ list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);
+
+ if (r->res_nodeid == -1) {
+ clear_bit(RESFL_MASTER_WAIT, &r->res_flags);
+ clear_bit(RESFL_MASTER_UNCERTAIN, &r->res_flags);
+ r->res_trial_lkid = 0;
+ } else if (r->res_nodeid > 0) {
+ clear_bit(RESFL_MASTER_WAIT, &r->res_flags);
+ set_bit(RESFL_MASTER_UNCERTAIN, &r->res_flags);
+ r->res_trial_lkid = 0;
+ } else {
+ DLM_ASSERT(r->res_nodeid == 0,
+ dlm_print_rsb(r););
+ DLM_ASSERT(!test_bit(RESFL_MASTER_WAIT, &r->res_flags),
+ dlm_print_rsb(r););
+ DLM_ASSERT(!test_bit(RESFL_MASTER_UNCERTAIN,
+ &r->res_flags),);
+ }
+ }
+ out:
+ *r_ret = r;
+ return error;
+}
+
+static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
+ unsigned int flags, struct dlm_rsb **r_ret)
+{
+ int error;
+ write_lock(&ls->ls_rsbtbl[b].lock);
+ error = _search_rsb(ls, name, len, b, flags, r_ret);
+ write_unlock(&ls->ls_rsbtbl[b].lock);
+ return error;
+}
+
+/*
+ * Find rsb in rsbtbl and potentially create/add one
+ *
+ * Delaying the release of rsb's has a similar benefit to applications keeping
+ * NL locks on an rsb, but without the guarantee that the cached master value
+ * will still be valid when the rsb is reused. Apps aren't always smart enough
+ * to keep NL locks on an rsb that they may lock again shortly; this can lead
+ * to excessive master lookups and removals if we don't delay the release.
+ *
+ * Searching for an rsb means looking through both the normal list and toss
+ * list. When found on the toss list the rsb is moved to the normal list with
+ * ref count of 1; when found on normal list the ref count is incremented.
+ */
+
+static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
+ unsigned int flags, struct dlm_rsb **r_ret)
+{
+ struct dlm_rsb *r, *tmp;
+ uint32_t bucket;
+ int error = 0;
+
+ bucket = dlm_hash(name, namelen);
+ bucket &= (ls->ls_rsbtbl_size - 1);
+
+ error = search_rsb(ls, name, namelen, bucket, flags, &r);
+ if (!error)
+ goto out;
+
+ if (error == -ENOENT && !(flags & R_CREATE))
+ goto out;
+
+ /* the rsb was found but wasn't a master copy */
+ if (error == -ENOTBLK)
+ goto out;
+
+ error = -ENOMEM;
+ r = create_rsb(ls, name, namelen);
+ if (!r)
+ goto out;
+
+ r->res_bucket = bucket;
+ r->res_nodeid = -1;
+ kref_init(&r->res_ref);
+
+ write_lock(&ls->ls_rsbtbl[bucket].lock);
+ error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
+ if (!error) {
+ write_unlock(&ls->ls_rsbtbl[bucket].lock);
+ free_rsb(r);
+ r = tmp;
+ goto out;
+ }
+ list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
+ write_unlock(&ls->ls_rsbtbl[bucket].lock);
+ error = 0;
+ out:
+ *r_ret = r;
+ return error;
+}
+
+int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
+ unsigned int flags, struct dlm_rsb **r_ret)
+{
+ return find_rsb(ls, name, namelen, flags, r_ret);
+}
+
+/* This is only called to add a reference when the code already holds
+ a valid reference to the rsb, so there's no need for locking. */
+
+static void hold_rsb(struct dlm_rsb *r)
+{
+ kref_get(&r->res_ref);
+}
+
+void dlm_hold_rsb(struct dlm_rsb *r)
+{
+ hold_rsb(r);
+}
+
+static void toss_rsb(struct kref *kref)
+{
+ struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
+ struct dlm_ls *ls = r->res_ls;
+
+ DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
+ kref_init(&r->res_ref);
+ list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
+ r->res_toss_time = jiffies;
+ if (r->res_lvbptr) {
+ free_lvb(r->res_lvbptr);
+ r->res_lvbptr = NULL;
+ }
+}
+
+/* When all references to the rsb are gone it's transferred to
+ the tossed list for later disposal. */
+
+static void put_rsb(struct dlm_rsb *r)
+{
+ struct dlm_ls *ls = r->res_ls;
+ uint32_t bucket = r->res_bucket;
+
+ write_lock(&ls->ls_rsbtbl[bucket].lock);
+ kref_put(&r->res_ref, toss_rsb);
+ write_unlock(&ls->ls_rsbtbl[bucket].lock);
+}
+
+void dlm_put_rsb(struct dlm_rsb *r)
+{
+ put_rsb(r);
+}
+
+/* See comment for unhold_lkb */
+
+static void unhold_rsb(struct dlm_rsb *r)
+{
+ int rv;
+ rv = kref_put(&r->res_ref, toss_rsb);
+ DLM_ASSERT(!rv, dlm_print_rsb(r););
+}
+
+static void kill_rsb(struct kref *kref)
+{
+ struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
+
+ /* All work is done after the return from kref_put() so we
+ can release the write_lock before the remove and free. */
+
+ DLM_ASSERT(list_empty(&r->res_lookup),);
+ DLM_ASSERT(list_empty(&r->res_grantqueue),);
+ DLM_ASSERT(list_empty(&r->res_convertqueue),);
+ DLM_ASSERT(list_empty(&r->res_waitqueue),);
+ DLM_ASSERT(list_empty(&r->res_root_list),);
+ DLM_ASSERT(list_empty(&r->res_recover_list),);
+}
+
+/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is
+ found since they are in order of newest to oldest? */
+
+static int shrink_bucket(struct dlm_ls *ls, int b)
+{
+ struct dlm_rsb *r;
+ int count = 0, found;
+
+ for (;;) {
+ found = FALSE;
+ write_lock(&ls->ls_rsbtbl[b].lock);
+ list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
+ res_hashchain) {
+ if (!time_after_eq(jiffies, r->res_toss_time +
+ DLM_TOSS_SECS * HZ))
+ continue;
+ found = TRUE;
+ break;
+ }
+
+ if (!found) {
+ write_unlock(&ls->ls_rsbtbl[b].lock);
+ break;
+ }
+
+ if (kref_put(&r->res_ref, kill_rsb)) {
+ list_del(&r->res_hashchain);
+ write_unlock(&ls->ls_rsbtbl[b].lock);
+
+ if (is_master(r))
+ dir_remove(r);
+ free_rsb(r);
+ count++;
+ } else {
+ write_unlock(&ls->ls_rsbtbl[b].lock);
+ log_error(ls, "tossed rsb in use %s", r->res_name);
+ }
+ }
+
+ return count;
+}
+
+void dlm_scan_rsbs(struct dlm_ls *ls)
+{
+ int i, count = 0;
+
+ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags))
+ return;
+
+ for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+ count += shrink_bucket(ls, i);
+ cond_resched();
+ }
+}
+
+/* exclusive access to rsb and all its locks */
+
+static void lock_rsb(struct dlm_rsb *r)
+{
+ down(&r->res_sem);
+}
+
+static void unlock_rsb(struct dlm_rsb *r)
+{
+ up(&r->res_sem);
+}
+
+void dlm_lock_rsb(struct dlm_rsb *r)
+{
+ lock_rsb(r);
+}
+
+void dlm_unlock_rsb(struct dlm_rsb *r)
+{
+ unlock_rsb(r);
+}
+
+/* Attaching/detaching lkb's from rsb's is for rsb reference counting.
+ The rsb must exist as long as any lkb's for it do. */
+
+static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ hold_rsb(r);
+ lkb->lkb_resource = r;
+}
+
+static void detach_lkb(struct dlm_lkb *lkb)
+{
+ if (lkb->lkb_resource) {
+ put_rsb(lkb->lkb_resource);
+ lkb->lkb_resource = NULL;
+ }
+}
+
+static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
+{
+ struct dlm_lkb *lkb;
+ uint32_t lkid;
+ uint16_t bucket;
+
+ lkb = allocate_lkb(ls);
+ if (!lkb)
+ return -ENOMEM;
+
+ lkb->lkb_nodeid = -1;
+ lkb->lkb_grmode = DLM_LOCK_IV;
+ kref_init(&lkb->lkb_ref);
+
+ get_random_bytes(&bucket, sizeof(bucket));
+ bucket &= (ls->ls_lkbtbl_size - 1);
+
+ write_lock(&ls->ls_lkbtbl[bucket].lock);
+ lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
+ /* FIXME: do a find to verify lkid not in use */
+
+ DLM_ASSERT(lkid, );
+
+ lkb->lkb_id = lkid;
+ list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
+ write_unlock(&ls->ls_lkbtbl[bucket].lock);
+
+ *lkb_ret = lkb;
+ return 0;
+}
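+
+/* A note on the lkid layout built above (illustrative): the low 16 bits are
+   the lkbtbl bucket and the high 16 bits are that bucket's counter, so e.g.
+   bucket 0x000a with counter 3 gives lkid 0x0003000a.  find_lkb() below
+   recovers the bucket with (lkid & 0xFFFF). */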
+
+static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
+{
+ uint16_t bucket = lkid & 0xFFFF;
+ struct dlm_lkb *lkb;
+
+ list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) {
+ if (lkb->lkb_id == lkid)
+ return lkb;
+ }
+ return NULL;
+}
+
+static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
+{
+ struct dlm_lkb *lkb;
+ uint16_t bucket = lkid & 0xFFFF;
+
+ if (bucket >= ls->ls_lkbtbl_size)
+ return -EBADSLT;
+
+ read_lock(&ls->ls_lkbtbl[bucket].lock);
+ lkb = __find_lkb(ls, lkid);
+ if (lkb)
+ kref_get(&lkb->lkb_ref);
+ read_unlock(&ls->ls_lkbtbl[bucket].lock);
+
+ *lkb_ret = lkb;
+ return lkb ? 0 : -ENOENT;
+}
+
+static void kill_lkb(struct kref *kref)
+{
+ struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);
+
+ /* All work is done after the return from kref_put() so we
+ can release the write_lock before the detach_lkb */
+
+ DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
+}
+
+static int put_lkb(struct dlm_lkb *lkb)
+{
+ struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+ uint16_t bucket = lkb->lkb_id & 0xFFFF;
+
+ write_lock(&ls->ls_lkbtbl[bucket].lock);
+ if (kref_put(&lkb->lkb_ref, kill_lkb)) {
+ list_del(&lkb->lkb_idtbl_list);
+ write_unlock(&ls->ls_lkbtbl[bucket].lock);
+
+ detach_lkb(lkb);
+
+ /* for local/process lkbs, lvbptr points to caller's lksb */
+ if (lkb->lkb_lvbptr && is_master_copy(lkb))
+ free_lvb(lkb->lkb_lvbptr);
+ if (lkb->lkb_range)
+ free_range(lkb->lkb_range);
+ free_lkb(lkb);
+ return 1;
+ } else {
+ write_unlock(&ls->ls_lkbtbl[bucket].lock);
+ return 0;
+ }
+}
+
+int dlm_put_lkb(struct dlm_lkb *lkb)
+{
+ return put_lkb(lkb);
+}
+
+/* This is only called to add a reference when the code already holds
+ a valid reference to the lkb, so there's no need for locking. */
+
+static void hold_lkb(struct dlm_lkb *lkb)
+{
+ kref_get(&lkb->lkb_ref);
+}
+
+/* This is called when we need to remove a reference and are certain
+ it's not the last ref. e.g. del_lkb is always called between a
+ find_lkb/put_lkb and is always the inverse of a previous add_lkb.
+ put_lkb would work fine, but would involve unnecessary locking */
+
+static void unhold_lkb(struct dlm_lkb *lkb)
+{
+ int rv;
+ rv = kref_put(&lkb->lkb_ref, kill_lkb);
+ DLM_ASSERT(!rv, dlm_print_lkb(lkb););
+}
+
+static void lkb_add_ordered(struct list_head *new, struct list_head *head,
+ int mode)
+{
+ struct dlm_lkb *lkb = NULL;
+
+ list_for_each_entry(lkb, head, lkb_statequeue)
+ if (lkb->lkb_rqmode < mode)
+ break;
+
+ if (!lkb)
+ list_add_tail(new, head);
+ else
+ __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
+}
+
+/* add/remove lkb to rsb's grant/convert/wait queue */
+
+static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
+{
+ kref_get(&lkb->lkb_ref);
+
+ DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
+
+ lkb->lkb_status = status;
+
+ switch (status) {
+ case DLM_LKSTS_WAITING:
+ if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
+ list_add(&lkb->lkb_statequeue, &r->res_waitqueue);
+ else
+ list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
+ break;
+ case DLM_LKSTS_GRANTED:
+ /* convention says granted locks kept in order of grmode */
+ lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
+ lkb->lkb_grmode);
+ break;
+ case DLM_LKSTS_CONVERT:
+ if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
+ list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
+ else
+ list_add_tail(&lkb->lkb_statequeue,
+ &r->res_convertqueue);
+ break;
+ default:
+ DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status););
+ }
+}
+
+static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ lkb->lkb_status = 0;
+ list_del(&lkb->lkb_statequeue);
+ unhold_lkb(lkb);
+}
+
+static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
+{
+ hold_lkb(lkb);
+ del_lkb(r, lkb);
+ add_lkb(r, lkb, sts);
+ unhold_lkb(lkb);
+}
+
+/* add/remove lkb from global waiters list of lkb's waiting for
+ a reply from a remote node */
+
+static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
+{
+ struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+
+ down(&ls->ls_waiters_sem);
+ if (lkb->lkb_wait_type) {
+ printk("add_to_waiters error %d", lkb->lkb_wait_type);
+ goto out;
+ }
+ lkb->lkb_wait_type = mstype;
+ kref_get(&lkb->lkb_ref);
+ list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
+ out:
+ up(&ls->ls_waiters_sem);
+}
+
+static int _remove_from_waiters(struct dlm_lkb *lkb)
+{
+ int error = 0;
+
+ if (!lkb->lkb_wait_type) {
+ printk("remove_from_waiters error");
+ error = -EINVAL;
+ goto out;
+ }
+ lkb->lkb_wait_type = 0;
+ list_del(&lkb->lkb_wait_reply);
+ unhold_lkb(lkb);
+ out:
+ return error;
+}
+
+static int remove_from_waiters(struct dlm_lkb *lkb)
+{
+ struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+ int error;
+
+ down(&ls->ls_waiters_sem);
+ error = _remove_from_waiters(lkb);
+ up(&ls->ls_waiters_sem);
+ return error;
+}
+
+int dlm_remove_from_waiters(struct dlm_lkb *lkb)
+{
+ return remove_from_waiters(lkb);
+}
+
+static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
+ int namelen, uint32_t parent_lkid, void *ast,
+ void *astarg, void *bast, struct dlm_range *range,
+ struct dlm_args *args)
+{
+ int rv = -EINVAL;
+
+ /* check for invalid arg usage */
+
+ if (mode < 0 || mode > DLM_LOCK_EX)
+ goto out;
+
+ if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
+ goto out;
+
+ if (flags & DLM_LKF_CANCEL)
+ goto out;
+
+ if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
+ goto out;
+
+ if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
+ goto out;
+
+ if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
+ goto out;
+
+ if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
+ goto out;
+
+ if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
+ goto out;
+
+ if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
+ goto out;
+
+ if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL)
+ goto out;
+
+ if (!ast || !lksb)
+ goto out;
+
+ if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr)
+ goto out;
+
+ /* parent/child locks not yet supported */
+ if (parent_lkid)
+ goto out;
+
+ if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid)
+ goto out;
+
+	/* these args will be copied to the lkb in validate_lock_args;
+	   this cannot be done now because, when converting locks, fields in
+	   an active lkb cannot be modified before locking the rsb */
+
+ args->flags = flags;
+ args->astaddr = ast;
+ args->astparam = (long) astarg;
+ args->bastaddr = bast;
+ args->mode = mode;
+ args->lksb = lksb;
+ args->range = range;
+ rv = 0;
+ out:
+ return rv;
+}
+
+static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
+{
+ if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK))
+ return -EINVAL;
+
+ args->flags = flags;
+ args->astparam = (long) astarg;
+ return 0;
+}
+
+/*
+ * Two stage 1 varieties: dlm_lock() and dlm_unlock()
+ */
+
+int dlm_lock(dlm_lockspace_t *lockspace,
+ int mode,
+ struct dlm_lksb *lksb,
+ uint32_t flags,
+ void *name,
+ unsigned int namelen,
+ uint32_t parent_lkid,
+ void (*ast) (void *astarg),
+ void *astarg,
+ void (*bast) (void *astarg, int mode),
+ struct dlm_range *range)
+{
+ struct dlm_ls *ls;
+ struct dlm_lkb *lkb;
+ struct dlm_args args;
+ int error, convert = flags & DLM_LKF_CONVERT;
+
+ ls = dlm_find_lockspace_local(lockspace);
+ if (!ls)
+ return -EINVAL;
+
+ lock_recovery(ls);
+
+ if (convert)
+ error = find_lkb(ls, lksb->sb_lkid, &lkb);
+ else
+ error = create_lkb(ls, &lkb);
+
+ if (error)
+ goto out;
+
+ error = set_lock_args(mode, lksb, flags, namelen, parent_lkid, ast,
+ astarg, bast, range, &args);
+ if (error)
+ goto out_put;
+
+ if (convert)
+ error = convert_lock(ls, lkb, &args);
+ else
+ error = request_lock(ls, lkb, name, namelen, &args);
+
+ if (error == -EINPROGRESS)
+ error = 0;
+ out_put:
+ if (convert || error)
+ put_lkb(lkb);
+ if (error == -EAGAIN)
+ error = 0;
+ out:
+ unlock_recovery(ls);
+ dlm_put_lockspace(ls);
+ return error;
+}
+
+int dlm_unlock(dlm_lockspace_t *lockspace,
+ uint32_t lkid,
+ uint32_t flags,
+ struct dlm_lksb *lksb,
+ void *astarg)
+{
+ struct dlm_ls *ls;
+ struct dlm_lkb *lkb;
+ struct dlm_args args;
+ int error;
+
+ ls = dlm_find_lockspace_local(lockspace);
+ if (!ls)
+ return -EINVAL;
+
+ lock_recovery(ls);
+
+ error = find_lkb(ls, lkid, &lkb);
+ if (error)
+ goto out;
+
+ error = set_unlock_args(flags, astarg, &args);
+ if (error)
+ goto out_put;
+
+ if (flags & DLM_LKF_CANCEL)
+ error = cancel_lock(ls, lkb, &args);
+ else
+ error = unlock_lock(ls, lkb, &args);
+
+ if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
+ error = 0;
+ out_put:
+ put_lkb(lkb);
+ out:
+ unlock_recovery(ls);
+ dlm_put_lockspace(ls);
+ return error;
+}
+
+
+/* set_master(r, lkb) -- set the master nodeid of a resource
+
+ The purpose of this function is to set the nodeid field in the given
+ lkb using the nodeid field in the given rsb. If the rsb's nodeid is
+ known, it can just be copied to the lkb and the function will return
+ 0. If the rsb's nodeid is _not_ known, it needs to be looked up
+ before it can be copied to the lkb.
+
+ When the rsb nodeid is being looked up remotely, the initial lkb
+ causing the lookup is kept on the ls_waiters list waiting for the
+ lookup reply. Other lkb's waiting for the same rsb lookup are kept
+ on the rsb's res_lookup list until the master is verified.
+
+   After a remote lookup or when a tossed rsb is retrieved that specifies
+ a remote master, that master value is uncertain -- it may have changed
+ by the time we send it a request. While it's uncertain, only one lkb
+ is allowed to go ahead and use the master value; that lkb is specified
+ by res_trial_lkid. Once the trial lkb is queued on the master node
+ we know the rsb master is correct and any other lkbs on res_lookup
+ can get the rsb nodeid and go ahead with their request.
+
+ Return values:
+ 0: nodeid is set in rsb/lkb and the caller should go ahead and use it
+ 1: the rsb master is not available and the lkb has been placed on
+ a wait queue
+ -EXXX: there was some error in processing
+*/
+
+static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ struct dlm_ls *ls = r->res_ls;
+ int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
+
+ if (test_and_clear_bit(RESFL_MASTER_UNCERTAIN, &r->res_flags)) {
+ set_bit(RESFL_MASTER_WAIT, &r->res_flags);
+ r->res_trial_lkid = lkb->lkb_id;
+ lkb->lkb_nodeid = r->res_nodeid;
+ return 0;
+ }
+
+ if (r->res_nodeid == 0) {
+ lkb->lkb_nodeid = 0;
+ return 0;
+ }
+
+ if (r->res_trial_lkid == lkb->lkb_id) {
+ DLM_ASSERT(lkb->lkb_id, dlm_print_lkb(lkb););
+ lkb->lkb_nodeid = r->res_nodeid;
+ return 0;
+ }
+
+ if (test_bit(RESFL_MASTER_WAIT, &r->res_flags)) {
+ list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup);
+ return 1;
+ }
+
+ if (r->res_nodeid > 0) {
+ lkb->lkb_nodeid = r->res_nodeid;
+ return 0;
+ }
+
+ /* This is the first lkb requested on this rsb since the rsb
+ was created. We need to figure out who the rsb master is. */
+
+ DLM_ASSERT(r->res_nodeid == -1, );
+
+ dir_nodeid = dlm_dir_nodeid(r);
+
+ if (dir_nodeid != our_nodeid) {
+ set_bit(RESFL_MASTER_WAIT, &r->res_flags);
+ send_lookup(r, lkb);
+ return 1;
+ }
+
+ for (;;) {
+		/* It's possible for dlm_scand to remove an old rsb for
+		   this same resource from the toss list, for us to then
+		   create a new one, look up the master locally, and find
+		   it already exists just before dlm_scand does the
+		   dir_remove() on the previous rsb. */
+
+ error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
+ r->res_length, &ret_nodeid);
+ if (!error)
+ break;
+ log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
+ schedule();
+ }
+
+ if (ret_nodeid == our_nodeid) {
+ r->res_nodeid = 0;
+ lkb->lkb_nodeid = 0;
+ return 0;
+ }
+
+ set_bit(RESFL_MASTER_WAIT, &r->res_flags);
+ r->res_trial_lkid = lkb->lkb_id;
+ r->res_nodeid = ret_nodeid;
+ lkb->lkb_nodeid = ret_nodeid;
+ return 0;
+}
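+
+/* An illustrative sequence for the remote-directory case above: set_master()
+   sets RESFL_MASTER_WAIT, calls send_lookup() and returns 1, so the lkb
+   waits; the directory node's lookup reply (see the send/receive table
+   further down) supplies the master nodeid, the request is then sent to
+   that master, and confirm_master() below settles the uncertain value when
+   the request reply comes back. */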
+
+/* confirm_master -- confirm (or deny) an rsb's master nodeid
+
+ This is called when we get a request reply from a remote node
+ who we believe is the master. The return value (error) we got
+ back indicates whether it's really the master or not. If it
+ wasn't we need to start over and do another master lookup. If
+ it was and our lock was queued we know the master won't change.
+ If it was and our lock wasn't queued, we need to do another
+ trial with the next lkb.
+*/
+
+static void confirm_master(struct dlm_rsb *r, int error)
+{
+ struct dlm_lkb *lkb, *safe;
+
+ if (!test_bit(RESFL_MASTER_WAIT, &r->res_flags))
+ return;
+
+ switch (error) {
+ case 0:
+ case -EINPROGRESS:
+ /* the remote master queued our request, or
+ the remote dir node told us we're the master */
+
+ clear_bit(RESFL_MASTER_WAIT, &r->res_flags);
+ r->res_trial_lkid = 0;
+
+ list_for_each_entry_safe(lkb, safe, &r->res_lookup,
+ lkb_rsb_lookup) {
+ list_del(&lkb->lkb_rsb_lookup);
+ _request_lock(r, lkb);
+ schedule();
+ }
+ break;
+
+ case -EAGAIN:
+ /* the remote master didn't queue our NOQUEUE request;
+ do another trial with the next waiting lkb */
+
+ if (!list_empty(&r->res_lookup)) {
+ lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
+ lkb_rsb_lookup);
+ list_del(&lkb->lkb_rsb_lookup);
+ r->res_trial_lkid = lkb->lkb_id;
+ _request_lock(r, lkb);
+ break;
+ }
+ /* fall through so the rsb looks new */
+
+ case -ENOENT:
+ case -ENOTBLK:
+ /* the remote master wasn't really the master, i.e. our
+ trial failed; so we start over with another lookup */
+
+ r->res_nodeid = -1;
+ r->res_trial_lkid = 0;
+ clear_bit(RESFL_MASTER_WAIT, &r->res_flags);
+ break;
+
+ default:
+ log_error(r->res_ls, "confirm_master unknown error %d", error);
+ }
+}
+
+int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_args *args)
+{
+ int rv = -EINVAL;
+
+ if (args->flags & DLM_LKF_CONVERT) {
+ if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+ goto out;
+
+ if (args->flags & DLM_LKF_QUECVT &&
+ !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
+ goto out;
+
+ rv = -EBUSY;
+ if (lkb->lkb_status != DLM_LKSTS_GRANTED)
+ goto out;
+ }
+
+ lkb->lkb_exflags = args->flags;
+ lkb->lkb_sbflags = 0;
+ lkb->lkb_astaddr = args->astaddr;
+ lkb->lkb_astparam = args->astparam;
+ lkb->lkb_bastaddr = args->bastaddr;
+ lkb->lkb_rqmode = args->mode;
+ lkb->lkb_lksb = args->lksb;
+ lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
+ lkb->lkb_ownpid = (int) current->pid;
+
+ rv = 0;
+ if (!args->range)
+ goto out;
+
+ if (!lkb->lkb_range) {
+ rv = -ENOMEM;
+ lkb->lkb_range = allocate_range(ls);
+ if (!lkb->lkb_range)
+ goto out;
+ /* This is needed for conversions that contain ranges
+ where the original lock didn't but it's harmless for
+ new locks too. */
+ lkb->lkb_range[GR_RANGE_START] = 0LL;
+ lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL;
+ }
+
+ lkb->lkb_range[RQ_RANGE_START] = args->range->ra_start;
+ lkb->lkb_range[RQ_RANGE_END] = args->range->ra_end;
+ lkb->lkb_flags |= DLM_IFL_RANGE;
+ rv = 0;
+ out:
+ return rv;
+}
+
+int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
+{
+ int rv = -EINVAL;
+
+ if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+ goto out;
+
+ if (args->flags & DLM_LKF_CANCEL &&
+ lkb->lkb_status == DLM_LKSTS_GRANTED)
+ goto out;
+
+ if (!(args->flags & DLM_LKF_CANCEL) &&
+ lkb->lkb_status != DLM_LKSTS_GRANTED)
+ goto out;
+
+ rv = -EBUSY;
+ if (lkb->lkb_wait_type)
+ goto out;
+
+ lkb->lkb_exflags = args->flags;
+ lkb->lkb_sbflags = 0;
+ lkb->lkb_astparam = args->astparam;
+ rv = 0;
+ out:
+ return rv;
+}
+
+/*
+ * Four stage 2 varieties:
+ * request_lock(), convert_lock(), unlock_lock(), cancel_lock()
+ */
+
+static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
+ int len, struct dlm_args *args)
+{
+ struct dlm_rsb *r;
+ int error;
+
+ error = validate_lock_args(ls, lkb, args);
+ if (error)
+ goto out;
+
+ error = find_rsb(ls, name, len, R_CREATE, &r);
+ if (error)
+ goto out;
+
+ lock_rsb(r);
+
+ attach_lkb(r, lkb);
+ error = _request_lock(r, lkb);
+
+ unlock_rsb(r);
+ put_rsb(r);
+
+ lkb->lkb_lksb->sb_lkid = lkb->lkb_id;
+ out:
+ return error;
+}
+
+static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_args *args)
+{
+ struct dlm_rsb *r;
+ int error;
+
+ r = lkb->lkb_resource;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ error = validate_lock_args(ls, lkb, args);
+ if (error)
+ goto out;
+
+ error = _convert_lock(r, lkb);
+ out:
+ unlock_rsb(r);
+ put_rsb(r);
+ return error;
+}
+
+static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_args *args)
+{
+ struct dlm_rsb *r;
+ int error;
+
+ r = lkb->lkb_resource;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ error = validate_unlock_args(lkb, args);
+ if (error)
+ goto out;
+
+ error = _unlock_lock(r, lkb);
+ out:
+ unlock_rsb(r);
+ put_rsb(r);
+ return error;
+}
+
+static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_args *args)
+{
+ struct dlm_rsb *r;
+ int error;
+
+ r = lkb->lkb_resource;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ error = validate_unlock_args(lkb, args);
+ if (error)
+ goto out;
+
+ error = _cancel_lock(r, lkb);
+ out:
+ unlock_rsb(r);
+ put_rsb(r);
+ return error;
+}
+
+/*
+ * Four stage 3 varieties:
+ * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
+ */
+
+/* add a new lkb to a possibly new rsb, called by requesting process */
+
+static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ int error;
+
+ /* set_master: sets lkb nodeid from r */
+
+ error = set_master(r, lkb);
+ if (error < 0)
+ goto out;
+ if (error) {
+ error = 0;
+ goto out;
+ }
+
+ if (is_remote(r))
+ /* receive_request() calls do_request() on remote node */
+ error = send_request(r, lkb);
+ else
+ error = do_request(r, lkb);
+ out:
+ return error;
+}
+
+/* change some property of an existing lkb, e.g. mode, range */
+
+static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ int error;
+
+ if (is_remote(r))
+ /* receive_convert() calls do_convert() on remote node */
+ error = send_convert(r, lkb);
+ else
+ error = do_convert(r, lkb);
+
+ return error;
+}
+
+/* remove an existing lkb from the granted queue */
+
+static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ int error;
+
+ if (is_remote(r))
+		/* receive_unlock() calls do_unlock() on remote node */
+ error = send_unlock(r, lkb);
+ else
+ error = do_unlock(r, lkb);
+
+ return error;
+}
+
+/* remove an existing lkb from the convert or wait queue */
+
+static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ int error;
+
+ if (is_remote(r))
+ /* receive_cancel() calls do_cancel() on remote node */
+ error = send_cancel(r, lkb);
+ else
+ error = do_cancel(r, lkb);
+
+ return error;
+}
+
+/* lkb is master or local copy */
+
+static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ int b;
+
+ /* b=1 lvb returned to caller
+ b=0 lvb written to rsb or invalidated
+ b=-1 do nothing */
+
+ b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
+
+ if (b == 1) {
+ if (!lkb->lkb_lvbptr)
+ return;
+
+ if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+ return;
+
+ if (!r->res_lvbptr)
+ return;
+
+ memcpy(lkb->lkb_lvbptr, r->res_lvbptr, DLM_LVB_LEN);
+ lkb->lkb_lvbseq = r->res_lvbseq;
+
+ } else if (b == 0) {
+ if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
+ set_bit(RESFL_VALNOTVALID, &r->res_flags);
+ return;
+ }
+
+ if (!lkb->lkb_lvbptr)
+ return;
+
+ if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+ return;
+
+ if (!r->res_lvbptr)
+ r->res_lvbptr = allocate_lvb(r->res_ls);
+
+ if (!r->res_lvbptr)
+ return;
+
+ memcpy(r->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
+ r->res_lvbseq++;
+ lkb->lkb_lvbseq = r->res_lvbseq;
+ clear_bit(RESFL_VALNOTVALID, &r->res_flags);
+ }
+
+ if (test_bit(RESFL_VALNOTVALID, &r->res_flags))
+ lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID;
+}
+
+static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ if (lkb->lkb_grmode < DLM_LOCK_PW)
+ return;
+
+ if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
+ set_bit(RESFL_VALNOTVALID, &r->res_flags);
+ return;
+ }
+
+ if (!lkb->lkb_lvbptr)
+ return;
+
+ if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+ return;
+
+ if (!r->res_lvbptr)
+ r->res_lvbptr = allocate_lvb(r->res_ls);
+
+ memcpy(r->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
+ r->res_lvbseq++;
+ clear_bit(RESFL_VALNOTVALID, &r->res_flags);
+}
+
+/* lkb is process copy (pc) */
+
+static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
+ struct dlm_message *ms)
+{
+ int b;
+
+ if (!lkb->lkb_lvbptr)
+ return;
+
+ if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+ return;
+
+ b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
+ if (b == 1) {
+ memcpy(lkb->lkb_lvbptr, ms->m_lvb, DLM_LVB_LEN);
+ lkb->lkb_lvbseq = ms->m_lvbseq;
+ }
+}
+
+/* Manipulate lkb's on rsb's convert/granted/waiting queues
+ remove_lock -- used for unlock, removes lkb from granted
+ revert_lock -- used for cancel, moves lkb from convert to granted
+ grant_lock -- used for request and convert, adds lkb to granted or
+ moves lkb from convert or waiting to granted
+
+ Each of these is used for master or local copy lkb's. There is
+ also a _pc() variation used to make the corresponding change on
+ a process copy (pc) lkb. */
+
+static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ del_lkb(r, lkb);
+ lkb->lkb_grmode = DLM_LOCK_IV;
+ /* this unhold undoes the original ref from create_lkb()
+ so this leads to the lkb being freed */
+ unhold_lkb(lkb);
+}
+
+static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ set_lvb_unlock(r, lkb);
+ _remove_lock(r, lkb);
+}
+
+static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ _remove_lock(r, lkb);
+}
+
+static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ lkb->lkb_rqmode = DLM_LOCK_IV;
+
+ switch (lkb->lkb_status) {
+ case DLM_LKSTS_CONVERT:
+ move_lkb(r, lkb, DLM_LKSTS_GRANTED);
+ break;
+ case DLM_LKSTS_WAITING:
+ del_lkb(r, lkb);
+ lkb->lkb_grmode = DLM_LOCK_IV;
+ /* this unhold undoes the original ref from create_lkb()
+ so this leads to the lkb being freed */
+ unhold_lkb(lkb);
+ break;
+ default:
+ log_print("invalid status for revert %d", lkb->lkb_status);
+ }
+}
+
+static void revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ revert_lock(r, lkb);
+}
+
+static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ if (lkb->lkb_grmode != lkb->lkb_rqmode) {
+ lkb->lkb_grmode = lkb->lkb_rqmode;
+ if (lkb->lkb_status)
+ move_lkb(r, lkb, DLM_LKSTS_GRANTED);
+ else
+ add_lkb(r, lkb, DLM_LKSTS_GRANTED);
+ }
+
+ lkb->lkb_rqmode = DLM_LOCK_IV;
+
+ if (lkb->lkb_range) {
+ lkb->lkb_range[GR_RANGE_START] = lkb->lkb_range[RQ_RANGE_START];
+ lkb->lkb_range[GR_RANGE_END] = lkb->lkb_range[RQ_RANGE_END];
+ }
+}
+
+static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ set_lvb_lock(r, lkb);
+ _grant_lock(r, lkb);
+ lkb->lkb_highbast = 0;
+}
+
+static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
+ struct dlm_message *ms)
+{
+ set_lvb_lock_pc(r, lkb, ms);
+ _grant_lock(r, lkb);
+}
+
+/* called by grant_pending_locks() which means an async grant message must
+ be sent to the requesting node in addition to granting the lock if the
+ lkb belongs to a remote node. */
+
+static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ grant_lock(r, lkb);
+ if (is_master_copy(lkb))
+ send_grant(r, lkb);
+ else
+ queue_cast(r, lkb, 0);
+}
+
+static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
+{
+ struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
+ lkb_statequeue);
+ if (lkb->lkb_id == first->lkb_id)
+ return TRUE;
+
+ return FALSE;
+}
+
+/*
+ * Return 1 if the locks' ranges overlap
+ * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff
+ */
+
+static inline int ranges_overlap(struct dlm_lkb *lkb1, struct dlm_lkb *lkb2)
+{
+ if (!lkb1->lkb_range || !lkb2->lkb_range)
+ return TRUE;
+
+ if (lkb1->lkb_range[RQ_RANGE_END] < lkb2->lkb_range[GR_RANGE_START] ||
+ lkb1->lkb_range[RQ_RANGE_START] > lkb2->lkb_range[GR_RANGE_END])
+ return FALSE;
+
+ return TRUE;
+}
+
+/*
+ * Check if the given lkb conflicts with another lkb on the queue.
+ */
+
+static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
+{
+ struct dlm_lkb *this;
+
+ list_for_each_entry(this, head, lkb_statequeue) {
+ if (this == lkb)
+ continue;
+ if (ranges_overlap(lkb, this) && !modes_compat(this, lkb))
+ return TRUE;
+ }
+ return FALSE;
+}
+
+/*
+ * "A conversion deadlock arises with a pair of lock requests in the converting
+ * queue for one resource. The granted mode of each lock blocks the requested
+ * mode of the other lock."
+ *
+ * Part 2: if the granted mode of lkb is preventing the first lkb in the
+ * convert queue from being granted, then demote lkb (set grmode to NL).
+ * This second form requires that we check for conv-deadlk even when
+ * now == 0 in _can_be_granted().
+ *
+ * Example:
+ * Granted Queue: empty
+ * Convert Queue: NL->EX (first lock)
+ * PR->EX (second lock)
+ *
+ * The first lock can't be granted because of the granted mode of the second
+ * lock and the second lock can't be granted because it's not first in the
+ * list. We demote the granted mode of the second lock (the lkb passed to this
+ * function).
+ *
+ * After the resolution, the "grant pending" function needs to go back and try
+ * to grant locks on the convert queue again since the first lock can now be
+ * granted.
+ */
+
+static int conversion_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
+{
+ struct dlm_lkb *this, *first = NULL, *self = NULL;
+
+ list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
+ if (!first)
+ first = this;
+ if (this == lkb) {
+ self = lkb;
+ continue;
+ }
+
+ if (!ranges_overlap(lkb, this))
+ continue;
+
+ if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
+ return TRUE;
+ }
+
+ /* if lkb is on the convert queue and is preventing the first
+ from being granted, then there's deadlock and we demote lkb.
+ multiple converting locks may need to do this before the first
+ converting lock can be granted. */
+
+ if (self && self != first) {
+ if (!modes_compat(lkb, first) &&
+ !queue_conflict(&rsb->res_grantqueue, first))
+ return TRUE;
+ }
+
+ return FALSE;
+}
+
+/*
+ * Return 1 if the lock can be granted, 0 otherwise.
+ * Also detect and resolve conversion deadlocks.
+ *
+ * lkb is the lock to be granted
+ *
+ * now is 1 if the function is being called in the context of the
+ * immediate request, it is 0 if called later, after the lock has been
+ * queued.
+ *
+ * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
+ */
+
+static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
+{
+ int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
+
+ /*
+ * 6-10: Version 5.4 introduced an option to address the phenomenon of
+ * a new request for a NL mode lock being blocked.
+ *
+ * 6-11: If the optional EXPEDITE flag is used with the new NL mode
+ * request, then it would be granted. In essence, the use of this flag
+	 * tells the Lock Manager to expedite this request by not considering
+ * what may be in the CONVERTING or WAITING queues... As of this
+ * writing, the EXPEDITE flag can be used only with new requests for NL
+ * mode locks. This flag is not valid for conversion requests.
+ *
+ * A shortcut. Earlier checks return an error if EXPEDITE is used in a
+ * conversion or used with a non-NL requested mode. We also know an
+ * EXPEDITE request is always granted immediately, so now must always
+ * be 1. The full condition to grant an expedite request: (now &&
+ * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
+ * therefore be shortened to just checking the flag.
+ */
+
+ if (lkb->lkb_exflags & DLM_LKF_EXPEDITE)
+ return TRUE;
+
+ /*
+ * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
+ * added to the remaining conditions.
+ */
+
+ if (queue_conflict(&r->res_grantqueue, lkb))
+ goto out;
+
+ /*
+ * 6-3: By default, a conversion request is immediately granted if the
+ * requested mode is compatible with the modes of all other granted
+ * locks
+ */
+
+ if (queue_conflict(&r->res_convertqueue, lkb))
+ goto out;
+
+ /*
+ * 6-5: But the default algorithm for deciding whether to grant or
+ * queue conversion requests does not by itself guarantee that such
+ * requests are serviced on a "first come first serve" basis. This, in
+	 * turn, can lead to a phenomenon known as "indefinite postponement".
+ *
+ * 6-7: This issue is dealt with by using the optional QUECVT flag with
+ * the system service employed to request a lock conversion. This flag
+ * forces certain conversion requests to be queued, even if they are
+ * compatible with the granted modes of other locks on the same
+ * resource. Thus, the use of this flag results in conversion requests
+	 * being ordered on a "first come first serve" basis.
+ *
+ * DCT: This condition is all about new conversions being able to occur
+ * "in place" while the lock remains on the granted queue (assuming
+ * nothing else conflicts.) IOW if QUECVT isn't set, a conversion
+ * doesn't _have_ to go onto the convert queue where it's processed in
+ * order. The "now" variable is necessary to distinguish converts
+ * being received and processed for the first time now, because once a
+ * convert is moved to the conversion queue the condition below applies
+ * requiring fifo granting.
+ */
+
+ if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
+ return TRUE;
+
+ /*
+ * When using range locks the NOORDER flag is set to avoid the standard
+ * vms rules on grant order.
+ */
+
+ if (lkb->lkb_exflags & DLM_LKF_NOORDER)
+ return TRUE;
+
+ /*
+ * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
+ * granted until all other conversion requests ahead of it are granted
+ * and/or canceled.
+ */
+
+ if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
+ return TRUE;
+
+ /*
+ * 6-4: By default, a new request is immediately granted only if all
+ * three of the following conditions are satisfied when the request is
+ * issued:
+ * - The queue of ungranted conversion requests for the resource is
+ * empty.
+ * - The queue of ungranted new requests for the resource is empty.
+ * - The mode of the new request is compatible with the most
+ * restrictive mode of all granted locks on the resource.
+ */
+
+ if (now && !conv && list_empty(&r->res_convertqueue) &&
+ list_empty(&r->res_waitqueue))
+ return TRUE;
+
+ /*
+ * 6-4: Once a lock request is in the queue of ungranted new requests,
+ * it cannot be granted until the queue of ungranted conversion
+ * requests is empty, all ungranted new requests ahead of it are
+ * granted and/or canceled, and it is compatible with the granted mode
+ * of the most restrictive lock granted on the resource.
+ */
+
+ if (!now && !conv && list_empty(&r->res_convertqueue) &&
+ first_in_list(lkb, &r->res_waitqueue))
+ return TRUE;
+
+ out:
+ /*
+ * The following, enabled by CONVDEADLK, departs from VMS.
+ */
+
+ if (conv && (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) &&
+ conversion_deadlock_detect(r, lkb)) {
+ lkb->lkb_grmode = DLM_LOCK_NL;
+ lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
+ }
+
+ return FALSE;
+}
+
+/*
+ * The ALTPR and ALTCW flags aren't traditional lock manager flags, but are a
+ * simple way to provide a big optimization to applications that can use them.
+ */
+
+static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
+{
+ uint32_t flags = lkb->lkb_exflags;
+ int rv;
+ int8_t alt = 0, rqmode = lkb->lkb_rqmode;
+
+ rv = _can_be_granted(r, lkb, now);
+ if (rv)
+ goto out;
+
+ if (lkb->lkb_sbflags & DLM_SBF_DEMOTED)
+ goto out;
+
+ if (rqmode != DLM_LOCK_PR && flags & DLM_LKF_ALTPR)
+ alt = DLM_LOCK_PR;
+ else if (rqmode != DLM_LOCK_CW && flags & DLM_LKF_ALTCW)
+ alt = DLM_LOCK_CW;
+
+ if (alt) {
+ lkb->lkb_rqmode = alt;
+ rv = _can_be_granted(r, lkb, now);
+ if (rv)
+ lkb->lkb_sbflags |= DLM_SBF_ALTMODE;
+ else
+ lkb->lkb_rqmode = rqmode;
+ }
+ out:
+ return rv;
+}
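+
+/* Example of the alternate-mode path above (illustrative): a request made
+   with mode DLM_LOCK_PW and the DLM_LKF_ALTPR flag that cannot be granted
+   in PW may be retried and granted in PR instead, with DLM_SBF_ALTMODE set
+   in the lksb flags so the caller can see which mode it actually holds. */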
+
+static int grant_pending_convert(struct dlm_rsb *r, int high)
+{
+ struct dlm_lkb *lkb, *s;
+ int hi, demoted, quit, grant_restart, demote_restart;
+
+ quit = 0;
+ restart:
+ grant_restart = 0;
+ demote_restart = 0;
+ hi = DLM_LOCK_IV;
+
+ list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
+ demoted = is_demoted(lkb);
+ if (can_be_granted(r, lkb, FALSE)) {
+ grant_lock_pending(r, lkb);
+ grant_restart = 1;
+ } else {
+ hi = MAX(lkb->lkb_rqmode, hi);
+ if (!demoted && is_demoted(lkb))
+ demote_restart = 1;
+ }
+ }
+
+ if (grant_restart)
+ goto restart;
+ if (demote_restart && !quit) {
+ quit = 1;
+ goto restart;
+ }
+
+ return MAX(high, hi);
+}
+
+static int grant_pending_wait(struct dlm_rsb *r, int high)
+{
+ struct dlm_lkb *lkb, *s;
+
+ list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
+ if (can_be_granted(r, lkb, FALSE))
+ grant_lock_pending(r, lkb);
+ else
+ high = MAX(lkb->lkb_rqmode, high);
+ }
+
+ return high;
+}
+
+static int grant_pending_locks(struct dlm_rsb *r)
+{
+ struct dlm_lkb *lkb, *s;
+ int high = DLM_LOCK_IV;
+
+ DLM_ASSERT(is_master(r), dlm_print_rsb(r););
+
+ high = grant_pending_convert(r, high);
+ high = grant_pending_wait(r, high);
+
+ if (high == DLM_LOCK_IV)
+ return 0;
+
+ /*
+ * If there are locks left on the wait/convert queue then send blocking
+ * ASTs to granted locks based on the largest requested mode (high)
+ * found above. This can generate spurious blocking ASTs for range
+ * locks. FIXME: highbast < high comparison not valid for PR/CW.
+ */
+
+ list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
+ if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) &&
+ !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
+ queue_bast(r, lkb, high);
+ lkb->lkb_highbast = high;
+ }
+ }
+
+ return 0;
+}
+
+static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
+ struct dlm_lkb *lkb)
+{
+ struct dlm_lkb *gr;
+
+ list_for_each_entry(gr, head, lkb_statequeue) {
+ if (gr->lkb_bastaddr &&
+ gr->lkb_highbast < lkb->lkb_rqmode &&
+ ranges_overlap(lkb, gr) && !modes_compat(gr, lkb)) {
+ queue_bast(r, gr, lkb->lkb_rqmode);
+ gr->lkb_highbast = lkb->lkb_rqmode;
+ }
+ }
+}
+
+static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ send_bast_queue(r, &r->res_grantqueue, lkb);
+}
+
+static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ send_bast_queue(r, &r->res_grantqueue, lkb);
+ send_bast_queue(r, &r->res_convertqueue, lkb);
+}
+
+/*
+ * Four stage 4 varieties:
+ * do_request(), do_convert(), do_unlock(), do_cancel()
+ * These are called on the master node for the given lock and
+ * from the central locking logic.
+ */
+
+static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ int error = 0;
+
+ if (can_be_granted(r, lkb, TRUE)) {
+ grant_lock(r, lkb);
+ queue_cast(r, lkb, 0);
+ goto out;
+ }
+
+ if (can_be_queued(lkb)) {
+ error = -EINPROGRESS;
+ add_lkb(r, lkb, DLM_LKSTS_WAITING);
+ send_blocking_asts(r, lkb);
+ goto out;
+ }
+
+ error = -EAGAIN;
+ if (force_blocking_asts(lkb))
+ send_blocking_asts_all(r, lkb);
+ queue_cast(r, lkb, -EAGAIN);
+
+ out:
+ return error;
+}
+
+static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ int error = 0;
+
+ /* changing an existing lock may allow others to be granted */
+
+ if (can_be_granted(r, lkb, TRUE)) {
+ grant_lock(r, lkb);
+ queue_cast(r, lkb, 0);
+ grant_pending_locks(r);
+ goto out;
+ }
+
+ if (can_be_queued(lkb)) {
+ if (is_demoted(lkb))
+ grant_pending_locks(r);
+ error = -EINPROGRESS;
+ del_lkb(r, lkb);
+ add_lkb(r, lkb, DLM_LKSTS_CONVERT);
+ send_blocking_asts(r, lkb);
+ goto out;
+ }
+
+ error = -EAGAIN;
+ if (force_blocking_asts(lkb))
+ send_blocking_asts_all(r, lkb);
+ queue_cast(r, lkb, -EAGAIN);
+
+ out:
+ return error;
+}
+
+static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ remove_lock(r, lkb);
+ queue_cast(r, lkb, -DLM_EUNLOCK);
+ grant_pending_locks(r);
+ return -DLM_EUNLOCK;
+}
+
+static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ revert_lock(r, lkb);
+ queue_cast(r, lkb, -DLM_ECANCEL);
+ grant_pending_locks(r);
+ return -DLM_ECANCEL;
+}
+
+
+/*
+ * send/receive routines for remote operations and replies
+ *
+ * send_args
+ * send_common
+ * send_request receive_request
+ * send_convert receive_convert
+ * send_unlock receive_unlock
+ * send_cancel receive_cancel
+ * send_grant receive_grant
+ * send_bast receive_bast
+ * send_lookup receive_lookup
+ * send_remove receive_remove
+ *
+ * send_common_reply
+ * receive_request_reply send_request_reply
+ * receive_convert_reply send_convert_reply
+ * receive_unlock_reply send_unlock_reply
+ * receive_cancel_reply send_cancel_reply
+ * receive_lookup_reply send_lookup_reply
+ */
+
+static int create_message(struct dlm_rsb *r, int to_nodeid, int mstype,
+ struct dlm_message **ms_ret, struct dlm_mhandle **mh_ret)
+{
+ struct dlm_message *ms;
+ struct dlm_mhandle *mh;
+ char *mb;
+ int mb_len = sizeof(struct dlm_message);
+
+ if (mstype == DLM_MSG_REQUEST ||
+ mstype == DLM_MSG_LOOKUP ||
+ mstype == DLM_MSG_REMOVE)
+ mb_len += r->res_length;
+
+ /* get_buffer gives us a message handle (mh) that we need to
+ pass into lowcomms_commit and a message buffer (mb) that we
+ write our data into */
+
+ mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
+ if (!mh)
+ return -ENOBUFS;
+
+ memset(mb, 0, mb_len);
+
+ ms = (struct dlm_message *) mb;
+
+ ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+ ms->m_header.h_lockspace = r->res_ls->ls_global_id;
+ ms->m_header.h_nodeid = dlm_our_nodeid();
+ ms->m_header.h_length = mb_len;
+ ms->m_header.h_cmd = DLM_MSG;
+
+ ms->m_type = mstype;
+
+ *mh_ret = mh;
+ *ms_ret = ms;
+ return 0;
+}
+
+static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
+{
+ dlm_message_out(ms);
+ dlm_lowcomms_commit_buffer(mh);
+ return 0;
+}
+
+static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
+ struct dlm_message *ms)
+{
+ ms->m_nodeid = lkb->lkb_nodeid;
+ ms->m_pid = lkb->lkb_ownpid;
+ ms->m_lkid = lkb->lkb_id;
+ ms->m_remid = lkb->lkb_remid;
+ ms->m_exflags = lkb->lkb_exflags;
+ ms->m_sbflags = lkb->lkb_sbflags;
+ ms->m_flags = lkb->lkb_flags;
+ ms->m_lvbseq = lkb->lkb_lvbseq;
+ ms->m_status = lkb->lkb_status;
+ ms->m_grmode = lkb->lkb_grmode;
+ ms->m_rqmode = lkb->lkb_rqmode;
+
+ /* m_result and m_bastmode are set from function args,
+ not from lkb fields */
+
+ if (lkb->lkb_bastaddr)
+ ms->m_asts |= AST_BAST;
+ if (lkb->lkb_astaddr)
+ ms->m_asts |= AST_COMP;
+
+ if (lkb->lkb_range) {
+ ms->m_range[0] = lkb->lkb_range[RQ_RANGE_START];
+ ms->m_range[1] = lkb->lkb_range[RQ_RANGE_END];
+ }
+
+ if (lkb->lkb_lvbptr)
+ memcpy(ms->m_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN);
+
+ if (ms->m_type == DLM_MSG_REQUEST || ms->m_type == DLM_MSG_LOOKUP)
+ memcpy(ms->m_name, r->res_name, r->res_length);
+}
+
+static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
+{
+ struct dlm_message *ms;
+ struct dlm_mhandle *mh;
+ int to_nodeid, error;
+
+ add_to_waiters(lkb, mstype);
+
+ to_nodeid = r->res_nodeid;
+
+ error = create_message(r, to_nodeid, mstype, &ms, &mh);
+ if (error)
+ goto fail;
+
+ send_args(r, lkb, ms);
+
+ error = send_message(mh, ms);
+ if (error)
+ goto fail;
+ return 0;
+
+ fail:
+ remove_from_waiters(lkb);
+ return error;
+}
+
+static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ return send_common(r, lkb, DLM_MSG_REQUEST);
+}
+
+static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ return send_common(r, lkb, DLM_MSG_CONVERT);
+}
+
+/* FIXME: if this lkb is the only lock we hold on the rsb, then set
+ MASTER_UNCERTAIN to force the next request on the rsb to confirm
+ that the master is still correct. */
+
+static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ return send_common(r, lkb, DLM_MSG_UNLOCK);
+}
+
+static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ return send_common(r, lkb, DLM_MSG_CANCEL);
+}
+
+static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ struct dlm_message *ms;
+ struct dlm_mhandle *mh;
+ int to_nodeid, error;
+
+ to_nodeid = lkb->lkb_nodeid;
+
+ error = create_message(r, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
+ if (error)
+ goto out;
+
+ send_args(r, lkb, ms);
+
+ ms->m_result = 0;
+
+ error = send_message(mh, ms);
+ out:
+ return error;
+}
+
+static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
+{
+ struct dlm_message *ms;
+ struct dlm_mhandle *mh;
+ int to_nodeid, error;
+
+ to_nodeid = lkb->lkb_nodeid;
+
+ error = create_message(r, to_nodeid, DLM_MSG_BAST, &ms, &mh);
+ if (error)
+ goto out;
+
+ send_args(r, lkb, ms);
+
+ ms->m_bastmode = mode;
+
+ error = send_message(mh, ms);
+ out:
+ return error;
+}
+
+static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ struct dlm_message *ms;
+ struct dlm_mhandle *mh;
+ int to_nodeid, error;
+
+ add_to_waiters(lkb, DLM_MSG_LOOKUP);
+
+ to_nodeid = dlm_dir_nodeid(r);
+
+ error = create_message(r, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
+ if (error)
+ goto fail;
+
+ send_args(r, lkb, ms);
+
+ error = send_message(mh, ms);
+ if (error)
+ goto fail;
+ return 0;
+
+ fail:
+ remove_from_waiters(lkb);
+ return error;
+}
+
+static int send_remove(struct dlm_rsb *r)
+{
+ struct dlm_message *ms;
+ struct dlm_mhandle *mh;
+ int to_nodeid, error;
+
+ to_nodeid = dlm_dir_nodeid(r);
+
+ error = create_message(r, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
+ if (error)
+ goto out;
+
+ memcpy(ms->m_name, r->res_name, r->res_length);
+
+ error = send_message(mh, ms);
+ out:
+ return error;
+}
+
+static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
+ int mstype, int rv)
+{
+ struct dlm_message *ms;
+ struct dlm_mhandle *mh;
+ int to_nodeid, error;
+
+ to_nodeid = lkb->lkb_nodeid;
+
+ error = create_message(r, to_nodeid, mstype, &ms, &mh);
+ if (error)
+ goto out;
+
+ send_args(r, lkb, ms);
+
+ ms->m_result = rv;
+
+ error = send_message(mh, ms);
+ out:
+ return error;
+}
+
+static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+ return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv);
+}
+
+static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+ return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv);
+}
+
+static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+ return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv);
+}
+
+static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+ return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
+}
+
+static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
+ int ret_nodeid, int rv)
+{
+ struct dlm_rsb *r = &ls->ls_stub_rsb;
+ struct dlm_message *ms;
+ struct dlm_mhandle *mh;
+ int error, to_nodeid = ms_in->m_header.h_nodeid;
+
+ error = create_message(r, to_nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
+ if (error)
+ goto out;
+
+ ms->m_lkid = ms_in->m_lkid;
+ ms->m_result = rv;
+ ms->m_nodeid = ret_nodeid;
+
+ error = send_message(mh, ms);
+ out:
+ return error;
+}
+
+/* which args we save from a received message depends heavily on the type
+ of message, unlike the send side where we can safely send everything about
+ the lkb for any type of message */
+
+static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+ lkb->lkb_exflags = ms->m_exflags;
+ lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
+ (ms->m_flags & 0x0000FFFF);
+}
+
+static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+ lkb->lkb_sbflags = ms->m_sbflags;
+ lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
+ (ms->m_flags & 0x0000FFFF);
+}
+
+static int receive_namelen(struct dlm_message *ms)
+{
+ return (ms->m_header.h_length - sizeof(struct dlm_message));
+}
+
+static int receive_range(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_message *ms)
+{
+ if (lkb->lkb_flags & DLM_IFL_RANGE) {
+ if (!lkb->lkb_range)
+ lkb->lkb_range = allocate_range(ls);
+ if (!lkb->lkb_range)
+ return -ENOMEM;
+ lkb->lkb_range[RQ_RANGE_START] = ms->m_range[0];
+ lkb->lkb_range[RQ_RANGE_END] = ms->m_range[1];
+ }
+ return 0;
+}
+
+static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_message *ms)
+{
+ if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
+ if (!lkb->lkb_lvbptr)
+ lkb->lkb_lvbptr = allocate_lvb(ls);
+ if (!lkb->lkb_lvbptr)
+ return -ENOMEM;
+ memcpy(lkb->lkb_lvbptr, ms->m_lvb, DLM_LVB_LEN);
+ }
+ return 0;
+}
+
+static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_message *ms)
+{
+ lkb->lkb_nodeid = ms->m_header.h_nodeid;
+ lkb->lkb_ownpid = ms->m_pid;
+ lkb->lkb_remid = ms->m_lkid;
+ lkb->lkb_grmode = DLM_LOCK_IV;
+ lkb->lkb_rqmode = ms->m_rqmode;
+ lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
+ lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
+
+ DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
+
+ if (receive_range(ls, lkb, ms))
+ return -ENOMEM;
+
+ if (receive_lvb(ls, lkb, ms))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_message *ms)
+{
+ if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
+ log_error(ls, "convert_args nodeid %d %d lkid %x %x",
+ lkb->lkb_nodeid, ms->m_header.h_nodeid,
+ lkb->lkb_id, lkb->lkb_remid);
+ return -EINVAL;
+ }
+
+ if (!is_master_copy(lkb))
+ return -EINVAL;
+
+ if (lkb->lkb_status != DLM_LKSTS_GRANTED)
+ return -EBUSY;
+
+ if (receive_range(ls, lkb, ms))
+ return -ENOMEM;
+ if (lkb->lkb_range) {
+ lkb->lkb_range[GR_RANGE_START] = 0LL;
+ lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL;
+ }
+
+ if (receive_lvb(ls, lkb, ms))
+ return -ENOMEM;
+
+ lkb->lkb_rqmode = ms->m_rqmode;
+ lkb->lkb_lvbseq = ms->m_lvbseq;
+
+ return 0;
+}
+
+static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_message *ms)
+{
+ if (!is_master_copy(lkb))
+ return -EINVAL;
+ if (receive_lvb(ls, lkb, ms))
+ return -ENOMEM;
+ return 0;
+}
+
+/* We fill in the stub-lkb fields with the info that send_xxxx_reply()
+ uses to send a reply and that the remote end uses to process the reply. */
+
+static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb = &ls->ls_stub_lkb;
+ lkb->lkb_nodeid = ms->m_header.h_nodeid;
+ lkb->lkb_remid = ms->m_lkid;
+}
+
+static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error, namelen;
+
+ error = create_lkb(ls, &lkb);
+ if (error)
+ goto fail;
+
+ receive_flags(lkb, ms);
+ lkb->lkb_flags |= DLM_IFL_MSTCPY;
+ error = receive_request_args(ls, lkb, ms);
+ if (error) {
+ put_lkb(lkb);
+ goto fail;
+ }
+
+ namelen = receive_namelen(ms);
+
+ error = find_rsb(ls, ms->m_name, namelen, R_MASTER, &r);
+ if (error) {
+ put_lkb(lkb);
+ goto fail;
+ }
+
+ lock_rsb(r);
+
+ attach_lkb(r, lkb);
+ error = do_request(r, lkb);
+ send_request_reply(r, lkb, error);
+
+ unlock_rsb(r);
+ put_rsb(r);
+
+ if (error == -EINPROGRESS)
+ error = 0;
+ if (error)
+ put_lkb(lkb);
+ return;
+
+ fail:
+ setup_stub_lkb(ls, ms);
+ send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+}
+
+static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error;
+
+ error = find_lkb(ls, ms->m_remid, &lkb);
+ if (error)
+ goto fail;
+
+ r = lkb->lkb_resource;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ receive_flags(lkb, ms);
+ error = receive_convert_args(ls, lkb, ms);
+ if (error)
+ goto out;
+
+ error = do_convert(r, lkb);
+ out:
+ send_convert_reply(r, lkb, error);
+
+ unlock_rsb(r);
+ put_rsb(r);
+ put_lkb(lkb);
+ return;
+
+ fail:
+ setup_stub_lkb(ls, ms);
+ send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+}
+
+static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error;
+
+ error = find_lkb(ls, ms->m_remid, &lkb);
+ if (error)
+ goto fail;
+
+ r = lkb->lkb_resource;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ receive_flags(lkb, ms);
+ error = receive_unlock_args(ls, lkb, ms);
+ if (error)
+ goto out;
+
+ error = do_unlock(r, lkb);
+ out:
+ send_unlock_reply(r, lkb, error);
+
+ unlock_rsb(r);
+ put_rsb(r);
+ put_lkb(lkb);
+ return;
+
+ fail:
+ setup_stub_lkb(ls, ms);
+ send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+}
+
+static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error;
+
+ error = find_lkb(ls, ms->m_remid, &lkb);
+ if (error)
+ goto fail;
+
+ receive_flags(lkb, ms);
+
+ r = lkb->lkb_resource;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ error = do_cancel(r, lkb);
+ send_cancel_reply(r, lkb, error);
+
+ unlock_rsb(r);
+ put_rsb(r);
+ put_lkb(lkb);
+ return;
+
+ fail:
+ setup_stub_lkb(ls, ms);
+ send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+}
+
+static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error;
+
+ error = find_lkb(ls, ms->m_remid, &lkb);
+ if (error) {
+ log_error(ls, "receive_grant no lkb");
+ return;
+ }
+ DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+ r = lkb->lkb_resource;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ receive_flags_reply(lkb, ms);
+ grant_lock_pc(r, lkb, ms);
+ queue_cast(r, lkb, 0);
+
+ unlock_rsb(r);
+ put_rsb(r);
+ put_lkb(lkb);
+}
+
+static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error;
+
+ error = find_lkb(ls, ms->m_remid, &lkb);
+ if (error) {
+ log_error(ls, "receive_bast no lkb");
+ return;
+ }
+ DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+ r = lkb->lkb_resource;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ queue_bast(r, lkb, ms->m_bastmode);
+
+ unlock_rsb(r);
+ put_rsb(r);
+ put_lkb(lkb);
+}
+
+static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ int len, error, ret_nodeid, dir_nodeid, from_nodeid;
+
+ from_nodeid = ms->m_header.h_nodeid;
+
+ len = receive_namelen(ms);
+
+ dir_nodeid = dlm_dir_name2nodeid(ls, ms->m_name, len);
+ if (dir_nodeid != dlm_our_nodeid()) {
+ log_error(ls, "lookup dir_nodeid %d from %d",
+ dir_nodeid, from_nodeid);
+ error = -EINVAL;
+ ret_nodeid = -1;
+ goto out;
+ }
+
+ error = dlm_dir_lookup(ls, from_nodeid, ms->m_name, len, &ret_nodeid);
+ out:
+ send_lookup_reply(ls, ms, ret_nodeid, error);
+}
+
+static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ int len, dir_nodeid, from_nodeid;
+
+ from_nodeid = ms->m_header.h_nodeid;
+
+ len = receive_namelen(ms);
+
+ dir_nodeid = dlm_dir_name2nodeid(ls, ms->m_name, len);
+ if (dir_nodeid != dlm_our_nodeid()) {
+ log_error(ls, "remove dir entry dir_nodeid %d from %d",
+ dir_nodeid, from_nodeid);
+ return;
+ }
+
+ dlm_dir_remove_entry(ls, from_nodeid, ms->m_name, len);
+}
+
+static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error;
+
+ error = find_lkb(ls, ms->m_remid, &lkb);
+ if (error) {
+ log_error(ls, "receive_request_reply no lkb");
+ return;
+ }
+ DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+ error = remove_from_waiters(lkb);
+ if (error) {
+ log_error(ls, "receive_request_reply not on waiters");
+ goto out;
+ }
+
+ /* this is the value returned from do_request() on the master */
+ error = ms->m_result;
+
+ r = lkb->lkb_resource;
+ hold_rsb(r);
+ lock_rsb(r);
+
+ switch (error) {
+ case -EAGAIN:
+ /* request would block (be queued) on remote master;
+ the unhold undoes the original ref from create_lkb()
+ so it leads to the lkb being freed */
+ queue_cast(r, lkb, -EAGAIN);
+ confirm_master(r, -EAGAIN);
+ unhold_lkb(lkb);
+ break;
+
+ case -EINPROGRESS:
+ case 0:
+ /* request was queued or granted on remote master */
+ receive_flags_reply(lkb, ms);
+ lkb->lkb_remid = ms->m_lkid;
+ if (error)
+ add_lkb(r, lkb, DLM_LKSTS_WAITING);
+ else {
+ grant_lock_pc(r, lkb, ms);
+ queue_cast(r, lkb, 0);
+ }
+ confirm_master(r, error);
+ break;
+
+ case -ENOENT:
+ case -ENOTBLK:
+ /* find_rsb failed to find rsb or rsb wasn't master */
+
+ DLM_ASSERT(test_bit(RESFL_MASTER_WAIT, &r->res_flags),
+ log_print("receive_request_reply error %d", error);
+ dlm_print_lkb(lkb);
+ dlm_print_rsb(r););
+
+ confirm_master(r, error);
+ lkb->lkb_nodeid = -1;
+ _request_lock(r, lkb);
+ break;
+
+ default:
+ log_error(ls, "receive_request_reply unknown error %d", error);
+ }
+
+ unlock_rsb(r);
+ put_rsb(r);
+ out:
+ put_lkb(lkb);
+}
+
+static void _receive_convert_reply(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_message *ms)
+{
+ struct dlm_rsb *r = lkb->lkb_resource;
+ int error = ms->m_result;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ /* this is the value returned from do_convert() on the master */
+
+ switch (error) {
+ case -EAGAIN:
+ /* convert would block (be queued) on remote master */
+ queue_cast(r, lkb, -EAGAIN);
+ break;
+
+ case -EINPROGRESS:
+ /* convert was queued on remote master */
+ del_lkb(r, lkb);
+ add_lkb(r, lkb, DLM_LKSTS_CONVERT);
+ break;
+
+ case 0:
+ /* convert was granted on remote master */
+ receive_flags_reply(lkb, ms);
+ grant_lock_pc(r, lkb, ms);
+ queue_cast(r, lkb, 0);
+ break;
+
+ default:
+ log_error(ls, "receive_convert_reply unknown error %d", error);
+ }
+
+ unlock_rsb(r);
+ put_rsb(r);
+}
+
+static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ int error;
+
+ error = find_lkb(ls, ms->m_remid, &lkb);
+ if (error) {
+ log_error(ls, "receive_convert_reply no lkb");
+ return;
+ }
+ DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+ error = remove_from_waiters(lkb);
+ if (error) {
+ log_error(ls, "receive_convert_reply not on waiters");
+ goto out;
+ }
+
+ _receive_convert_reply(ls, lkb, ms);
+ out:
+ put_lkb(lkb);
+}
+
+static void _receive_unlock_reply(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_message *ms)
+{
+ struct dlm_rsb *r = lkb->lkb_resource;
+ int error = ms->m_result;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ /* this is the value returned from do_unlock() on the master */
+
+ switch (error) {
+ case -DLM_EUNLOCK:
+ receive_flags_reply(lkb, ms);
+ remove_lock_pc(r, lkb);
+ queue_cast(r, lkb, -DLM_EUNLOCK);
+ break;
+ default:
+ log_error(ls, "receive_unlock_reply unknown error %d", error);
+ }
+
+ unlock_rsb(r);
+ put_rsb(r);
+}
+
+static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ int error;
+
+ error = find_lkb(ls, ms->m_remid, &lkb);
+ if (error) {
+ log_error(ls, "receive_unlock_reply no lkb");
+ return;
+ }
+ DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+ error = remove_from_waiters(lkb);
+ if (error) {
+ log_error(ls, "receive_unlock_reply not on waiters");
+ goto out;
+ }
+
+ _receive_unlock_reply(ls, lkb, ms);
+ out:
+ put_lkb(lkb);
+}
+
+static void _receive_cancel_reply(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_message *ms)
+{
+ struct dlm_rsb *r = lkb->lkb_resource;
+ int error = ms->m_result;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ /* this is the value returned from do_cancel() on the master */
+
+ switch (error) {
+ case -DLM_ECANCEL:
+ receive_flags_reply(lkb, ms);
+ revert_lock_pc(r, lkb);
+ queue_cast(r, lkb, -DLM_ECANCEL);
+ break;
+ default:
+ log_error(ls, "receive_cancel_reply unknown error %d", error);
+ }
+
+ unlock_rsb(r);
+ put_rsb(r);
+}
+
+static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ int error;
+
+ error = find_lkb(ls, ms->m_remid, &lkb);
+ if (error) {
+ log_error(ls, "receive_cancel_reply no lkb");
+ return;
+ }
+ DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+ error = remove_from_waiters(lkb);
+ if (error) {
+ log_error(ls, "receive_cancel_reply not on waiters");
+ goto out;
+ }
+
+ _receive_cancel_reply(ls, lkb, ms);
+ out:
+ put_lkb(lkb);
+}
+
+static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error, ret_nodeid;
+
+ error = find_lkb(ls, ms->m_lkid, &lkb);
+ if (error) {
+ log_error(ls, "receive_lookup_reply no lkb");
+ return;
+ }
+
+ error = remove_from_waiters(lkb);
+ if (error) {
+ log_error(ls, "receive_lookup_reply not on waiters");
+ goto out;
+ }
+
+ /* this is the value returned by dlm_dir_lookup on dir node
+ FIXME: will a non-zero error ever be returned? */
+ error = ms->m_result;
+
+ r = lkb->lkb_resource;
+ hold_rsb(r);
+ lock_rsb(r);
+
+ ret_nodeid = ms->m_nodeid;
+ if (ret_nodeid == dlm_our_nodeid())
+ r->res_nodeid = ret_nodeid = 0;
+ else {
+ r->res_nodeid = ret_nodeid;
+ r->res_trial_lkid = lkb->lkb_id;
+ }
+
+ _request_lock(r, lkb);
+
+ if (!ret_nodeid)
+ confirm_master(r, 0);
+
+ unlock_rsb(r);
+ put_rsb(r);
+ out:
+ put_lkb(lkb);
+}
+
+int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
+{
+ struct dlm_message *ms = (struct dlm_message *) hd;
+ struct dlm_ls *ls;
+ int error;
+
+ if (!recovery)
+ dlm_message_in(ms);
+
+ ls = dlm_find_lockspace_global(hd->h_lockspace);
+ if (!ls) {
+ log_print("drop message %d from %d for unknown lockspace %d",
+ ms->m_type, nodeid, hd->h_lockspace);
+ return -EINVAL;
+ }
+
+ /* recovery may have just ended leaving a bunch of backed-up requests
+ in the requestqueue; wait while dlm_recoverd clears them */
+
+ if (!recovery)
+ dlm_wait_requestqueue(ls);
+
+ /* recovery may have just started while there were a bunch of
+ in-flight requests -- save them in requestqueue to be processed
+ after recovery. we can't let dlm_recvd block on the recovery
+ lock. if dlm_recoverd is calling this function to clear the
+ requestqueue, it needs to be interrupted (-EINTR) if another
+ recovery operation is starting. */
+
+ while (1) {
+ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
+ if (!recovery)
+ dlm_add_requestqueue(ls, nodeid, hd);
+ error = -EINTR;
+ goto out;
+ }
+
+ if (lock_recovery_try(ls))
+ break;
+ schedule();
+ }
+
+ switch (ms->m_type) {
+
+ /* messages sent to a master node */
+
+ case DLM_MSG_REQUEST:
+ receive_request(ls, ms);
+ break;
+
+ case DLM_MSG_CONVERT:
+ receive_convert(ls, ms);
+ break;
+
+ case DLM_MSG_UNLOCK:
+ receive_unlock(ls, ms);
+ break;
+
+ case DLM_MSG_CANCEL:
+ receive_cancel(ls, ms);
+ break;
+
+ /* messages sent from a master node (replies to above) */
+
+ case DLM_MSG_REQUEST_REPLY:
+ receive_request_reply(ls, ms);
+ break;
+
+ case DLM_MSG_CONVERT_REPLY:
+ receive_convert_reply(ls, ms);
+ break;
+
+ case DLM_MSG_UNLOCK_REPLY:
+ receive_unlock_reply(ls, ms);
+ break;
+
+ case DLM_MSG_CANCEL_REPLY:
+ receive_cancel_reply(ls, ms);
+ break;
+
+ /* messages sent from a master node (only two types of async msg) */
+
+ case DLM_MSG_GRANT:
+ receive_grant(ls, ms);
+ break;
+
+ case DLM_MSG_BAST:
+ receive_bast(ls, ms);
+ break;
+
+ /* messages sent to a dir node */
+
+ case DLM_MSG_LOOKUP:
+ receive_lookup(ls, ms);
+ break;
+
+ case DLM_MSG_REMOVE:
+ receive_remove(ls, ms);
+ break;
+
+ /* messages sent from a dir node (remove has no reply) */
+
+ case DLM_MSG_LOOKUP_REPLY:
+ receive_lookup_reply(ls, ms);
+ break;
+
+ default:
+ log_error(ls, "unknown message type %d", ms->m_type);
+ }
+
+ unlock_recovery(ls);
+ out:
+ dlm_put_lockspace(ls);
+ dlm_astd_wake();
+ return 0;
+}
+
+
+/*
+ * Recovery related
+ */
+
+static int middle_conversion(struct dlm_lkb *lkb)
+{
+ if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) ||
+ (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW))
+ return TRUE;
+ return FALSE;
+}
+
+static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+ if (middle_conversion(lkb)) {
+ hold_lkb(lkb);
+ ls->ls_stub_ms.m_result = -EINPROGRESS;
+ _remove_from_waiters(lkb);
+ _receive_convert_reply(ls, lkb, &ls->ls_stub_ms);
+
+ /* Same special case as in receive_rcom_lock_args() */
+ lkb->lkb_grmode = DLM_LOCK_IV;
+ set_bit(RESFL_RECOVER_CONVERT, &lkb->lkb_resource->res_flags);
+ unhold_lkb(lkb);
+
+ } else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
+ lkb->lkb_flags |= DLM_IFL_RESEND;
+
+ } else if (lkb->lkb_rqmode < lkb->lkb_grmode) {
+ hold_lkb(lkb);
+ ls->ls_stub_ms.m_result = 0;
+ _remove_from_waiters(lkb);
+ _receive_convert_reply(ls, lkb, &ls->ls_stub_ms);
+ unhold_lkb(lkb);
+ }
+}
+
+/* Recovery for locks that are waiting for replies from nodes that are now
+ gone. We can just complete unlocks and cancels by faking a reply from the
+ dead node. Requests and up-conversions we just flag to be resent after
+ recovery. Down-conversions can just be completed with a fake reply like
+ unlocks. Conversions between PR and CW need special attention. */
+
+void dlm_recover_waiters_pre(struct dlm_ls *ls)
+{
+ struct dlm_lkb *lkb, *safe;
+
+ down(&ls->ls_waiters_sem);
+
+ list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
+ if (!dlm_is_removed(ls, lkb->lkb_nodeid))
+ continue;
+
+ log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
+ lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);
+
+ switch (lkb->lkb_wait_type) {
+
+ case DLM_MSG_REQUEST:
+ lkb->lkb_flags |= DLM_IFL_RESEND;
+ break;
+
+ case DLM_MSG_CONVERT:
+ recover_convert_waiter(ls, lkb);
+ break;
+
+ case DLM_MSG_UNLOCK:
+ hold_lkb(lkb);
+ ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
+ _remove_from_waiters(lkb);
+ _receive_unlock_reply(ls, lkb, &ls->ls_stub_ms);
+ put_lkb(lkb);
+ break;
+
+ case DLM_MSG_CANCEL:
+ hold_lkb(lkb);
+ ls->ls_stub_ms.m_result = -DLM_ECANCEL;
+ _remove_from_waiters(lkb);
+ _receive_cancel_reply(ls, lkb, &ls->ls_stub_ms);
+ put_lkb(lkb);
+ break;
+
+ case DLM_MSG_LOOKUP:
+ /* all outstanding lookups, regardless of dest.
+ will be resent after recovery is done */
+ break;
+
+ default:
+ log_error(ls, "invalid lkb wait_type %d",
+ lkb->lkb_wait_type);
+ }
+ }
+ up(&ls->ls_waiters_sem);
+}
+
+static int remove_resend_waiter(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
+{
+ struct dlm_lkb *lkb;
+ int rv = 0;
+
+ down(&ls->ls_waiters_sem);
+ list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+ if (lkb->lkb_flags & DLM_IFL_RESEND) {
+ rv = lkb->lkb_wait_type;
+ _remove_from_waiters(lkb);
+ lkb->lkb_flags &= ~DLM_IFL_RESEND;
+ break;
+ }
+ }
+ up(&ls->ls_waiters_sem);
+
+ if (!rv)
+ lkb = NULL;
+ *lkb_ret = lkb;
+ return rv;
+}
+
+/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the
+ master or dir-node for r. Processing the lkb may result in it being placed
+ back on waiters. */
+
+int dlm_recover_waiters_post(struct dlm_ls *ls)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error = 0, mstype;
+
+ while (1) {
+ if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
+ log_debug(ls, "recover_waiters_post aborted");
+ error = -EINTR;
+ break;
+ }
+
+ mstype = remove_resend_waiter(ls, &lkb);
+ if (!mstype)
+ break;
+
+ r = lkb->lkb_resource;
+
+ log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
+ lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
+
+ switch (mstype) {
+
+ case DLM_MSG_LOOKUP:
+ case DLM_MSG_REQUEST:
+ hold_rsb(r);
+ lock_rsb(r);
+ _request_lock(r, lkb);
+ unlock_rsb(r);
+ put_rsb(r);
+ break;
+
+ case DLM_MSG_CONVERT:
+ hold_rsb(r);
+ lock_rsb(r);
+ _convert_lock(r, lkb);
+ unlock_rsb(r);
+ put_rsb(r);
+ break;
+
+ default:
+ log_error(ls, "recover_waiters_post type %d", mstype);
+ }
+ }
+
+ return error;
+}
+
+static int purge_queue(struct dlm_rsb *r, struct list_head *queue)
+{
+ struct dlm_ls *ls = r->res_ls;
+ struct dlm_lkb *lkb, *safe;
+
+ list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
+ if (!is_master_copy(lkb))
+ continue;
+
+ if (dlm_is_removed(ls, lkb->lkb_nodeid)) {
+ del_lkb(r, lkb);
+ /* this put should free the lkb */
+ if (!put_lkb(lkb))
+ log_error(ls, "purged lkb not released");
+ }
+ }
+ return 0;
+}
+
+/*
+ * Get rid of locks held by nodes that are gone.
+ */
+
+int dlm_purge_locks(struct dlm_ls *ls)
+{
+ struct dlm_rsb *r;
+
+ log_debug(ls, "dlm_purge_locks");
+
+ down_write(&ls->ls_root_sem);
+ list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+ hold_rsb(r);
+ lock_rsb(r);
+
+ purge_queue(r, &r->res_grantqueue);
+ purge_queue(r, &r->res_convertqueue);
+ purge_queue(r, &r->res_waitqueue);
+
+ unlock_rsb(r);
+ unhold_rsb(r);
+
+ schedule();
+ }
+ up_write(&ls->ls_root_sem);
+
+ return 0;
+}
+
+int dlm_grant_after_purge(struct dlm_ls *ls)
+{
+ struct dlm_rsb *r;
+ int i;
+
+ for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+ read_lock(&ls->ls_rsbtbl[i].lock);
+ list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
+ hold_rsb(r);
+ lock_rsb(r);
+ if (is_master(r))
+ grant_pending_locks(r);
+ unlock_rsb(r);
+ put_rsb(r);
+ }
+ read_unlock(&ls->ls_rsbtbl[i].lock);
+ }
+
+ return 0;
+}
+
+static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
+ uint32_t remid)
+{
+ struct dlm_lkb *lkb;
+
+ list_for_each_entry(lkb, head, lkb_statequeue) {
+ if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid)
+ return lkb;
+ }
+ return NULL;
+}
+
+static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
+ uint32_t remid)
+{
+ struct dlm_lkb *lkb;
+
+ lkb = search_remid_list(&r->res_grantqueue, nodeid, remid);
+ if (lkb)
+ return lkb;
+ lkb = search_remid_list(&r->res_convertqueue, nodeid, remid);
+ if (lkb)
+ return lkb;
+ lkb = search_remid_list(&r->res_waitqueue, nodeid, remid);
+ if (lkb)
+ return lkb;
+ return NULL;
+}
+
+static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_rsb *r, struct dlm_rcom *rc)
+{
+ struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
+
+ lkb->lkb_nodeid = rc->rc_header.h_nodeid;
+ lkb->lkb_ownpid = rl->rl_ownpid;
+ lkb->lkb_remid = rl->rl_lkid;
+ lkb->lkb_exflags = rl->rl_exflags;
+ lkb->lkb_flags = rl->rl_flags & 0x0000FFFF;
+ lkb->lkb_flags |= DLM_IFL_MSTCPY;
+ lkb->lkb_lvbseq = rl->rl_lvbseq;
+ lkb->lkb_rqmode = rl->rl_rqmode;
+ lkb->lkb_grmode = rl->rl_grmode;
+ /* don't set lkb_status because add_lkb wants to itself */
+
+ lkb->lkb_bastaddr = (void *) (long) (rl->rl_asts & AST_BAST);
+ lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
+
+ if (lkb->lkb_flags & DLM_IFL_RANGE) {
+ lkb->lkb_range = allocate_range(ls);
+ if (!lkb->lkb_range)
+ return -ENOMEM;
+ memcpy(lkb->lkb_range, rl->rl_range, 4*sizeof(uint64_t));
+ }
+
+ if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
+ lkb->lkb_lvbptr = allocate_lvb(ls);
+ if (!lkb->lkb_lvbptr)
+ return -ENOMEM;
+ memcpy(lkb->lkb_lvbptr, rl->rl_lvb, DLM_LVB_LEN);
+ }
+
+ /* Conversions between PR and CW (middle modes) need special handling.
+ The real granted mode of these converting locks cannot be determined
+ until all locks have been rebuilt on the rsb (recover_conversion) */
+
+ if (rl->rl_wait_type == DLM_MSG_CONVERT && middle_conversion(lkb)) {
+ rl->rl_status = DLM_LKSTS_CONVERT;
+ lkb->lkb_grmode = DLM_LOCK_IV;
+ set_bit(RESFL_RECOVER_CONVERT, &r->res_flags);
+ }
+
+ return 0;
+}
+
+/* This lkb may have been recovered in a previous aborted recovery so we need
+ to check if the rsb already has an lkb with the given remote nodeid/lkid.
+ If so we just send back a standard reply. If not, we create a new lkb with
+ the given values and send back our lkid. We send back our lkid by sending
+ back the rcom_lock struct we got but with the remid field filled in. */
+
+int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+ struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
+ struct dlm_rsb *r;
+ struct dlm_lkb *lkb;
+ int error;
+
+ if (rl->rl_parent_lkid) {
+ error = -EOPNOTSUPP;
+ goto out;
+ }
+
+ error = find_rsb(ls, rl->rl_name, rl->rl_namelen, R_MASTER, &r);
+ if (error)
+ goto out;
+
+ lock_rsb(r);
+
+ lkb = search_remid(r, rc->rc_header.h_nodeid, rl->rl_lkid);
+ if (lkb) {
+ error = -EEXIST;
+ goto out_remid;
+ }
+
+ error = create_lkb(ls, &lkb);
+ if (error)
+ goto out_unlock;
+
+ error = receive_rcom_lock_args(ls, lkb, r, rc);
+ if (error) {
+ put_lkb(lkb);
+ goto out_unlock;
+ }
+
+ attach_lkb(r, lkb);
+ add_lkb(r, lkb, rl->rl_status);
+ error = 0;
+
+ out_remid:
+ /* this is the new value returned to the lock holder for
+ saving in its process-copy lkb */
+ rl->rl_remid = lkb->lkb_id;
+
+ out_unlock:
+ unlock_rsb(r);
+ put_rsb(r);
+ out:
+ rl->rl_result = error;
+ return error;
+}
+
+int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+ struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
+ struct dlm_rsb *r;
+ struct dlm_lkb *lkb;
+ int error;
+
+ error = find_lkb(ls, rl->rl_lkid, &lkb);
+ if (error) {
+ log_error(ls, "recover_process_copy no lkid %x", rl->rl_lkid);
+ return error;
+ }
+
+ DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+ error = rl->rl_result;
+
+ r = lkb->lkb_resource;
+ hold_rsb(r);
+ lock_rsb(r);
+
+ switch (error) {
+ case -EEXIST:
+ log_debug(ls, "master copy exists %x", lkb->lkb_id);
+ /* fall through */
+ case 0:
+ lkb->lkb_remid = rl->rl_remid;
+ break;
+ default:
+ log_error(ls, "dlm_recover_process_copy unknown error %d %x",
+ error, lkb->lkb_id);
+ }
+
+ /* an ack for dlm_recover_locks() which waits for replies from
+ all the locks it sends to new masters */
+ dlm_recovered_lock(r);
+
+ unlock_rsb(r);
+ put_rsb(r);
+ put_lkb(lkb);
+
+ return 0;
+}
+
+
David Teigland writes:
>
> The core dlm functions. Processes dlm_lock() and dlm_unlock() requests.
[...]
> +
> +static int is_remote(struct dlm_rsb *r)
> +{
> + DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
> + return r->res_nodeid ? TRUE : FALSE;
> +}
This can be simply
return r->res_nodeid;
> +
> +static int is_master(struct dlm_rsb *r)
> +{
> + return r->res_nodeid ? FALSE : TRUE;
> +}
This duplicates dlm_is_master() for no obvious reason.
> +
> +int dlm_is_master(struct dlm_rsb *r)
> +{
> + return r->res_nodeid ? FALSE : TRUE;
> +}
This can be simply
return !r->res_nodeid;
Nikita.
A few small comments below. I just did a quick scan of the code - and
damn, there's a lot of it in one patch...
On Tue, 26 Apr 2005, David Teigland wrote:
>
> The core dlm functions. Processes dlm_lock() and dlm_unlock() requests.
> Creates lockspaces which give applications separate contexts/namespaces in
> which to do their locking. Manages locks on resources' grant/convert/wait
> queues. Sends and receives high level locking operations between nodes.
> Delivers completion and blocking callbacks (ast's) to lock holders.
> Manages the distributed directory that tracks the current master node for
> each resource.
>
> Signed-Off-By: Dave Teigland <[email protected]>
> Signed-Off-By: Patrick Caulfield <[email protected]>
>
> ---
>
> +void dlm_print_lkb(struct dlm_lkb *lkb)
> +{
> + printk("lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
^^^
Explicit loglevel here?
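For instance (loglevel picked arbitrarily here, KERN_DEBUG vs KERN_ERR is
your call):

	printk(KERN_DEBUG "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
	       " status %d rqmode %d grmode %d wait_type %d ast_type %d\n",
	       lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
	       lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
	       lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);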
[...]
> +void dlm_print_rsb(struct dlm_rsb *r)
> +{
> + printk("rsb: nodeid %d flags %lx trial %x name %s\n",
^^^
Loglevel?
[...]
> +static int can_be_queued(struct dlm_lkb *lkb)
> +{
> + return (!(lkb->lkb_exflags & DLM_LKF_NOQUEUE));
return is not a function - no need for the parentheses.
> +static int dir_remove(struct dlm_rsb *r)
> +{
> + int to_nodeid = dlm_dir_nodeid(r);
> +
> + if (to_nodeid != dlm_our_nodeid())
> + send_remove(r);
> + else
> + dlm_dir_remove_entry(r->res_ls, to_nodeid,
> + r->res_name, r->res_length);
> + return 0;
> +}
Always returning 0 - then why not just a void function?
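Something like this (untested sketch; the only caller I can see,
shrink_bucket(), ignores the return value anyway):

	static void dir_remove(struct dlm_rsb *r)
	{
		int to_nodeid = dlm_dir_nodeid(r);

		if (to_nodeid != dlm_our_nodeid())
			send_remove(r);
		else
			dlm_dir_remove_entry(r->res_ls, to_nodeid,
					     r->res_name, r->res_length);
	}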
> +static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
> + unsigned int flags, struct dlm_rsb **r_ret)
> +{
> + struct dlm_rsb *r;
> + int error;
> +
> + error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
> + if (!error) {
> + kref_get(&r->res_ref);
> + goto out;
> + }
> + error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
> + if (!error) {
You could make this
if (error)
goto out;
and save a level of indentation for the remaining code.
> + list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);
> +
> + if (r->res_nodeid == -1) {
> + clear_bit(RESFL_MASTER_WAIT, &r->res_flags);
> + clear_bit(RESFL_MASTER_UNCERTAIN, &r->res_flags);
> + r->res_trial_lkid = 0;
> + } else if (r->res_nodeid > 0) {
> + clear_bit(RESFL_MASTER_WAIT, &r->res_flags);
> + set_bit(RESFL_MASTER_UNCERTAIN, &r->res_flags);
> + r->res_trial_lkid = 0;
> + } else {
> + DLM_ASSERT(r->res_nodeid == 0,
> + dlm_print_rsb(r););
> + DLM_ASSERT(!test_bit(RESFL_MASTER_WAIT, &r->res_flags),
> + dlm_print_rsb(r););
> + DLM_ASSERT(!test_bit(RESFL_MASTER_UNCERTAIN,
> + &r->res_flags),);
> + }
> + }
> + out:
> + *r_ret = r;
> + return error;
> +}
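To make that concrete, here is roughly what the function looks like with the
early exit - same behaviour, one less level of indentation for the toss-list
case:

	static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
			       unsigned int flags, struct dlm_rsb **r_ret)
	{
		struct dlm_rsb *r;
		int error;

		error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
		if (!error) {
			kref_get(&r->res_ref);
			goto out;
		}

		error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
		if (error)
			goto out;

		/* found on the toss list: move it back to the active list
		   and reset the cached master state, exactly as before */
		list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);

		if (r->res_nodeid == -1) {
			clear_bit(RESFL_MASTER_WAIT, &r->res_flags);
			clear_bit(RESFL_MASTER_UNCERTAIN, &r->res_flags);
			r->res_trial_lkid = 0;
		} else if (r->res_nodeid > 0) {
			clear_bit(RESFL_MASTER_WAIT, &r->res_flags);
			set_bit(RESFL_MASTER_UNCERTAIN, &r->res_flags);
			r->res_trial_lkid = 0;
		} else {
			DLM_ASSERT(r->res_nodeid == 0, dlm_print_rsb(r););
			DLM_ASSERT(!test_bit(RESFL_MASTER_WAIT, &r->res_flags),
				   dlm_print_rsb(r););
			DLM_ASSERT(!test_bit(RESFL_MASTER_UNCERTAIN,
					     &r->res_flags),);
		}
	 out:
		*r_ret = r;
		return error;
	}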
[...]
> +void dlm_scan_rsbs(struct dlm_ls *ls)
> +{
> + int i, count = 0;
> +
> + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags))
> + return;
> +
> + for (i = 0; i < ls->ls_rsbtbl_size; i++) {
> + count += shrink_bucket(ls, i);
> + cond_resched();
> + }
> +}
What's the use of the `count' variable here? It's a local variable, and
all you ever do is add values to it; you never use the value of
`count' for anything. Why not just get rid of `count' altogether?
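I.e. it could simply be:

	void dlm_scan_rsbs(struct dlm_ls *ls)
	{
		int i;

		if (!test_bit(LSFL_LS_RUN, &ls->ls_flags))
			return;

		for (i = 0; i < ls->ls_rsbtbl_size; i++) {
			shrink_bucket(ls, i);
			cond_resched();
		}
	}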
> +static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
> +{
> + struct dlm_lkb *lkb;
> + uint32_t lkid;
> + uint16_t bucket;
> +
> + lkb = allocate_lkb(ls);
> + if (!lkb)
> + return -ENOMEM;
> +
> + lkb->lkb_nodeid = -1;
> + lkb->lkb_grmode = DLM_LOCK_IV;
> + kref_init(&lkb->lkb_ref);
> +
> + get_random_bytes(&bucket, sizeof(bucket));
> + bucket &= (ls->ls_lkbtbl_size - 1);
> +
> + write_lock(&ls->ls_lkbtbl[bucket].lock);
> + lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
> + /* FIXME: do a find to verify lkid not in use */
^^^^^--- Why not fix that issue before merging with mainline?
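A rough (untested) idea of what that check could look like - the bucket's
write_lock is already held here, and this assumes __find_lkb() gets a
forward declaration or is moved above create_lkb(), since it is defined
further down in the file:

	do {
		lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
	} while (__find_lkb(ls, lkid));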
> +
> + DLM_ASSERT(lkid, );
                         ^^^--- looks like a parameter is missing.
[...]
> +/*
> + * Two stage 1 varieties: dlm_lock() and dlm_unlock()
> + */
> +
> +int dlm_lock(dlm_lockspace_t *lockspace,
> + int mode,
> + struct dlm_lksb *lksb,
> + uint32_t flags,
> + void *name,
> + unsigned int namelen,
> + uint32_t parent_lkid,
> + void (*ast) (void *astarg),
> + void *astarg,
> + void (*bast) (void *astarg, int mode),
> + struct dlm_range *range)
> +{
^^^^^ Why this difference in style for the function
parameters compared to the other functions?
Using a common style is often preferred.
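E.g. packed the same way as the rest of the file:

	int dlm_lock(dlm_lockspace_t *lockspace, int mode, struct dlm_lksb *lksb,
		     uint32_t flags, void *name, unsigned int namelen,
		     uint32_t parent_lkid, void (*ast) (void *astarg),
		     void *astarg, void (*bast) (void *astarg, int mode),
		     struct dlm_range *range)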
[...]
> +/* change some property of an existing lkb, e.g. mode, range */
> +
> +static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
This is just a personal preference of mine, but why the blank line between
the comment describing the function and the function itself? Many
functions, many comments, lots of blank lines that are really not needed,
lots of screen real estate wasted... Oh well, no big deal; it's just my
personal preference not to waste screen space like that.
[...]
> +static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + int error;
> +
> + if (is_remote(r))
> + /* receive_unlock() calls call do_unlock() on remote node */
^^^^^^^^^^^--- small typo there?
[...]
> +static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + int b;
> +
> + /* b=1 lvb returned to caller
> + b=0 lvb written to rsb or invalidated
> + b=-1 do nothing */
b==[1,0,-1] surely...?
[...]
> +static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + if (lkb->lkb_grmode < DLM_LOCK_PW)
> + return;
goto out;
> +
> + if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
> + set_bit(RESFL_VALNOTVALID, &r->res_flags);
> + return;
goto out;
> + }
> +
> + if (!lkb->lkb_lvbptr)
> + return;
goto out;
> +
> + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
> + return;
goto out;
> +
> + if (!r->res_lvbptr)
> + r->res_lvbptr = allocate_lvb(r->res_ls);
> +
> + memcpy(r->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
> + r->res_lvbseq++;
> + clear_bit(RESFL_VALNOTVALID, &r->res_flags);
out:
return;
> +}
A single function exit point instead of multiple return statements reduces
the risk of errors when the code is later modified.
Applies to many other functions besides this one (and this one may not
even be the best example, but hey, I wanted to make that comment, and
this function was at hand).
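For this particular function, the single-exit version would look something
like:

	static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
	{
		if (lkb->lkb_grmode < DLM_LOCK_PW)
			goto out;

		if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
			set_bit(RESFL_VALNOTVALID, &r->res_flags);
			goto out;
		}

		if (!lkb->lkb_lvbptr)
			goto out;

		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
			goto out;

		if (!r->res_lvbptr)
			r->res_lvbptr = allocate_lvb(r->res_ls);

		memcpy(r->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
		r->res_lvbseq++;
		clear_bit(RESFL_VALNOTVALID, &r->res_flags);
	 out:
		return;
	}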
[...]
> +static int grant_pending_locks(struct dlm_rsb *r)
> +{
> + struct dlm_lkb *lkb, *s;
> + int high = DLM_LOCK_IV;
> +
> + DLM_ASSERT(is_master(r), dlm_print_rsb(r););
> +
> + high = grant_pending_convert(r, high);
> + high = grant_pending_wait(r, high);
> +
> + if (high == DLM_LOCK_IV)
> + return 0;
> +
> + /*
> + * If there are locks left on the wait/convert queue then send blocking
> + * ASTs to granted locks based on the largest requested mode (high)
> + * found above. This can generate spurious blocking ASTs for range
> + * locks. FIXME: highbast < high comparison not valid for PR/CW.
> + */
> +
> + list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
> + if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) &&
> + !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
> + queue_bast(r, lkb, high);
> + lkb->lkb_highbast = high;
> + }
> + }
> +
> + return 0;
> +}
This function only ever returns 0 - why not make it return void instead?
[...]
> +static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
> +{
> + dlm_message_out(ms);
> + dlm_lowcomms_commit_buffer(mh);
> + return 0;
> +}
Make it return void instead, since you only ever return 0 anyway.
[...]
> +static int purge_queue(struct dlm_rsb *r, struct list_head *queue)
> +{
> + struct dlm_ls *ls = r->res_ls;
> + struct dlm_lkb *lkb, *safe;
> +
> + list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
> + if (!is_master_copy(lkb))
> + continue;
> +
> + if (dlm_is_removed(ls, lkb->lkb_nodeid)) {
> + del_lkb(r, lkb);
> + /* this put should free the lkb */
> + if (!put_lkb(lkb))
> + log_error(ls, "purged lkb not released");
> + }
> + }
> + return 0;
> +}
If you only ever return 0, why return a value at all?
This is the case with many functions, a few of which I've most certainly
missed (and I'm not going to keep repeating myself with this any more),
so take a closer look yourself :-)
[...]
> +}
> +
> +
Ok, this is nitpicking in the extreme; one newline at end-of-file is
super, but surely you don't need two. ;-)
One other general thing: you seem to have a lot of functions that return
0/1 and a lot that return TRUE/FALSE - why not be consistent?
--
Jesper Juhl
Hi Nikita,
On Monday 25 April 2005 14:34, you wrote:
> > +
> > +static int is_remote(struct dlm_rsb *r)
> > +{
> > + DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
> > + return r->res_nodeid ? TRUE : FALSE;
> > +}
>
> This can be simply
>
> return r->res_nodeid;
Not quite the same: that returns the raw nodeid value rather than TRUE/FALSE.
Perhaps you meant:
return !!r->res_nodeid;
Regards,
Daniel
On Mon, 2005-04-25 at 09:58, David Teigland wrote:
> The core dlm functions. Processes dlm_lock() and dlm_unlock() requests.
> Creates lockspaces which give applications separate contexts/namespaces in
> which to do their locking. Manages locks on resources' grant/convert/wait
> queues. Sends and receives high level locking operations between nodes.
> Delivers completion and blocking callbacks (ast's) to lock holders.
> Manages the distributed directory that tracks the current master node for
> each resource.
>
David
Very positive that there are some submissions relating to cluster kernel
work for lkml to review.. good job..
I have some questions on the implementation:
It appears as though a particular processor is identified as the "lock
master" or processor that maintains the state of the lock. So for
example, if a processor wants to acquire a lock, it sends a request to
the lock master which either grants or rejects the request for the
lock. What happens in the scenario that a lock master leaves the
current configuration? This scenario is very likely in practice. How
do you synchronize the membership events that occur with the kernel-to-kernel
communication that takes place using SCTP?
It appears from your patches there is some external (userland)
application that maintains the current list of processors that qualify
as "lock servers". Is there then a dependence on external membership
algorithms? What user application is used today to configure the dlm
services in the posted patch?
With the use of SCTP, there is now some idea of moving the
protocol for cluster communication into the kernel and using SCTP as
that protocol...
I wonder if you couldn't benefit from a virtual synchrony protocol
available for kernel use for communicating lock state to processors
within the configuration. I know you have mentioned in the past this
might work for you... Could you expand on how you see these sorts of
communications services being of use to the redhat dlm? Or are you
planning to stick with SCTP for intra-processor lock state
communication?
Finally, the openais project's evs service could really benefit from
your comments on the services desired by the kernel dlm. Any guidance you
could provide here would be valuable. I know you had mentioned in the
cluster sig that there is no need for communication in the kernel and that
there are plans to do that stuff in userland.. I would like to map this out
in relation to the current reliance on SCTP, the protocol for communicating
lock state that currently resides in the kernel in these patches...
regards
-steve
> Signed-Off-By: Dave Teigland <[email protected]>
> Signed-Off-By: Patrick Caulfield <[email protected]>
>
> ---
>
> drivers/dlm/lock.c | 3546 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 files changed, 3546 insertions(+)
>
> --- a/drivers/dlm/lock.c 1970-01-01 07:30:00.000000000 +0730
> +++ b/drivers/dlm/lock.c 2005-04-25 22:52:03.924821624 +0800
> @@ -0,0 +1,3546 @@
> +/******************************************************************************
> +*******************************************************************************
> +**
> +** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
> +**
> +** This copyrighted material is made available to anyone wishing to use,
> +** modify, copy, or redistribute it subject to the terms and conditions
> +** of the GNU General Public License v.2.
> +**
> +*******************************************************************************
> +******************************************************************************/
> +
> +#include "dlm_internal.h"
> +#include "memory.h"
> +#include "lowcomms.h"
> +#include "requestqueue.h"
> +#include "util.h"
> +#include "dir.h"
> +#include "member.h"
> +#include "lockspace.h"
> +#include "ast.h"
> +#include "lock.h"
> +#include "rcom.h"
> +#include "recover.h"
> +#include "lvb_table.h"
> +
> +/* Central locking logic has four stages:
> +
> + dlm_lock()
> + dlm_unlock()
> +
> + request_lock(ls, lkb)
> + convert_lock(ls, lkb)
> + unlock_lock(ls, lkb)
> + cancel_lock(ls, lkb)
> +
> + _request_lock(r, lkb)
> + _convert_lock(r, lkb)
> + _unlock_lock(r, lkb)
> + _cancel_lock(r, lkb)
> +
> + do_request(r, lkb)
> + do_convert(r, lkb)
> + do_unlock(r, lkb)
> + do_cancel(r, lkb)
> +
> +
> + Stage 1 (lock, unlock) is mainly about checking input args and
> + splitting into one of the four main operations:
> +
> + dlm_lock = request_lock
> + dlm_lock+CONVERT = convert_lock
> + dlm_unlock = unlock_lock
> + dlm_unlock+CANCEL = cancel_lock
> +
> + Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is
> + provided to the next stage.
> +
> + Stage 3, _xxxx_lock(), determines if the operation is local or remote.
> + When remote, it calls send_xxxx(), when local it calls do_xxxx().
> +
> + Stage 4, do_xxxx(), is the guts of the operation. It manipulates the
> + given rsb and lkb and queues callbacks.
> +
> +
> + For remote operations, the send_xxxx() results in the corresponding
> + do_xxxx() function being executed on the remote node. The connecting
> + send/receive calls on local (L) and remote (R) nodes:
> +
> + L: send_xxxx() -> R: receive_xxxx()
> + R: do_xxxx()
> + L: receive_xxxx_reply() <- R: send_xxxx_reply()
> +*/
> +
> +static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
> + int len, struct dlm_args *args);
> +static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
> + struct dlm_args *args);
> +static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
> + struct dlm_args *args);
> +static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
> + struct dlm_args *args);
> +
> +static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
> +static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
> +static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
> +static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
> +
> +static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
> +static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
> +static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
> +static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
> +
> +static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
> +static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
> +static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
> +static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
> +static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb);
> +static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
> +static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
> +static int send_remove(struct dlm_rsb *r);
> +
> +
> +/*
> + * Lock compatibilty matrix - thanks Steve
> + * UN = Unlocked state. Not really a state, used as a flag
> + * PD = Padding. Used to make the matrix a nice power of two in size
> + * Other states are the same as the VMS DLM.
> + * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
> + */
> +
> +const int __dlm_compat_matrix[8][8] = {
> + /* UN NL CR CW PR PW EX PD */
> + {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
> + {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
> + {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
> + {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
> + {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
> + {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
> + {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
> + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
> +};
> +
> +#define modes_compat(gr, rq) \
> + __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
> +
> +int dlm_modes_compat(int mode1, int mode2)
> +{
> + return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
> +}
> +
> +/*
> + * Compatibility matrix for conversions with QUECVT set.
> + * Granted mode is the row; requested mode is the column.
> + * Usage: matrix[grmode+1][rqmode+1]
> + */
> +
> +const int __quecvt_compat_matrix[8][8] = {
> + /* UN NL CR CW PR PW EX PD */
> + {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
> + {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
> + {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
> + {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
> + {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
> + {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
> + {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
> + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
> +};
> +
> +void dlm_print_lkb(struct dlm_lkb *lkb)
> +{
> + printk("lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
> + " status %d rqmode %d grmode %d wait_type %d ast_type %d\n",
> + lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
> + lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
> + lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);
> +}
> +
> +void dlm_print_rsb(struct dlm_rsb *r)
> +{
> + printk("rsb: nodeid %d flags %lx trial %x name %s\n",
> + r->res_nodeid, r->res_flags, r->res_trial_lkid, r->res_name);
> +}
> +
> +/* Threads cannot use the lockspace while it's being recovered */
> +
> +static void lock_recovery(struct dlm_ls *ls)
> +{
> + down_read(&ls->ls_in_recovery);
> +}
> +
> +static void unlock_recovery(struct dlm_ls *ls)
> +{
> + up_read(&ls->ls_in_recovery);
> +}
> +
> +static int lock_recovery_try(struct dlm_ls *ls)
> +{
> + return down_read_trylock(&ls->ls_in_recovery);
> +}
> +
> +static int can_be_queued(struct dlm_lkb *lkb)
> +{
> + return (!(lkb->lkb_exflags & DLM_LKF_NOQUEUE));
> +}
> +
> +static int force_blocking_asts(struct dlm_lkb *lkb)
> +{
> + return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST);
> +}
> +
> +static int is_demoted(struct dlm_lkb *lkb)
> +{
> + return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
> +}
> +
> +static int is_remote(struct dlm_rsb *r)
> +{
> + DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
> + return r->res_nodeid ? TRUE : FALSE;
> +}
> +
> +static int is_master(struct dlm_rsb *r)
> +{
> + return r->res_nodeid ? FALSE : TRUE;
> +}
> +
> +int dlm_is_master(struct dlm_rsb *r)
> +{
> + return r->res_nodeid ? FALSE : TRUE;
> +}
> +
> +static int is_process_copy(struct dlm_lkb *lkb)
> +{
> + return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY));
> +}
> +
> +static int is_master_copy(struct dlm_lkb *lkb)
> +{
> + if (lkb->lkb_flags & DLM_IFL_MSTCPY)
> + DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb););
> + return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? TRUE : FALSE;
> +}
> +
> +static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
> +{
> + if (is_master_copy(lkb))
> + return;
> +
> + DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
> +
> + lkb->lkb_lksb->sb_status = rv;
> + lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
> +
> + dlm_add_ast(lkb, AST_COMP);
> +}
> +
> +static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
> +{
> + if (is_master_copy(lkb))
> + send_bast(r, lkb, rqmode);
> + else {
> + lkb->lkb_bastmode = rqmode;
> + dlm_add_ast(lkb, AST_BAST);
> + }
> +}
> +
> +static int dir_remove(struct dlm_rsb *r)
> +{
> + int to_nodeid = dlm_dir_nodeid(r);
> +
> + if (to_nodeid != dlm_our_nodeid())
> + send_remove(r);
> + else
> + dlm_dir_remove_entry(r->res_ls, to_nodeid,
> + r->res_name, r->res_length);
> + return 0;
> +}
> +
> +
> +/*
> + * Basic operations on rsb's and lkb's
> + */
> +
> +static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
> +{
> + struct dlm_rsb *r;
> +
> + r = allocate_rsb(ls, len);
> + if (!r)
> + return NULL;
> +
> + r->res_ls = ls;
> + r->res_length = len;
> + memcpy(r->res_name, name, len);
> + init_MUTEX(&r->res_sem);
> +
> + INIT_LIST_HEAD(&r->res_lookup);
> + INIT_LIST_HEAD(&r->res_grantqueue);
> + INIT_LIST_HEAD(&r->res_convertqueue);
> + INIT_LIST_HEAD(&r->res_waitqueue);
> + INIT_LIST_HEAD(&r->res_root_list);
> + INIT_LIST_HEAD(&r->res_recover_list);
> +
> + return r;
> +}
> +
> +static int search_rsb_list(struct list_head *head, char *name, int len,
> + unsigned int flags, struct dlm_rsb **r_ret)
> +{
> + struct dlm_rsb *r;
> + int error = 0;
> +
> + list_for_each_entry(r, head, res_hashchain) {
> + if (len == r->res_length && !memcmp(name, r->res_name, len))
> + goto found;
> + }
> + return -ENOENT;
> +
> + found:
> + if (r->res_nodeid && (flags & R_MASTER))
> + error = -ENOTBLK;
> + *r_ret = r;
> + return error;
> +}
> +
> +static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
> + unsigned int flags, struct dlm_rsb **r_ret)
> +{
> + struct dlm_rsb *r;
> + int error;
> +
> + error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
> + if (!error) {
> + kref_get(&r->res_ref);
> + goto out;
> + }
> + error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
> + if (!error) {
> + list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);
> +
> + if (r->res_nodeid == -1) {
> + clear_bit(RESFL_MASTER_WAIT, &r->res_flags);
> + clear_bit(RESFL_MASTER_UNCERTAIN, &r->res_flags);
> + r->res_trial_lkid = 0;
> + } else if (r->res_nodeid > 0) {
> + clear_bit(RESFL_MASTER_WAIT, &r->res_flags);
> + set_bit(RESFL_MASTER_UNCERTAIN, &r->res_flags);
> + r->res_trial_lkid = 0;
> + } else {
> + DLM_ASSERT(r->res_nodeid == 0,
> + dlm_print_rsb(r););
> + DLM_ASSERT(!test_bit(RESFL_MASTER_WAIT, &r->res_flags),
> + dlm_print_rsb(r););
> + DLM_ASSERT(!test_bit(RESFL_MASTER_UNCERTAIN,
> + &r->res_flags),);
> + }
> + }
> + out:
> + *r_ret = r;
> + return error;
> +}
> +
> +static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
> + unsigned int flags, struct dlm_rsb **r_ret)
> +{
> + int error;
> + write_lock(&ls->ls_rsbtbl[b].lock);
> + error = _search_rsb(ls, name, len, b, flags, r_ret);
> + write_unlock(&ls->ls_rsbtbl[b].lock);
> + return error;
> +}
> +
> +/*
> + * Find rsb in rsbtbl and potentially create/add one
> + *
> + * Delaying the release of rsb's has a similar benefit to applications keeping
> + * NL locks on an rsb, but without the guarantee that the cached master value
> + * will still be valid when the rsb is reused. Apps aren't always smart enough
> + * to keep NL locks on an rsb that they may lock again shortly; this can lead
> + * to excessive master lookups and removals if we don't delay the release.
> + *
> + * Searching for an rsb means looking through both the normal list and toss
> + * list. When found on the toss list the rsb is moved to the normal list with
> + * ref count of 1; when found on normal list the ref count is incremented.
> + */
> +
> +static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
> + unsigned int flags, struct dlm_rsb **r_ret)
> +{
> + struct dlm_rsb *r, *tmp;
> + uint32_t bucket;
> + int error = 0;
> +
> + bucket = dlm_hash(name, namelen);
> + bucket &= (ls->ls_rsbtbl_size - 1);
> +
> + error = search_rsb(ls, name, namelen, bucket, flags, &r);
> + if (!error)
> + goto out;
> +
> + if (error == -ENOENT && !(flags & R_CREATE))
> + goto out;
> +
> + /* the rsb was found but wasn't a master copy */
> + if (error == -ENOTBLK)
> + goto out;
> +
> + error = -ENOMEM;
> + r = create_rsb(ls, name, namelen);
> + if (!r)
> + goto out;
> +
> + r->res_bucket = bucket;
> + r->res_nodeid = -1;
> + kref_init(&r->res_ref);
> +
> + write_lock(&ls->ls_rsbtbl[bucket].lock);
> + error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
> + if (!error) {
> + write_unlock(&ls->ls_rsbtbl[bucket].lock);
> + free_rsb(r);
> + r = tmp;
> + goto out;
> + }
> + list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
> + write_unlock(&ls->ls_rsbtbl[bucket].lock);
> + error = 0;
> + out:
> + *r_ret = r;
> + return error;
> +}
> +
> +int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
> + unsigned int flags, struct dlm_rsb **r_ret)
> +{
> + return find_rsb(ls, name, namelen, flags, r_ret);
> +}
> +
> +/* This is only called to add a reference when the code already holds
> + a valid reference to the rsb, so there's no need for locking. */
> +
> +static void hold_rsb(struct dlm_rsb *r)
> +{
> + kref_get(&r->res_ref);
> +}
> +
> +void dlm_hold_rsb(struct dlm_rsb *r)
> +{
> + hold_rsb(r);
> +}
> +
> +static void toss_rsb(struct kref *kref)
> +{
> + struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
> + struct dlm_ls *ls = r->res_ls;
> +
> + DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
> + kref_init(&r->res_ref);
> + list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
> + r->res_toss_time = jiffies;
> + if (r->res_lvbptr) {
> + free_lvb(r->res_lvbptr);
> + r->res_lvbptr = NULL;
> + }
> +}
> +
> +/* When all references to the rsb are gone it's transfered to
> + the tossed list for later disposal. */
> +
> +static void put_rsb(struct dlm_rsb *r)
> +{
> + struct dlm_ls *ls = r->res_ls;
> + uint32_t bucket = r->res_bucket;
> +
> + write_lock(&ls->ls_rsbtbl[bucket].lock);
> + kref_put(&r->res_ref, toss_rsb);
> + write_unlock(&ls->ls_rsbtbl[bucket].lock);
> +}
> +
> +void dlm_put_rsb(struct dlm_rsb *r)
> +{
> + put_rsb(r);
> +}
> +
> +/* See comment for unhold_lkb */
> +
> +static void unhold_rsb(struct dlm_rsb *r)
> +{
> + int rv;
> + rv = kref_put(&r->res_ref, toss_rsb);
> + DLM_ASSERT(!rv, dlm_print_rsb(r););
> +}
> +
> +static void kill_rsb(struct kref *kref)
> +{
> + struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
> +
> + /* All work is done after the return from kref_put() so we
> + can release the write_lock before the remove and free. */
> +
> + DLM_ASSERT(list_empty(&r->res_lookup),);
> + DLM_ASSERT(list_empty(&r->res_grantqueue),);
> + DLM_ASSERT(list_empty(&r->res_convertqueue),);
> + DLM_ASSERT(list_empty(&r->res_waitqueue),);
> + DLM_ASSERT(list_empty(&r->res_root_list),);
> + DLM_ASSERT(list_empty(&r->res_recover_list),);
> +}
> +
> +/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is
> + found since they are in order of newest to oldest? */
> +
> +static int shrink_bucket(struct dlm_ls *ls, int b)
> +{
> + struct dlm_rsb *r;
> + int count = 0, found;
> +
> + for (;;) {
> + found = FALSE;
> + write_lock(&ls->ls_rsbtbl[b].lock);
> + list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
> + res_hashchain) {
> + if (!time_after_eq(jiffies, r->res_toss_time +
> + DLM_TOSS_SECS * HZ))
> + continue;
> + found = TRUE;
> + break;
> + }
> +
> + if (!found) {
> + write_unlock(&ls->ls_rsbtbl[b].lock);
> + break;
> + }
> +
> + if (kref_put(&r->res_ref, kill_rsb)) {
> + list_del(&r->res_hashchain);
> + write_unlock(&ls->ls_rsbtbl[b].lock);
> +
> + if (is_master(r))
> + dir_remove(r);
> + free_rsb(r);
> + count++;
> + } else {
> + write_unlock(&ls->ls_rsbtbl[b].lock);
> + log_error(ls, "tossed rsb in use %s", r->res_name);
> + }
> + }
> +
> + return count;
> +}
> +
> +void dlm_scan_rsbs(struct dlm_ls *ls)
> +{
> + int i, count = 0;
> +
> + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags))
> + return;
> +
> + for (i = 0; i < ls->ls_rsbtbl_size; i++) {
> + count += shrink_bucket(ls, i);
> + cond_resched();
> + }
> +}
> +
> +/* exclusive access to rsb and all its locks */
> +
> +static void lock_rsb(struct dlm_rsb *r)
> +{
> + down(&r->res_sem);
> +}
> +
> +static void unlock_rsb(struct dlm_rsb *r)
> +{
> + up(&r->res_sem);
> +}
> +
> +void dlm_lock_rsb(struct dlm_rsb *r)
> +{
> + lock_rsb(r);
> +}
> +
> +void dlm_unlock_rsb(struct dlm_rsb *r)
> +{
> + unlock_rsb(r);
> +}
> +
> +/* Attaching/detaching lkb's from rsb's is for rsb reference counting.
> + The rsb must exist as long as any lkb's for it do. */
> +
> +static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + hold_rsb(r);
> + lkb->lkb_resource = r;
> +}
> +
> +static void detach_lkb(struct dlm_lkb *lkb)
> +{
> + if (lkb->lkb_resource) {
> + put_rsb(lkb->lkb_resource);
> + lkb->lkb_resource = NULL;
> + }
> +}
> +
> +static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
> +{
> + struct dlm_lkb *lkb;
> + uint32_t lkid;
> + uint16_t bucket;
> +
> + lkb = allocate_lkb(ls);
> + if (!lkb)
> + return -ENOMEM;
> +
> + lkb->lkb_nodeid = -1;
> + lkb->lkb_grmode = DLM_LOCK_IV;
> + kref_init(&lkb->lkb_ref);
> +
> + get_random_bytes(&bucket, sizeof(bucket));
> + bucket &= (ls->ls_lkbtbl_size - 1);
> +
> + write_lock(&ls->ls_lkbtbl[bucket].lock);
> + lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
> + /* FIXME: do a find to verify lkid not in use */
> +
> + DLM_ASSERT(lkid, );
> +
> + lkb->lkb_id = lkid;
> + list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
> + write_unlock(&ls->ls_lkbtbl[bucket].lock);
> +
> + *lkb_ret = lkb;
> + return 0;
> +}
> +
> +static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
> +{
> + uint16_t bucket = lkid & 0xFFFF;
> + struct dlm_lkb *lkb;
> +
> + list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) {
> + if (lkb->lkb_id == lkid)
> + return lkb;
> + }
> + return NULL;
> +}
> +
> +static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
> +{
> + struct dlm_lkb *lkb;
> + uint16_t bucket = lkid & 0xFFFF;
> +
> + if (bucket >= ls->ls_lkbtbl_size)
> + return -EBADSLT;
> +
> + read_lock(&ls->ls_lkbtbl[bucket].lock);
> + lkb = __find_lkb(ls, lkid);
> + if (lkb)
> + kref_get(&lkb->lkb_ref);
> + read_unlock(&ls->ls_lkbtbl[bucket].lock);
> +
> + *lkb_ret = lkb;
> + return lkb ? 0 : -ENOENT;
> +}
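A side note on the lock id layout used by create_lkb() and find_lkb() above
(an illustrative sketch, not part of the patch): the 32-bit lkid packs the
hash bucket into the low 16 bits and the per-bucket counter into the high
16 bits, so for a hypothetical bucket 0x0007 and counter 0x0001:

    lkid    = bucket | (counter << 16);   /* 0x00010007, create_lkb() */
    bucket  = lkid & 0xFFFF;              /* 0x0007, find_lkb() */
    counter = lkid >> 16;                 /* 0x0001 */

This is also why find_lkb() can reject a bad lkid cheaply by checking that
the low 16 bits are a valid bucket index before taking the bucket lock.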
> +
> +static void kill_lkb(struct kref *kref)
> +{
> + struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);
> +
> + /* All work is done after the return from kref_put() so we
> + can release the write_lock before the detach_lkb */
> +
> + DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
> +}
> +
> +static int put_lkb(struct dlm_lkb *lkb)
> +{
> + struct dlm_ls *ls = lkb->lkb_resource->res_ls;
> + uint16_t bucket = lkb->lkb_id & 0xFFFF;
> +
> + write_lock(&ls->ls_lkbtbl[bucket].lock);
> + if (kref_put(&lkb->lkb_ref, kill_lkb)) {
> + list_del(&lkb->lkb_idtbl_list);
> + write_unlock(&ls->ls_lkbtbl[bucket].lock);
> +
> + detach_lkb(lkb);
> +
> + /* for local/process lkbs, lvbptr points to caller's lksb */
> + if (lkb->lkb_lvbptr && is_master_copy(lkb))
> + free_lvb(lkb->lkb_lvbptr);
> + if (lkb->lkb_range)
> + free_range(lkb->lkb_range);
> + free_lkb(lkb);
> + return 1;
> + } else {
> + write_unlock(&ls->ls_lkbtbl[bucket].lock);
> + return 0;
> + }
> +}
> +
> +int dlm_put_lkb(struct dlm_lkb *lkb)
> +{
> + return put_lkb(lkb);
> +}
> +
> +/* This is only called to add a reference when the code already holds
> + a valid reference to the lkb, so there's no need for locking. */
> +
> +static void hold_lkb(struct dlm_lkb *lkb)
> +{
> + kref_get(&lkb->lkb_ref);
> +}
> +
> +/* This is called when we need to remove a reference and are certain
> + it's not the last ref. e.g. del_lkb is always called between a
> + find_lkb/put_lkb and is always the inverse of a previous add_lkb.
> + put_lkb would work fine, but would involve unnecessary locking */
> +
> +static void unhold_lkb(struct dlm_lkb *lkb)
> +{
> + int rv;
> + rv = kref_put(&lkb->lkb_ref, kill_lkb);
> + DLM_ASSERT(!rv, dlm_print_lkb(lkb););
> +}
> +
> +static void lkb_add_ordered(struct list_head *new, struct list_head *head,
> + int mode)
> +{
> + struct dlm_lkb *lkb = NULL;
> +
> + list_for_each_entry(lkb, head, lkb_statequeue)
> + if (lkb->lkb_rqmode < mode)
> + break;
> +
> + if (!lkb)
> + list_add_tail(new, head);
> + else
> + __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
> +}
> +
> +/* add/remove lkb to rsb's grant/convert/wait queue */
> +
> +static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
> +{
> + kref_get(&lkb->lkb_ref);
> +
> + DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
> +
> + lkb->lkb_status = status;
> +
> + switch (status) {
> + case DLM_LKSTS_WAITING:
> + if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
> + list_add(&lkb->lkb_statequeue, &r->res_waitqueue);
> + else
> + list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
> + break;
> + case DLM_LKSTS_GRANTED:
> + /* convention says granted locks kept in order of grmode */
> + lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
> + lkb->lkb_grmode);
> + break;
> + case DLM_LKSTS_CONVERT:
> + if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
> + list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
> + else
> + list_add_tail(&lkb->lkb_statequeue,
> + &r->res_convertqueue);
> + break;
> + default:
> + DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status););
> + }
> +}
> +
> +static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + lkb->lkb_status = 0;
> + list_del(&lkb->lkb_statequeue);
> + unhold_lkb(lkb);
> +}
> +
> +static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
> +{
> + hold_lkb(lkb);
> + del_lkb(r, lkb);
> + add_lkb(r, lkb, sts);
> + unhold_lkb(lkb);
> +}
> +
> +/* add/remove lkb from global waiters list of lkb's waiting for
> + a reply from a remote node */
> +
> +static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
> +{
> + struct dlm_ls *ls = lkb->lkb_resource->res_ls;
> +
> + down(&ls->ls_waiters_sem);
> + if (lkb->lkb_wait_type) {
> + printk("add_to_waiters error %d", lkb->lkb_wait_type);
> + goto out;
> + }
> + lkb->lkb_wait_type = mstype;
> + kref_get(&lkb->lkb_ref);
> + list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
> + out:
> + up(&ls->ls_waiters_sem);
> +}
> +
> +static int _remove_from_waiters(struct dlm_lkb *lkb)
> +{
> + int error = 0;
> +
> + if (!lkb->lkb_wait_type) {
> + printk("remove_from_waiters error");
> + error = -EINVAL;
> + goto out;
> + }
> + lkb->lkb_wait_type = 0;
> + list_del(&lkb->lkb_wait_reply);
> + unhold_lkb(lkb);
> + out:
> + return error;
> +}
> +
> +static int remove_from_waiters(struct dlm_lkb *lkb)
> +{
> + struct dlm_ls *ls = lkb->lkb_resource->res_ls;
> + int error;
> +
> + down(&ls->ls_waiters_sem);
> + error = _remove_from_waiters(lkb);
> + up(&ls->ls_waiters_sem);
> + return error;
> +}
> +
> +int dlm_remove_from_waiters(struct dlm_lkb *lkb)
> +{
> + return remove_from_waiters(lkb);
> +}
> +
> +static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
> + int namelen, uint32_t parent_lkid, void *ast,
> + void *astarg, void *bast, struct dlm_range *range,
> + struct dlm_args *args)
> +{
> + int rv = -EINVAL;
> +
> + /* check for invalid arg usage */
> +
> + if (mode < 0 || mode > DLM_LOCK_EX)
> + goto out;
> +
> + if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
> + goto out;
> +
> + if (flags & DLM_LKF_CANCEL)
> + goto out;
> +
> + if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
> + goto out;
> +
> + if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
> + goto out;
> +
> + if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
> + goto out;
> +
> + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
> + goto out;
> +
> + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
> + goto out;
> +
> + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
> + goto out;
> +
> + if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL)
> + goto out;
> +
> + if (!ast || !lksb)
> + goto out;
> +
> + if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr)
> + goto out;
> +
> + /* parent/child locks not yet supported */
> + if (parent_lkid)
> + goto out;
> +
> + if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid)
> + goto out;
> +
> + /* these args will be copied to the lkb in validate_lock_args;
> + they cannot be copied now because, when converting locks, fields in
> + an active lkb cannot be modified before the rsb is locked */
> +
> + args->flags = flags;
> + args->astaddr = ast;
> + args->astparam = (long) astarg;
> + args->bastaddr = bast;
> + args->mode = mode;
> + args->lksb = lksb;
> + args->range = range;
> + rv = 0;
> + out:
> + return rv;
> +}
> +
> +static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
> +{
> + if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK))
> + return -EINVAL;
> +
> + args->flags = flags;
> + args->astparam = (long) astarg;
> + return 0;
> +}
> +
> +/*
> + * Two stage 1 varieties: dlm_lock() and dlm_unlock()
> + */
> +
> +int dlm_lock(dlm_lockspace_t *lockspace,
> + int mode,
> + struct dlm_lksb *lksb,
> + uint32_t flags,
> + void *name,
> + unsigned int namelen,
> + uint32_t parent_lkid,
> + void (*ast) (void *astarg),
> + void *astarg,
> + void (*bast) (void *astarg, int mode),
> + struct dlm_range *range)
> +{
> + struct dlm_ls *ls;
> + struct dlm_lkb *lkb;
> + struct dlm_args args;
> + int error, convert = flags & DLM_LKF_CONVERT;
> +
> + ls = dlm_find_lockspace_local(lockspace);
> + if (!ls)
> + return -EINVAL;
> +
> + lock_recovery(ls);
> +
> + if (convert)
> + error = find_lkb(ls, lksb->sb_lkid, &lkb);
> + else
> + error = create_lkb(ls, &lkb);
> +
> + if (error)
> + goto out;
> +
> + error = set_lock_args(mode, lksb, flags, namelen, parent_lkid, ast,
> + astarg, bast, range, &args);
> + if (error)
> + goto out_put;
> +
> + if (convert)
> + error = convert_lock(ls, lkb, &args);
> + else
> + error = request_lock(ls, lkb, name, namelen, &args);
> +
> + if (error == -EINPROGRESS)
> + error = 0;
> + out_put:
> + if (convert || error)
> + put_lkb(lkb);
> + if (error == -EAGAIN)
> + error = 0;
> + out:
> + unlock_recovery(ls);
> + dlm_put_lockspace(ls);
> + return error;
> +}
> +
> +int dlm_unlock(dlm_lockspace_t *lockspace,
> + uint32_t lkid,
> + uint32_t flags,
> + struct dlm_lksb *lksb,
> + void *astarg)
> +{
> + struct dlm_ls *ls;
> + struct dlm_lkb *lkb;
> + struct dlm_args args;
> + int error;
> +
> + ls = dlm_find_lockspace_local(lockspace);
> + if (!ls)
> + return -EINVAL;
> +
> + lock_recovery(ls);
> +
> + error = find_lkb(ls, lkid, &lkb);
> + if (error)
> + goto out;
> +
> + error = set_unlock_args(flags, astarg, &args);
> + if (error)
> + goto out_put;
> +
> + if (flags & DLM_LKF_CANCEL)
> + error = cancel_lock(ls, lkb, &args);
> + else
> + error = unlock_lock(ls, lkb, &args);
> +
> + if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
> + error = 0;
> + out_put:
> + put_lkb(lkb);
> + out:
> + unlock_recovery(ls);
> + dlm_put_lockspace(ls);
> + return error;
> +}
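For readers coming to this API cold, a minimal caller of the two entry
points above might look roughly like this (an illustrative sketch only;
ls, my_ast() and my_bast() are hypothetical and error handling is omitted):

    struct dlm_lksb lksb;
    int error;

    memset(&lksb, 0, sizeof(lksb));

    /* acquire "myres" in exclusive mode; completion arrives via my_ast() */
    error = dlm_lock(ls, DLM_LOCK_EX, &lksb, 0, "myres", 5, 0,
                     my_ast, &lksb, my_bast, NULL);

    /* later, release it using the lock id returned in lksb.sb_lkid */
    error = dlm_unlock(ls, lksb.sb_lkid, 0, &lksb, &lksb);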
> +
> +/* set_master(r, lkb) -- set the master nodeid of a resource
> +
> + The purpose of this function is to set the nodeid field in the given
> + lkb using the nodeid field in the given rsb. If the rsb's nodeid is
> + known, it can just be copied to the lkb and the function will return
> + 0. If the rsb's nodeid is _not_ known, it needs to be looked up
> + before it can be copied to the lkb.
> +
> + When the rsb nodeid is being looked up remotely, the initial lkb
> + causing the lookup is kept on the ls_waiters list waiting for the
> + lookup reply. Other lkb's waiting for the same rsb lookup are kept
> + on the rsb's res_lookup list until the master is verified.
> +
> + After a remote lookup or when a tossed rsb is retrieved that specifies
> + a remote master, that master value is uncertain -- it may have changed
> + by the time we send it a request. While it's uncertain, only one lkb
> + is allowed to go ahead and use the master value; that lkb is specified
> + by res_trial_lkid. Once the trial lkb is queued on the master node
> + we know the rsb master is correct and any other lkbs on res_lookup
> + can get the rsb nodeid and go ahead with their request.
> +
> + Return values:
> + 0: nodeid is set in rsb/lkb and the caller should go ahead and use it
> + 1: the rsb master is not available and the lkb has been placed on
> + a wait queue
> + -EXXX: there was some error in processing
> +*/
> +
> +static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + struct dlm_ls *ls = r->res_ls;
> + int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
> +
> + if (test_and_clear_bit(RESFL_MASTER_UNCERTAIN, &r->res_flags)) {
> + set_bit(RESFL_MASTER_WAIT, &r->res_flags);
> + r->res_trial_lkid = lkb->lkb_id;
> + lkb->lkb_nodeid = r->res_nodeid;
> + return 0;
> + }
> +
> + if (r->res_nodeid == 0) {
> + lkb->lkb_nodeid = 0;
> + return 0;
> + }
> +
> + if (r->res_trial_lkid == lkb->lkb_id) {
> + DLM_ASSERT(lkb->lkb_id, dlm_print_lkb(lkb););
> + lkb->lkb_nodeid = r->res_nodeid;
> + return 0;
> + }
> +
> + if (test_bit(RESFL_MASTER_WAIT, &r->res_flags)) {
> + list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup);
> + return 1;
> + }
> +
> + if (r->res_nodeid > 0) {
> + lkb->lkb_nodeid = r->res_nodeid;
> + return 0;
> + }
> +
> + /* This is the first lkb requested on this rsb since the rsb
> + was created. We need to figure out who the rsb master is. */
> +
> + DLM_ASSERT(r->res_nodeid == -1, );
> +
> + dir_nodeid = dlm_dir_nodeid(r);
> +
> + if (dir_nodeid != our_nodeid) {
> + set_bit(RESFL_MASTER_WAIT, &r->res_flags);
> + send_lookup(r, lkb);
> + return 1;
> + }
> +
> + for (;;) {
> + /* It's possible for dlm_scand to remove an old rsb for
> + this same resource from the toss list, us to create
> + a new one, look up the master locally, and find it
> + already exists just before dlm_scand does the
> + dir_remove() on the previous rsb. */
> +
> + error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
> + r->res_length, &ret_nodeid);
> + if (!error)
> + break;
> + log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
> + schedule();
> + }
> +
> + if (ret_nodeid == our_nodeid) {
> + r->res_nodeid = 0;
> + lkb->lkb_nodeid = 0;
> + return 0;
> + }
> +
> + set_bit(RESFL_MASTER_WAIT, &r->res_flags);
> + r->res_trial_lkid = lkb->lkb_id;
> + r->res_nodeid = ret_nodeid;
> + lkb->lkb_nodeid = ret_nodeid;
> + return 0;
> +}
> +
> +/* confirm_master -- confirm (or deny) an rsb's master nodeid
> +
> + This is called when we get a request reply from a remote node
> + who we believe is the master. The return value (error) we got
> + back indicates whether it's really the master or not. If it
> + wasn't, we need to start over and do another master lookup. If
> + it was and our lock was queued, we know the master won't change.
> + If it was and our lock wasn't queued, we need to do another
> + trial with the next lkb.
> +*/
> +
> +static void confirm_master(struct dlm_rsb *r, int error)
> +{
> + struct dlm_lkb *lkb, *safe;
> +
> + if (!test_bit(RESFL_MASTER_WAIT, &r->res_flags))
> + return;
> +
> + switch (error) {
> + case 0:
> + case -EINPROGRESS:
> + /* the remote master queued our request, or
> + the remote dir node told us we're the master */
> +
> + clear_bit(RESFL_MASTER_WAIT, &r->res_flags);
> + r->res_trial_lkid = 0;
> +
> + list_for_each_entry_safe(lkb, safe, &r->res_lookup,
> + lkb_rsb_lookup) {
> + list_del(&lkb->lkb_rsb_lookup);
> + _request_lock(r, lkb);
> + schedule();
> + }
> + break;
> +
> + case -EAGAIN:
> + /* the remote master didn't queue our NOQUEUE request;
> + do another trial with the next waiting lkb */
> +
> + if (!list_empty(&r->res_lookup)) {
> + lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
> + lkb_rsb_lookup);
> + list_del(&lkb->lkb_rsb_lookup);
> + r->res_trial_lkid = lkb->lkb_id;
> + _request_lock(r, lkb);
> + break;
> + }
> + /* fall through so the rsb looks new */
> +
> + case -ENOENT:
> + case -ENOTBLK:
> + /* the remote master wasn't really the master, i.e. our
> + trial failed; so we start over with another lookup */
> +
> + r->res_nodeid = -1;
> + r->res_trial_lkid = 0;
> + clear_bit(RESFL_MASTER_WAIT, &r->res_flags);
> + break;
> +
> + default:
> + log_error(r->res_ls, "confirm_master unknown error %d", error);
> + }
> +}
> +
> +int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
> + struct dlm_args *args)
> +{
> + int rv = -EINVAL;
> +
> + if (args->flags & DLM_LKF_CONVERT) {
> + if (lkb->lkb_flags & DLM_IFL_MSTCPY)
> + goto out;
> +
> + if (args->flags & DLM_LKF_QUECVT &&
> + !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
> + goto out;
> +
> + rv = -EBUSY;
> + if (lkb->lkb_status != DLM_LKSTS_GRANTED)
> + goto out;
> + }
> +
> + lkb->lkb_exflags = args->flags;
> + lkb->lkb_sbflags = 0;
> + lkb->lkb_astaddr = args->astaddr;
> + lkb->lkb_astparam = args->astparam;
> + lkb->lkb_bastaddr = args->bastaddr;
> + lkb->lkb_rqmode = args->mode;
> + lkb->lkb_lksb = args->lksb;
> + lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
> + lkb->lkb_ownpid = (int) current->pid;
> +
> + rv = 0;
> + if (!args->range)
> + goto out;
> +
> + if (!lkb->lkb_range) {
> + rv = -ENOMEM;
> + lkb->lkb_range = allocate_range(ls);
> + if (!lkb->lkb_range)
> + goto out;
> + /* This is needed for conversions that contain ranges
> + where the original lock didn't but it's harmless for
> + new locks too. */
> + lkb->lkb_range[GR_RANGE_START] = 0LL;
> + lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL;
> + }
> +
> + lkb->lkb_range[RQ_RANGE_START] = args->range->ra_start;
> + lkb->lkb_range[RQ_RANGE_END] = args->range->ra_end;
> + lkb->lkb_flags |= DLM_IFL_RANGE;
> + rv = 0;
> + out:
> + return rv;
> +}
> +
> +int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
> +{
> + int rv = -EINVAL;
> +
> + if (lkb->lkb_flags & DLM_IFL_MSTCPY)
> + goto out;
> +
> + if (args->flags & DLM_LKF_CANCEL &&
> + lkb->lkb_status == DLM_LKSTS_GRANTED)
> + goto out;
> +
> + if (!(args->flags & DLM_LKF_CANCEL) &&
> + lkb->lkb_status != DLM_LKSTS_GRANTED)
> + goto out;
> +
> + rv = -EBUSY;
> + if (lkb->lkb_wait_type)
> + goto out;
> +
> + lkb->lkb_exflags = args->flags;
> + lkb->lkb_sbflags = 0;
> + lkb->lkb_astparam = args->astparam;
> + rv = 0;
> + out:
> + return rv;
> +}
> +
> +/*
> + * Four stage 2 varieties:
> + * request_lock(), convert_lock(), unlock_lock(), cancel_lock()
> + */
> +
> +static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
> + int len, struct dlm_args *args)
> +{
> + struct dlm_rsb *r;
> + int error;
> +
> + error = validate_lock_args(ls, lkb, args);
> + if (error)
> + goto out;
> +
> + error = find_rsb(ls, name, len, R_CREATE, &r);
> + if (error)
> + goto out;
> +
> + lock_rsb(r);
> +
> + attach_lkb(r, lkb);
> + error = _request_lock(r, lkb);
> +
> + unlock_rsb(r);
> + put_rsb(r);
> +
> + lkb->lkb_lksb->sb_lkid = lkb->lkb_id;
> + out:
> + return error;
> +}
> +
> +static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
> + struct dlm_args *args)
> +{
> + struct dlm_rsb *r;
> + int error;
> +
> + r = lkb->lkb_resource;
> +
> + hold_rsb(r);
> + lock_rsb(r);
> +
> + error = validate_lock_args(ls, lkb, args);
> + if (error)
> + goto out;
> +
> + error = _convert_lock(r, lkb);
> + out:
> + unlock_rsb(r);
> + put_rsb(r);
> + return error;
> +}
> +
> +static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
> + struct dlm_args *args)
> +{
> + struct dlm_rsb *r;
> + int error;
> +
> + r = lkb->lkb_resource;
> +
> + hold_rsb(r);
> + lock_rsb(r);
> +
> + error = validate_unlock_args(lkb, args);
> + if (error)
> + goto out;
> +
> + error = _unlock_lock(r, lkb);
> + out:
> + unlock_rsb(r);
> + put_rsb(r);
> + return error;
> +}
> +
> +static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
> + struct dlm_args *args)
> +{
> + struct dlm_rsb *r;
> + int error;
> +
> + r = lkb->lkb_resource;
> +
> + hold_rsb(r);
> + lock_rsb(r);
> +
> + error = validate_unlock_args(lkb, args);
> + if (error)
> + goto out;
> +
> + error = _cancel_lock(r, lkb);
> + out:
> + unlock_rsb(r);
> + put_rsb(r);
> + return error;
> +}
> +
> +/*
> + * Four stage 3 varieties:
> + * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
> + */
> +
> +/* add a new lkb to a possibly new rsb, called by requesting process */
> +
> +static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + int error;
> +
> + /* set_master: sets lkb nodeid from r */
> +
> + error = set_master(r, lkb);
> + if (error < 0)
> + goto out;
> + if (error) {
> + error = 0;
> + goto out;
> + }
> +
> + if (is_remote(r))
> + /* receive_request() calls do_request() on remote node */
> + error = send_request(r, lkb);
> + else
> + error = do_request(r, lkb);
> + out:
> + return error;
> +}
> +
> +/* change some property of an existing lkb, e.g. mode, range */
> +
> +static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + int error;
> +
> + if (is_remote(r))
> + /* receive_convert() calls do_convert() on remote node */
> + error = send_convert(r, lkb);
> + else
> + error = do_convert(r, lkb);
> +
> + return error;
> +}
> +
> +/* remove an existing lkb from the granted queue */
> +
> +static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + int error;
> +
> + if (is_remote(r))
> + /* receive_unlock() calls do_unlock() on remote node */
> + error = send_unlock(r, lkb);
> + else
> + error = do_unlock(r, lkb);
> +
> + return error;
> +}
> +
> +/* remove an existing lkb from the convert or wait queue */
> +
> +static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + int error;
> +
> + if (is_remote(r))
> + /* receive_cancel() calls do_cancel() on remote node */
> + error = send_cancel(r, lkb);
> + else
> + error = do_cancel(r, lkb);
> +
> + return error;
> +}
> +
> +/* lkb is master or local copy */
> +
> +static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + int b;
> +
> + /* b=1 lvb returned to caller
> + b=0 lvb written to rsb or invalidated
> + b=-1 do nothing */
> +
> + b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
> +
> + if (b == 1) {
> + if (!lkb->lkb_lvbptr)
> + return;
> +
> + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
> + return;
> +
> + if (!r->res_lvbptr)
> + return;
> +
> + memcpy(lkb->lkb_lvbptr, r->res_lvbptr, DLM_LVB_LEN);
> + lkb->lkb_lvbseq = r->res_lvbseq;
> +
> + } else if (b == 0) {
> + if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
> + set_bit(RESFL_VALNOTVALID, &r->res_flags);
> + return;
> + }
> +
> + if (!lkb->lkb_lvbptr)
> + return;
> +
> + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
> + return;
> +
> + if (!r->res_lvbptr)
> + r->res_lvbptr = allocate_lvb(r->res_ls);
> +
> + if (!r->res_lvbptr)
> + return;
> +
> + memcpy(r->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
> + r->res_lvbseq++;
> + lkb->lkb_lvbseq = r->res_lvbseq;
> + clear_bit(RESFL_VALNOTVALID, &r->res_flags);
> + }
> +
> + if (test_bit(RESFL_VALNOTVALID, &r->res_flags))
> + lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID;
> +}
> +
> +static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + if (lkb->lkb_grmode < DLM_LOCK_PW)
> + return;
> +
> + if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
> + set_bit(RESFL_VALNOTVALID, &r->res_flags);
> + return;
> + }
> +
> + if (!lkb->lkb_lvbptr)
> + return;
> +
> + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
> + return;
> +
> + if (!r->res_lvbptr)
> + r->res_lvbptr = allocate_lvb(r->res_ls);
> +
> + if (!r->res_lvbptr)
> + return;
> +
> + memcpy(r->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
> + r->res_lvbseq++;
> + clear_bit(RESFL_VALNOTVALID, &r->res_flags);
> +}
> +
> +/* lkb is process copy (pc) */
> +
> +static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
> + struct dlm_message *ms)
> +{
> + int b;
> +
> + if (!lkb->lkb_lvbptr)
> + return;
> +
> + if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
> + return;
> +
> + b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
> + if (b == 1) {
> + memcpy(lkb->lkb_lvbptr, ms->m_lvb, DLM_LVB_LEN);
> + lkb->lkb_lvbseq = ms->m_lvbseq;
> + }
> +}
> +
> +/* Manipulate lkb's on rsb's convert/granted/waiting queues
> + remove_lock -- used for unlock, removes lkb from granted
> + revert_lock -- used for cancel, moves lkb from convert to granted
> + grant_lock -- used for request and convert, adds lkb to granted or
> + moves lkb from convert or waiting to granted
> +
> + Each of these is used for master or local copy lkb's. There is
> + also a _pc() variation used to make the corresponding change on
> + a process copy (pc) lkb. */
> +
> +static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + del_lkb(r, lkb);
> + lkb->lkb_grmode = DLM_LOCK_IV;
> + /* this unhold undoes the original ref from create_lkb()
> + so this leads to the lkb being freed */
> + unhold_lkb(lkb);
> +}
> +
> +static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + set_lvb_unlock(r, lkb);
> + _remove_lock(r, lkb);
> +}
> +
> +static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + _remove_lock(r, lkb);
> +}
> +
> +static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + lkb->lkb_rqmode = DLM_LOCK_IV;
> +
> + switch (lkb->lkb_status) {
> + case DLM_LKSTS_CONVERT:
> + move_lkb(r, lkb, DLM_LKSTS_GRANTED);
> + break;
> + case DLM_LKSTS_WAITING:
> + del_lkb(r, lkb);
> + lkb->lkb_grmode = DLM_LOCK_IV;
> + /* this unhold undoes the original ref from create_lkb()
> + so this leads to the lkb being freed */
> + unhold_lkb(lkb);
> + break;
> + default:
> + log_print("invalid status for revert %d", lkb->lkb_status);
> + }
> +}
> +
> +static void revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + revert_lock(r, lkb);
> +}
> +
> +static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + if (lkb->lkb_grmode != lkb->lkb_rqmode) {
> + lkb->lkb_grmode = lkb->lkb_rqmode;
> + if (lkb->lkb_status)
> + move_lkb(r, lkb, DLM_LKSTS_GRANTED);
> + else
> + add_lkb(r, lkb, DLM_LKSTS_GRANTED);
> + }
> +
> + lkb->lkb_rqmode = DLM_LOCK_IV;
> +
> + if (lkb->lkb_range) {
> + lkb->lkb_range[GR_RANGE_START] = lkb->lkb_range[RQ_RANGE_START];
> + lkb->lkb_range[GR_RANGE_END] = lkb->lkb_range[RQ_RANGE_END];
> + }
> +}
> +
> +static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + set_lvb_lock(r, lkb);
> + _grant_lock(r, lkb);
> + lkb->lkb_highbast = 0;
> +}
> +
> +static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
> + struct dlm_message *ms)
> +{
> + set_lvb_lock_pc(r, lkb, ms);
> + _grant_lock(r, lkb);
> +}
> +
> +/* called by grant_pending_locks() which means an async grant message must
> + be sent to the requesting node in addition to granting the lock if the
> + lkb belongs to a remote node. */
> +
> +static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + grant_lock(r, lkb);
> + if (is_master_copy(lkb))
> + send_grant(r, lkb);
> + else
> + queue_cast(r, lkb, 0);
> +}
> +
> +static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
> +{
> + struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
> + lkb_statequeue);
> + if (lkb->lkb_id == first->lkb_id)
> + return TRUE;
> +
> + return FALSE;
> +}
> +
> +/*
> + * Return 1 if the locks' ranges overlap
> + * If the lkb has no range then it is assumed to cover 0-ffffffff.ffffffff
> + */
> +
> +static inline int ranges_overlap(struct dlm_lkb *lkb1, struct dlm_lkb *lkb2)
> +{
> + if (!lkb1->lkb_range || !lkb2->lkb_range)
> + return TRUE;
> +
> + if (lkb1->lkb_range[RQ_RANGE_END] < lkb2->lkb_range[GR_RANGE_START] ||
> + lkb1->lkb_range[RQ_RANGE_START] > lkb2->lkb_range[GR_RANGE_END])
> + return FALSE;
> +
> + return TRUE;
> +}
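A quick worked example of the overlap test above, with hypothetical values
(lkb1 is the requesting lock, lkb2 the granted one):

    lkb1 RQ range 100..199  vs  lkb2 GR range 0..99  ->  no overlap (FALSE)
    lkb1 RQ range  50..150  vs  lkb2 GR range 0..99  ->  overlap    (TRUE)

and an lkb created without a range behaves as if it covered the whole
0..0xffffffffffffffff space, so it overlaps everything.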
> +
> +/*
> + * Check if the given lkb conflicts with another lkb on the queue.
> + */
> +
> +static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
> +{
> + struct dlm_lkb *this;
> +
> + list_for_each_entry(this, head, lkb_statequeue) {
> + if (this == lkb)
> + continue;
> + if (ranges_overlap(lkb, this) && !modes_compat(this, lkb))
> + return TRUE;
> + }
> + return FALSE;
> +}
> +
> +/*
> + * "A conversion deadlock arises with a pair of lock requests in the converting
> + * queue for one resource. The granted mode of each lock blocks the requested
> + * mode of the other lock."
> + *
> + * Part 2: if the granted mode of lkb is preventing the first lkb in the
> + * convert queue from being granted, then demote lkb (set grmode to NL).
> + * This second form requires that we check for conv-deadlk even when
> + * now == 0 in _can_be_granted().
> + *
> + * Example:
> + * Granted Queue: empty
> + * Convert Queue: NL->EX (first lock)
> + * PR->EX (second lock)
> + *
> + * The first lock can't be granted because of the granted mode of the second
> + * lock and the second lock can't be granted because it's not first in the
> + * list. We demote the granted mode of the second lock (the lkb passed to this
> + * function).
> + *
> + * After the resolution, the "grant pending" function needs to go back and try
> + * to grant locks on the convert queue again since the first lock can now be
> + * granted.
> + */
> +
> +static int conversion_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
> +{
> + struct dlm_lkb *this, *first = NULL, *self = NULL;
> +
> + list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
> + if (!first)
> + first = this;
> + if (this == lkb) {
> + self = lkb;
> + continue;
> + }
> +
> + if (!ranges_overlap(lkb, this))
> + continue;
> +
> + if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
> + return TRUE;
> + }
> +
> + /* if lkb is on the convert queue and is preventing the first
> + from being granted, then there's deadlock and we demote lkb.
> + multiple converting locks may need to do this before the first
> + converting lock can be granted. */
> +
> + if (self && self != first) {
> + if (!modes_compat(lkb, first) &&
> + !queue_conflict(&rsb->res_grantqueue, first))
> + return TRUE;
> + }
> +
> + return FALSE;
> +}
> +
> +/*
> + * Return 1 if the lock can be granted, 0 otherwise.
> + * Also detect and resolve conversion deadlocks.
> + *
> + * lkb is the lock to be granted
> + *
> + * now is 1 if the function is being called in the context of the
> + * immediate request, it is 0 if called later, after the lock has been
> + * queued.
> + *
> + * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
> + */
> +
> +static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
> +{
> + int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
> +
> + /*
> + * 6-10: Version 5.4 introduced an option to address the phenomenon of
> + * a new request for a NL mode lock being blocked.
> + *
> + * 6-11: If the optional EXPEDITE flag is used with the new NL mode
> + * request, then it would be granted. In essence, the use of this flag
> + * tells the Lock Manager to expedite this request by not considering
> + * what may be in the CONVERTING or WAITING queues... As of this
> + * writing, the EXPEDITE flag can be used only with new requests for NL
> + * mode locks. This flag is not valid for conversion requests.
> + *
> + * A shortcut. Earlier checks return an error if EXPEDITE is used in a
> + * conversion or used with a non-NL requested mode. We also know an
> + * EXPEDITE request is always granted immediately, so now must always
> + * be 1. The full condition to grant an expedite request: (now &&
> + * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
> + * therefore be shortened to just checking the flag.
> + */
> +
> + if (lkb->lkb_exflags & DLM_LKF_EXPEDITE)
> + return TRUE;
> +
> + /*
> + * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
> + * added to the remaining conditions.
> + */
> +
> + if (queue_conflict(&r->res_grantqueue, lkb))
> + goto out;
> +
> + /*
> + * 6-3: By default, a conversion request is immediately granted if the
> + * requested mode is compatible with the modes of all other granted
> + * locks
> + */
> +
> + if (queue_conflict(&r->res_convertqueue, lkb))
> + goto out;
> +
> + /*
> + * 6-5: But the default algorithm for deciding whether to grant or
> + * queue conversion requests does not by itself guarantee that such
> + * requests are serviced on a "first come first serve" basis. This, in
> + * turn, can lead to a phenomenon known as "indefinite postponement".
> + *
> + * 6-7: This issue is dealt with by using the optional QUECVT flag with
> + * the system service employed to request a lock conversion. This flag
> + * forces certain conversion requests to be queued, even if they are
> + * compatible with the granted modes of other locks on the same
> + * resource. Thus, the use of this flag results in conversion requests
> + * being ordered on a "first come first serve" basis.
> + *
> + * DCT: This condition is all about new conversions being able to occur
> + * "in place" while the lock remains on the granted queue (assuming
> + * nothing else conflicts.) IOW if QUECVT isn't set, a conversion
> + * doesn't _have_ to go onto the convert queue where it's processed in
> + * order. The "now" variable is necessary to distinguish converts
> + * being received and processed for the first time now, because once a
> + * convert is moved to the conversion queue the condition below applies
> + * requiring fifo granting.
> + */
> +
> + if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
> + return TRUE;
> +
> + /*
> + * When using range locks the NOORDER flag is set to avoid the standard
> + * vms rules on grant order.
> + */
> +
> + if (lkb->lkb_exflags & DLM_LKF_NOORDER)
> + return TRUE;
> +
> + /*
> + * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
> + * granted until all other conversion requests ahead of it are granted
> + * and/or canceled.
> + */
> +
> + if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
> + return TRUE;
> +
> + /*
> + * 6-4: By default, a new request is immediately granted only if all
> + * three of the following conditions are satisfied when the request is
> + * issued:
> + * - The queue of ungranted conversion requests for the resource is
> + * empty.
> + * - The queue of ungranted new requests for the resource is empty.
> + * - The mode of the new request is compatible with the most
> + * restrictive mode of all granted locks on the resource.
> + */
> +
> + if (now && !conv && list_empty(&r->res_convertqueue) &&
> + list_empty(&r->res_waitqueue))
> + return TRUE;
> +
> + /*
> + * 6-4: Once a lock request is in the queue of ungranted new requests,
> + * it cannot be granted until the queue of ungranted conversion
> + * requests is empty, all ungranted new requests ahead of it are
> + * granted and/or canceled, and it is compatible with the granted mode
> + * of the most restrictive lock granted on the resource.
> + */
> +
> + if (!now && !conv && list_empty(&r->res_convertqueue) &&
> + first_in_list(lkb, &r->res_waitqueue))
> + return TRUE;
> +
> + out:
> + /*
> + * The following, enabled by CONVDEADLK, departs from VMS.
> + */
> +
> + if (conv && (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) &&
> + conversion_deadlock_detect(r, lkb)) {
> + lkb->lkb_grmode = DLM_LOCK_NL;
> + lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
> + }
> +
> + return FALSE;
> +}
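To make the grant rules above concrete, consider a hypothetical sequence on
one resource with no ranges and no special flags: an EX lock is granted to
lkb A, then a new PR request B arrives.  queue_conflict() against the grant
queue is true (PR and EX are incompatible), so B cannot be granted now;
do_request() queues it on the wait queue and returns -EINPROGRESS (or fails
with -EAGAIN if NOQUEUE was set).  When A later unlocks, grant_pending_locks()
runs with an empty convert queue and B first on the wait queue, so the
"!now" clause of rule 6-4 grants B and queues its completion ast.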
> +
> +/*
> + * The ALTPR and ALTCW flags aren't traditional lock manager flags, but are a
> + * simple way to provide a big optimization to applications that can use them.
> + */
> +
> +static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
> +{
> + uint32_t flags = lkb->lkb_exflags;
> + int rv;
> + int8_t alt = 0, rqmode = lkb->lkb_rqmode;
> +
> + rv = _can_be_granted(r, lkb, now);
> + if (rv)
> + goto out;
> +
> + if (lkb->lkb_sbflags & DLM_SBF_DEMOTED)
> + goto out;
> +
> + if (rqmode != DLM_LOCK_PR && flags & DLM_LKF_ALTPR)
> + alt = DLM_LOCK_PR;
> + else if (rqmode != DLM_LOCK_CW && flags & DLM_LKF_ALTCW)
> + alt = DLM_LOCK_CW;
> +
> + if (alt) {
> + lkb->lkb_rqmode = alt;
> + rv = _can_be_granted(r, lkb, now);
> + if (rv)
> + lkb->lkb_sbflags |= DLM_SBF_ALTMODE;
> + else
> + lkb->lkb_rqmode = rqmode;
> + }
> + out:
> + return rv;
> +}
> +
> +static int grant_pending_convert(struct dlm_rsb *r, int high)
> +{
> + struct dlm_lkb *lkb, *s;
> + int hi, demoted, quit, grant_restart, demote_restart;
> +
> + quit = 0;
> + restart:
> + grant_restart = 0;
> + demote_restart = 0;
> + hi = DLM_LOCK_IV;
> +
> + list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
> + demoted = is_demoted(lkb);
> + if (can_be_granted(r, lkb, FALSE)) {
> + grant_lock_pending(r, lkb);
> + grant_restart = 1;
> + } else {
> + hi = MAX(lkb->lkb_rqmode, hi);
> + if (!demoted && is_demoted(lkb))
> + demote_restart = 1;
> + }
> + }
> +
> + if (grant_restart)
> + goto restart;
> + if (demote_restart && !quit) {
> + quit = 1;
> + goto restart;
> + }
> +
> + return MAX(high, hi);
> +}
> +
> +static int grant_pending_wait(struct dlm_rsb *r, int high)
> +{
> + struct dlm_lkb *lkb, *s;
> +
> + list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
> + if (can_be_granted(r, lkb, FALSE))
> + grant_lock_pending(r, lkb);
> + else
> + high = MAX(lkb->lkb_rqmode, high);
> + }
> +
> + return high;
> +}
> +
> +static int grant_pending_locks(struct dlm_rsb *r)
> +{
> + struct dlm_lkb *lkb, *s;
> + int high = DLM_LOCK_IV;
> +
> + DLM_ASSERT(is_master(r), dlm_print_rsb(r););
> +
> + high = grant_pending_convert(r, high);
> + high = grant_pending_wait(r, high);
> +
> + if (high == DLM_LOCK_IV)
> + return 0;
> +
> + /*
> + * If there are locks left on the wait/convert queue then send blocking
> + * ASTs to granted locks based on the largest requested mode (high)
> + * found above. This can generate spurious blocking ASTs for range
> + * locks. FIXME: highbast < high comparison not valid for PR/CW.
> + */
> +
> + list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
> + if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) &&
> + !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
> + queue_bast(r, lkb, high);
> + lkb->lkb_highbast = high;
> + }
> + }
> +
> + return 0;
> +}
> +
> +static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
> + struct dlm_lkb *lkb)
> +{
> + struct dlm_lkb *gr;
> +
> + list_for_each_entry(gr, head, lkb_statequeue) {
> + if (gr->lkb_bastaddr &&
> + gr->lkb_highbast < lkb->lkb_rqmode &&
> + ranges_overlap(lkb, gr) && !modes_compat(gr, lkb)) {
> + queue_bast(r, gr, lkb->lkb_rqmode);
> + gr->lkb_highbast = lkb->lkb_rqmode;
> + }
> + }
> +}
> +
> +static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + send_bast_queue(r, &r->res_grantqueue, lkb);
> +}
> +
> +static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + send_bast_queue(r, &r->res_grantqueue, lkb);
> + send_bast_queue(r, &r->res_convertqueue, lkb);
> +}
> +
> +/*
> + * Four stage 4 varieties:
> + * do_request(), do_convert(), do_unlock(), do_cancel()
> + * These are called on the master node for the given lock and
> + * from the central locking logic.
> + */
> +
> +static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + int error = 0;
> +
> + if (can_be_granted(r, lkb, TRUE)) {
> + grant_lock(r, lkb);
> + queue_cast(r, lkb, 0);
> + goto out;
> + }
> +
> + if (can_be_queued(lkb)) {
> + error = -EINPROGRESS;
> + add_lkb(r, lkb, DLM_LKSTS_WAITING);
> + send_blocking_asts(r, lkb);
> + goto out;
> + }
> +
> + error = -EAGAIN;
> + if (force_blocking_asts(lkb))
> + send_blocking_asts_all(r, lkb);
> + queue_cast(r, lkb, -EAGAIN);
> +
> + out:
> + return error;
> +}
> +
> +static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + int error = 0;
> +
> + /* changing an existing lock may allow others to be granted */
> +
> + if (can_be_granted(r, lkb, TRUE)) {
> + grant_lock(r, lkb);
> + queue_cast(r, lkb, 0);
> + grant_pending_locks(r);
> + goto out;
> + }
> +
> + if (can_be_queued(lkb)) {
> + if (is_demoted(lkb))
> + grant_pending_locks(r);
> + error = -EINPROGRESS;
> + del_lkb(r, lkb);
> + add_lkb(r, lkb, DLM_LKSTS_CONVERT);
> + send_blocking_asts(r, lkb);
> + goto out;
> + }
> +
> + error = -EAGAIN;
> + if (force_blocking_asts(lkb))
> + send_blocking_asts_all(r, lkb);
> + queue_cast(r, lkb, -EAGAIN);
> +
> + out:
> + return error;
> +}
> +
> +static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + remove_lock(r, lkb);
> + queue_cast(r, lkb, -DLM_EUNLOCK);
> + grant_pending_locks(r);
> + return -DLM_EUNLOCK;
> +}
> +
> +static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + revert_lock(r, lkb);
> + queue_cast(r, lkb, -DLM_ECANCEL);
> + grant_pending_locks(r);
> + return -DLM_ECANCEL;
> +}
> +
> +/*
> + * send/receive routines for remote operations and replies
> + *
> + * send_args
> + * send_common
> + * send_request receive_request
> + * send_convert receive_convert
> + * send_unlock receive_unlock
> + * send_cancel receive_cancel
> + * send_grant receive_grant
> + * send_bast receive_bast
> + * send_lookup receive_lookup
> + * send_remove receive_remove
> + *
> + * send_common_reply
> + * receive_request_reply send_request_reply
> + * receive_convert_reply send_convert_reply
> + * receive_unlock_reply send_unlock_reply
> + * receive_cancel_reply send_cancel_reply
> + * receive_lookup_reply send_lookup_reply
> + */
> +
> +static int create_message(struct dlm_rsb *r, int to_nodeid, int mstype,
> + struct dlm_message **ms_ret, struct dlm_mhandle **mh_ret)
> +{
> + struct dlm_message *ms;
> + struct dlm_mhandle *mh;
> + char *mb;
> + int mb_len = sizeof(struct dlm_message);
> +
> + if (mstype == DLM_MSG_REQUEST ||
> + mstype == DLM_MSG_LOOKUP ||
> + mstype == DLM_MSG_REMOVE)
> + mb_len += r->res_length;
> +
> + /* get_buffer gives us a message handle (mh) that we need to
> + pass into lowcomms_commit and a message buffer (mb) that we
> + write our data into */
> +
> + mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
> + if (!mh)
> + return -ENOBUFS;
> +
> + memset(mb, 0, mb_len);
> +
> + ms = (struct dlm_message *) mb;
> +
> + ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
> + ms->m_header.h_lockspace = r->res_ls->ls_global_id;
> + ms->m_header.h_nodeid = dlm_our_nodeid();
> + ms->m_header.h_length = mb_len;
> + ms->m_header.h_cmd = DLM_MSG;
> +
> + ms->m_type = mstype;
> +
> + *mh_ret = mh;
> + *ms_ret = ms;
> + return 0;
> +}
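One detail of the message layout above that may be worth spelling out
(derived from the code; the length is hypothetical): for the name-carrying
message types the resource name is simply appended after the fixed struct,
so for a 9-byte resource name:

    h_length = sizeof(struct dlm_message) + 9
    m_name[] = the 9 name bytes, not nul-terminated

and the receiver recovers the length in receive_namelen() as
h_length - sizeof(struct dlm_message).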
> +
> +static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
> +{
> + dlm_message_out(ms);
> + dlm_lowcomms_commit_buffer(mh);
> + return 0;
> +}
> +
> +static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
> + struct dlm_message *ms)
> +{
> + ms->m_nodeid = lkb->lkb_nodeid;
> + ms->m_pid = lkb->lkb_ownpid;
> + ms->m_lkid = lkb->lkb_id;
> + ms->m_remid = lkb->lkb_remid;
> + ms->m_exflags = lkb->lkb_exflags;
> + ms->m_sbflags = lkb->lkb_sbflags;
> + ms->m_flags = lkb->lkb_flags;
> + ms->m_lvbseq = lkb->lkb_lvbseq;
> + ms->m_status = lkb->lkb_status;
> + ms->m_grmode = lkb->lkb_grmode;
> + ms->m_rqmode = lkb->lkb_rqmode;
> +
> + /* m_result and m_bastmode are set from function args,
> + not from lkb fields */
> +
> + if (lkb->lkb_bastaddr)
> + ms->m_asts |= AST_BAST;
> + if (lkb->lkb_astaddr)
> + ms->m_asts |= AST_COMP;
> +
> + if (lkb->lkb_range) {
> + ms->m_range[0] = lkb->lkb_range[RQ_RANGE_START];
> + ms->m_range[1] = lkb->lkb_range[RQ_RANGE_END];
> + }
> +
> + if (lkb->lkb_lvbptr)
> + memcpy(ms->m_lvb, lkb->lkb_lvbptr, DLM_LVB_LEN);
> +
> + if (ms->m_type == DLM_MSG_REQUEST || ms->m_type == DLM_MSG_LOOKUP)
> + memcpy(ms->m_name, r->res_name, r->res_length);
> +}
> +
> +static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
> +{
> + struct dlm_message *ms;
> + struct dlm_mhandle *mh;
> + int to_nodeid, error;
> +
> + add_to_waiters(lkb, mstype);
> +
> + to_nodeid = r->res_nodeid;
> +
> + error = create_message(r, to_nodeid, mstype, &ms, &mh);
> + if (error)
> + goto fail;
> +
> + send_args(r, lkb, ms);
> +
> + error = send_message(mh, ms);
> + if (error)
> + goto fail;
> + return 0;
> +
> + fail:
> + remove_from_waiters(lkb);
> + return error;
> +}
> +
> +static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + return send_common(r, lkb, DLM_MSG_REQUEST);
> +}
> +
> +static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + return send_common(r, lkb, DLM_MSG_CONVERT);
> +}
> +
> +/* FIXME: if this lkb is the only lock we hold on the rsb, then set
> + MASTER_UNCERTAIN to force the next request on the rsb to confirm
> + that the master is still correct. */
> +
> +static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + return send_common(r, lkb, DLM_MSG_UNLOCK);
> +}
> +
> +static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + return send_common(r, lkb, DLM_MSG_CANCEL);
> +}
> +
> +static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + struct dlm_message *ms;
> + struct dlm_mhandle *mh;
> + int to_nodeid, error;
> +
> + to_nodeid = lkb->lkb_nodeid;
> +
> + error = create_message(r, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
> + if (error)
> + goto out;
> +
> + send_args(r, lkb, ms);
> +
> + ms->m_result = 0;
> +
> + error = send_message(mh, ms);
> + out:
> + return error;
> +}
> +
> +static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
> +{
> + struct dlm_message *ms;
> + struct dlm_mhandle *mh;
> + int to_nodeid, error;
> +
> + to_nodeid = lkb->lkb_nodeid;
> +
> + error = create_message(r, to_nodeid, DLM_MSG_BAST, &ms, &mh);
> + if (error)
> + goto out;
> +
> + send_args(r, lkb, ms);
> +
> + ms->m_bastmode = mode;
> +
> + error = send_message(mh, ms);
> + out:
> + return error;
> +}
> +
> +static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
> +{
> + struct dlm_message *ms;
> + struct dlm_mhandle *mh;
> + int to_nodeid, error;
> +
> + add_to_waiters(lkb, DLM_MSG_LOOKUP);
> +
> + to_nodeid = dlm_dir_nodeid(r);
> +
> + error = create_message(r, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
> + if (error)
> + goto fail;
> +
> + send_args(r, lkb, ms);
> +
> + error = send_message(mh, ms);
> + if (error)
> + goto fail;
> + return 0;
> +
> + fail:
> + remove_from_waiters(lkb);
> + return error;
> +}
> +
> +static int send_remove(struct dlm_rsb *r)
> +{
> + struct dlm_message *ms;
> + struct dlm_mhandle *mh;
> + int to_nodeid, error;
> +
> + to_nodeid = dlm_dir_nodeid(r);
> +
> + error = create_message(r, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
> + if (error)
> + goto out;
> +
> + memcpy(ms->m_name, r->res_name, r->res_length);
> +
> + error = send_message(mh, ms);
> + out:
> + return error;
> +}
> +
> +static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
> + int mstype, int rv)
> +{
> + struct dlm_message *ms;
> + struct dlm_mhandle *mh;
> + int to_nodeid, error;
> +
> + to_nodeid = lkb->lkb_nodeid;
> +
> + error = create_message(r, to_nodeid, mstype, &ms, &mh);
> + if (error)
> + goto out;
> +
> + send_args(r, lkb, ms);
> +
> + ms->m_result = rv;
> +
> + error = send_message(mh, ms);
> + out:
> + return error;
> +}
> +
> +static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
> +{
> + return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv);
> +}
> +
> +static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
> +{
> + return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv);
> +}
> +
> +static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
> +{
> + return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv);
> +}
> +
> +static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
> +{
> + return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
> +}
> +
> +static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
> + int ret_nodeid, int rv)
> +{
> + struct dlm_rsb *r = &ls->ls_stub_rsb;
> + struct dlm_message *ms;
> + struct dlm_mhandle *mh;
> + int error, to_nodeid = ms_in->m_header.h_nodeid;
> +
> + error = create_message(r, to_nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
> + if (error)
> + goto out;
> +
> + ms->m_lkid = ms_in->m_lkid;
> + ms->m_result = rv;
> + ms->m_nodeid = ret_nodeid;
> +
> + error = send_message(mh, ms);
> + out:
> + return error;
> +}
> +
> +/* which args we save from a received message depends heavily on the type
> + of message, unlike the send side where we can safely send everything about
> + the lkb for any type of message */
> +
> +static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
> +{
> + lkb->lkb_exflags = ms->m_exflags;
> + lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
> + (ms->m_flags & 0x0000FFFF);
> +}
> +
> +static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
> +{
> + lkb->lkb_sbflags = ms->m_sbflags;
> + lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
> + (ms->m_flags & 0x0000FFFF);
> +}
> +
> +static int receive_namelen(struct dlm_message *ms)
> +{
> + return (ms->m_header.h_length - sizeof(struct dlm_message));
> +}
> +
> +static int receive_range(struct dlm_ls *ls, struct dlm_lkb *lkb,
> + struct dlm_message *ms)
> +{
> + if (lkb->lkb_flags & DLM_IFL_RANGE) {
> + if (!lkb->lkb_range)
> + lkb->lkb_range = allocate_range(ls);
> + if (!lkb->lkb_range)
> + return -ENOMEM;
> + lkb->lkb_range[RQ_RANGE_START] = ms->m_range[0];
> + lkb->lkb_range[RQ_RANGE_END] = ms->m_range[1];
> + }
> + return 0;
> +}
> +
> +static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
> + struct dlm_message *ms)
> +{
> + if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
> + if (!lkb->lkb_lvbptr)
> + lkb->lkb_lvbptr = allocate_lvb(ls);
> + if (!lkb->lkb_lvbptr)
> + return -ENOMEM;
> + memcpy(lkb->lkb_lvbptr, ms->m_lvb, DLM_LVB_LEN);
> + }
> + return 0;
> +}
> +
> +static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
> + struct dlm_message *ms)
> +{
> + lkb->lkb_nodeid = ms->m_header.h_nodeid;
> + lkb->lkb_ownpid = ms->m_pid;
> + lkb->lkb_remid = ms->m_lkid;
> + lkb->lkb_grmode = DLM_LOCK_IV;
> + lkb->lkb_rqmode = ms->m_rqmode;
> + lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
> + lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
> +
> + DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
> +
> + if (receive_range(ls, lkb, ms))
> + return -ENOMEM;
> +
> + if (receive_lvb(ls, lkb, ms))
> + return -ENOMEM;
> +
> + return 0;
> +}
> +
> +static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
> + struct dlm_message *ms)
> +{
> + if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
> + log_error(ls, "convert_args nodeid %d %d lkid %x %x",
> + lkb->lkb_nodeid, ms->m_header.h_nodeid,
> + lkb->lkb_id, lkb->lkb_remid);
> + return -EINVAL;
> + }
> +
> + if (!is_master_copy(lkb))
> + return -EINVAL;
> +
> + if (lkb->lkb_status != DLM_LKSTS_GRANTED)
> + return -EBUSY;
> +
> + if (receive_range(ls, lkb, ms))
> + return -ENOMEM;
> + if (lkb->lkb_range) {
> + lkb->lkb_range[GR_RANGE_START] = 0LL;
> + lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL;
> + }
> +
> + if (receive_lvb(ls, lkb, ms))
> + return -ENOMEM;
> +
> + lkb->lkb_rqmode = ms->m_rqmode;
> + lkb->lkb_lvbseq = ms->m_lvbseq;
> +
> + return 0;
> +}
> +
> +static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
> + struct dlm_message *ms)
> +{
> + if (!is_master_copy(lkb))
> + return -EINVAL;
> + if (receive_lvb(ls, lkb, ms))
> + return -ENOMEM;
> + return 0;
> +}
> +
> +/* We fill in the stub-lkb fields with the info that send_xxxx_reply()
> + uses to send a reply and that the remote end uses to process the reply. */
> +
> +static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
> +{
> + struct dlm_lkb *lkb = &ls->ls_stub_lkb;
> + lkb->lkb_nodeid = ms->m_header.h_nodeid;
> + lkb->lkb_remid = ms->m_lkid;
> +}
> +
> +static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
> +{
> + struct dlm_lkb *lkb;
> + struct dlm_rsb *r;
> + int error, namelen;
> +
> + error = create_lkb(ls, &lkb);
> + if (error)
> + goto fail;
> +
> + receive_flags(lkb, ms);
> + lkb->lkb_flags |= DLM_IFL_MSTCPY;
> + error = receive_request_args(ls, lkb, ms);
> + if (error) {
> + put_lkb(lkb);
> + goto fail;
> + }
> +
> + namelen = receive_namelen(ms);
> +
> + error = find_rsb(ls, ms->m_name, namelen, R_MASTER, &r);
> + if (error) {
> + put_lkb(lkb);
> + goto fail;
> + }
> +
> + lock_rsb(r);
> +
> + attach_lkb(r, lkb);
> + error = do_request(r, lkb);
> + send_request_reply(r, lkb, error);
> +
> + unlock_rsb(r);
> + put_rsb(r);
> +
> + if (error == -EINPROGRESS)
> + error = 0;
> + if (error)
> + put_lkb(lkb);
> + return;
> +
> + fail:
> + setup_stub_lkb(ls, ms);
> + send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
> +}
> +
> +static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
> +{
> + struct dlm_lkb *lkb;
> + struct dlm_rsb *r;
> + int error;
> +
> + error = find_lkb(ls, ms->m_remid, &lkb);
> + if (error)
> + goto fail;
> +
> + r = lkb->lkb_resource;
> +
> + hold_rsb(r);
> + lock_rsb(r);
> +
> + receive_flags(lkb, ms);
> + error = receive_convert_args(ls, lkb, ms);
> + if (error)
> + goto out;
> +
> + error = do_convert(r, lkb);
> + out:
> + send_convert_reply(r, lkb, error);
> +
> + unlock_rsb(r);
> + put_rsb(r);
> + put_lkb(lkb);
> + return;
> +
> + fail:
> + setup_stub_lkb(ls, ms);
> + send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
> +}
> +
> +static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
> +{
> + struct dlm_lkb *lkb;
> + struct dlm_rsb *r;
> + int error;
> +
> + error = find_lkb(ls, ms->m_remid, &lkb);
> + if (error)
> + goto fail;
> +
> + r = lkb->lkb_resource;
> +
> + hold_rsb(r);
> + lock_rsb(r);
> +
> + receive_flags(lkb, ms);
> + error = receive_unlock_args(ls, lkb, ms);
> + if (error)
> + goto out;
> +
> + error = do_unlock(r, lkb);
> + out:
> + send_unlock_reply(r, lkb, error);
> +
> + unlock_rsb(r);
> + put_rsb(r);
> + put_lkb(lkb);
> + return;
> +
> + fail:
> + setup_stub_lkb(ls, ms);
> + send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
> +}
> +
> +static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
> +{
> + struct dlm_lkb *lkb;
> + struct dlm_rsb *r;
> + int error;
> +
> + error = find_lkb(ls, ms->m_remid, &lkb);
> + if (error)
> + goto fail;
> +
> + receive_flags(lkb, ms);
> +
> + r = lkb->lkb_resource;
> +
> + hold_rsb(r);
> + lock_rsb(r);
> +
> + error = do_cancel(r, lkb);
> + send_cancel_reply(r, lkb, error);
> +
> + unlock_rsb(r);
> + put_rsb(r);
> + put_lkb(lkb);
> + return;
> +
> + fail:
> + setup_stub_lkb(ls, ms);
> + send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
> +}
> +
> +static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
> +{
> + struct dlm_lkb *lkb;
> + struct dlm_rsb *r;
> + int error;
> +
> + error = find_lkb(ls, ms->m_remid, &lkb);
> + if (error) {
> + log_error(ls, "receive_grant no lkb");
> + return;
> + }
> + DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
> +
> + r = lkb->lkb_resource;
> +
> + hold_rsb(r);
> + lock_rsb(r);
> +
> + receive_flags_reply(lkb, ms);
> + grant_lock_pc(r, lkb, ms);
> + queue_cast(r, lkb, 0);
> +
> + unlock_rsb(r);
> + put_rsb(r);
> + put_lkb(lkb);
> +}
> +
> +static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
> +{
> + struct dlm_lkb *lkb;
> + struct dlm_rsb *r;
> + int error;
> +
> + error = find_lkb(ls, ms->m_remid, &lkb);
> + if (error) {
> + log_error(ls, "receive_bast no lkb");
> + return;
> + }
> + DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
> +
> + r = lkb->lkb_resource;
> +
> + hold_rsb(r);
> + lock_rsb(r);
> +
> + queue_bast(r, lkb, ms->m_bastmode);
> +
> + unlock_rsb(r);
> + put_rsb(r);
> + put_lkb(lkb);
> +}
> +
> +static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
> +{
> + int len, error, ret_nodeid, dir_nodeid, from_nodeid;
> +
> + from_nodeid = ms->m_header.h_nodeid;
> +
> + len = receive_namelen(ms);
> +
> + dir_nodeid = dlm_dir_name2nodeid(ls, ms->m_name, len);
> + if (dir_nodeid != dlm_our_nodeid()) {
> + log_error(ls, "lookup dir_nodeid %d from %d",
> + dir_nodeid, from_nodeid);
> + error = -EINVAL;
> + ret_nodeid = -1;
> + goto out;
> + }
> +
> + error = dlm_dir_lookup(ls, from_nodeid, ms->m_name, len, &ret_nodeid);
> + out:
> + send_lookup_reply(ls, ms, ret_nodeid, error);
> +}
> +
> +static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
> +{
> + int len, dir_nodeid, from_nodeid;
> +
> + from_nodeid = ms->m_header.h_nodeid;
> +
> + len = receive_namelen(ms);
> +
> + dir_nodeid = dlm_dir_name2nodeid(ls, ms->m_name, len);
> + if (dir_nodeid != dlm_our_nodeid()) {
> + log_error(ls, "remove dir entry dir_nodeid %d from %d",
> + dir_nodeid, from_nodeid);
> + return;
> + }
> +
> + dlm_dir_remove_entry(ls, from_nodeid, ms->m_name, len);
> +}
> +
> +static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
> +{
> + struct dlm_lkb *lkb;
> + struct dlm_rsb *r;
> + int error;
> +
> + error = find_lkb(ls, ms->m_remid, &lkb);
> + if (error) {
> + log_error(ls, "receive_request_reply no lkb");
> + return;
> + }
> + DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
> +
> + error = remove_from_waiters(lkb);
> + if (error) {
> + log_error(ls, "receive_request_reply not on waiters");
> + goto out;
> + }
> +
> + /* this is the value returned from do_request() on the master */
> + error = ms->m_result;
> +
> + r = lkb->lkb_resource;
> + hold_rsb(r);
> + lock_rsb(r);
> +
> + switch (error) {
> + case -EAGAIN:
> + /* request would block (be queued) on remote master;
> + the unhold undoes the original ref from create_lkb()
> + so it leads to the lkb being freed */
> + queue_cast(r, lkb, -EAGAIN);
> + confirm_master(r, -EAGAIN);
> + unhold_lkb(lkb);
> + break;
> +
> + case -EINPROGRESS:
> + case 0:
> + /* request was queued or granted on remote master */
> + receive_flags_reply(lkb, ms);
> + lkb->lkb_remid = ms->m_lkid;
> + if (error)
> + add_lkb(r, lkb, DLM_LKSTS_WAITING);
> + else {
> + grant_lock_pc(r, lkb, ms);
> + queue_cast(r, lkb, 0);
> + }
> + confirm_master(r, error);
> + break;
> +
> + case -ENOENT:
> + case -ENOTBLK:
> + /* find_rsb failed to find rsb or rsb wasn't master */
> +
> + DLM_ASSERT(test_bit(RESFL_MASTER_WAIT, &r->res_flags),
> + log_print("receive_request_reply error %d", error);
> + dlm_print_lkb(lkb);
> + dlm_print_rsb(r););
> +
> + confirm_master(r, error);
> + lkb->lkb_nodeid = -1;
> + _request_lock(r, lkb);
> + break;
> +
> + default:
> + log_error(ls, "receive_request_reply unknown error %d", error);
> + }
> +
> + unlock_rsb(r);
> + put_rsb(r);
> + out:
> + put_lkb(lkb);
> +}
> +
> +static void _receive_convert_reply(struct dlm_ls *ls, struct dlm_lkb *lkb,
> + struct dlm_message *ms)
> +{
> + struct dlm_rsb *r = lkb->lkb_resource;
> + int error = ms->m_result;
> +
> + hold_rsb(r);
> + lock_rsb(r);
> +
> + /* this is the value returned from do_convert() on the master */
> +
> + switch (error) {
> + case -EAGAIN:
> + /* convert would block (be queued) on remote master */
> + queue_cast(r, lkb, -EAGAIN);
> + break;
> +
> + case -EINPROGRESS:
> + /* convert was queued on remote master */
> + del_lkb(r, lkb);
> + add_lkb(r, lkb, DLM_LKSTS_CONVERT);
> + break;
> +
> + case 0:
> + /* convert was granted on remote master */
> + receive_flags_reply(lkb, ms);
> + grant_lock_pc(r, lkb, ms);
> + queue_cast(r, lkb, 0);
> + break;
> +
> + default:
> + log_error(ls, "receive_convert_reply unknown error %d", error);
> + }
> +
> + unlock_rsb(r);
> + put_rsb(r);
> +}
> +
> +static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
> +{
> + struct dlm_lkb *lkb;
> + int error;
> +
> + error = find_lkb(ls, ms->m_remid, &lkb);
> + if (error) {
> + log_error(ls, "receive_convert_reply no lkb");
> + return;
> + }
> + DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
> +
> + error = remove_from_waiters(lkb);
> + if (error) {
> + log_error(ls, "receive_convert_reply not on waiters");
> + goto out;
> + }
> +
> + _receive_convert_reply(ls, lkb, ms);
> + out:
> + put_lkb(lkb);
> +}
> +
> +static void _receive_unlock_reply(struct dlm_ls *ls, struct dlm_lkb *lkb,
> + struct dlm_message *ms)
> +{
> + struct dlm_rsb *r = lkb->lkb_resource;
> + int error = ms->m_result;
> +
> + hold_rsb(r);
> + lock_rsb(r);
> +
> + /* this is the value returned from do_unlock() on the master */
> +
> + switch (error) {
> + case -DLM_EUNLOCK:
> + receive_flags_reply(lkb, ms);
> + remove_lock_pc(r, lkb);
> + queue_cast(r, lkb, -DLM_EUNLOCK);
> + break;
> + default:
> + log_error(ls, "receive_unlock_reply unknown error %d", error);
> + }
> +
> + unlock_rsb(r);
> + put_rsb(r);
> +}
> +
> +static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
> +{
> + struct dlm_lkb *lkb;
> + int error;
> +
> + error = find_lkb(ls, ms->m_remid, &lkb);
> + if (error) {
> + log_error(ls, "receive_unlock_reply no lkb");
> + return;
> + }
> + DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
> +
> + error = remove_from_waiters(lkb);
> + if (error) {
> + log_error(ls, "receive_unlock_reply not on waiters");
> + goto out;
> + }
> +
> + _receive_unlock_reply(ls, lkb, ms);
> + out:
> + put_lkb(lkb);
> +}
> +
> +static void _receive_cancel_reply(struct dlm_ls *ls, struct dlm_lkb *lkb,
> + struct dlm_message *ms)
> +{
> + struct dlm_rsb *r = lkb->lkb_resource;
> + int error = ms->m_result;
> +
> + hold_rsb(r);
> + lock_rsb(r);
> +
> + /* this is the value returned from do_cancel() on the master */
> +
> + switch (error) {
> + case -DLM_ECANCEL:
> + receive_flags_reply(lkb, ms);
> + revert_lock_pc(r, lkb);
> + queue_cast(r, lkb, -DLM_ECANCEL);
> + break;
> + default:
> + log_error(ls, "receive_cancel_reply unknown error %d", error);
> + }
> +
> + unlock_rsb(r);
> + put_rsb(r);
> +}
> +
> +static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
> +{
> + struct dlm_lkb *lkb;
> + int error;
> +
> + error = find_lkb(ls, ms->m_remid, &lkb);
> + if (error) {
> + log_error(ls, "receive_cancel_reply no lkb");
> + return;
> + }
> + DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
> +
> + error = remove_from_waiters(lkb);
> + if (error) {
> + log_error(ls, "receive_cancel_reply not on waiters");
> + goto out;
> + }
> +
> + _receive_cancel_reply(ls, lkb, ms);
> + out:
> + put_lkb(lkb);
> +}
> +
> +static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
> +{
> + struct dlm_lkb *lkb;
> + struct dlm_rsb *r;
> + int error, ret_nodeid;
> +
> + error = find_lkb(ls, ms->m_lkid, &lkb);
> + if (error) {
> + log_error(ls, "receive_lookup_reply no lkb");
> + return;
> + }
> +
> + error = remove_from_waiters(lkb);
> + if (error) {
> + log_error(ls, "receive_lookup_reply not on waiters");
> + goto out;
> + }
> +
> + /* this is the value returned by dlm_dir_lookup on dir node
> + FIXME: will a non-zero error ever be returned? */
> + error = ms->m_result;
> +
> + r = lkb->lkb_resource;
> + hold_rsb(r);
> + lock_rsb(r);
> +
> + ret_nodeid = ms->m_nodeid;
> + if (ret_nodeid == dlm_our_nodeid())
> + r->res_nodeid = ret_nodeid = 0;
> + else {
> + r->res_nodeid = ret_nodeid;
> + r->res_trial_lkid = lkb->lkb_id;
> + }
> +
> + _request_lock(r, lkb);
> +
> + if (!ret_nodeid)
> + confirm_master(r, 0);
> +
> + unlock_rsb(r);
> + put_rsb(r);
> + out:
> + put_lkb(lkb);
> +}
> +
> +int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
> +{
> + struct dlm_message *ms = (struct dlm_message *) hd;
> + struct dlm_ls *ls;
> + int error;
> +
> + if (!recovery)
> + dlm_message_in(ms);
> +
> + ls = dlm_find_lockspace_global(hd->h_lockspace);
> + if (!ls) {
> + log_print("drop message %d from %d for unknown lockspace %d",
> + ms->m_type, nodeid, hd->h_lockspace);
> + return -EINVAL;
> + }
> +
> + /* recovery may have just ended leaving a bunch of backed-up requests
> + in the requestqueue; wait while dlm_recoverd clears them */
> +
> + if (!recovery)
> + dlm_wait_requestqueue(ls);
> +
> + /* recovery may have just started while there were a bunch of
> + in-flight requests -- save them in requestqueue to be processed
> + after recovery. we can't let dlm_recvd block on the recovery
> + lock. if dlm_recoverd is calling this function to clear the
> + requestqueue, it needs to be interrupted (-EINTR) if another
> + recovery operation is starting. */
> +
> + while (1) {
> + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
> + if (!recovery)
> + dlm_add_requestqueue(ls, nodeid, hd);
> + error = -EINTR;
> + goto out;
> + }
> +
> + if (lock_recovery_try(ls))
> + break;
> + schedule();
> + }
> +
> + switch (ms->m_type) {
> +
> + /* messages sent to a master node */
> +
> + case DLM_MSG_REQUEST:
> + receive_request(ls, ms);
> + break;
> +
> + case DLM_MSG_CONVERT:
> + receive_convert(ls, ms);
> + break;
> +
> + case DLM_MSG_UNLOCK:
> + receive_unlock(ls, ms);
> + break;
> +
> + case DLM_MSG_CANCEL:
> + receive_cancel(ls, ms);
> + break;
> +
> + /* messages sent from a master node (replies to above) */
> +
> + case DLM_MSG_REQUEST_REPLY:
> + receive_request_reply(ls, ms);
> + break;
> +
> + case DLM_MSG_CONVERT_REPLY:
> + receive_convert_reply(ls, ms);
> + break;
> +
> + case DLM_MSG_UNLOCK_REPLY:
> + receive_unlock_reply(ls, ms);
> + break;
> +
> + case DLM_MSG_CANCEL_REPLY:
> + receive_cancel_reply(ls, ms);
> + break;
> +
> + /* messages sent from a master node (only two types of async msg) */
> +
> + case DLM_MSG_GRANT:
> + receive_grant(ls, ms);
> + break;
> +
> + case DLM_MSG_BAST:
> + receive_bast(ls, ms);
> + break;
> +
> + /* messages sent to a dir node */
> +
> + case DLM_MSG_LOOKUP:
> + receive_lookup(ls, ms);
> + break;
> +
> + case DLM_MSG_REMOVE:
> + receive_remove(ls, ms);
> + break;
> +
> + /* messages sent from a dir node (remove has no reply) */
> +
> + case DLM_MSG_LOOKUP_REPLY:
> + receive_lookup_reply(ls, ms);
> + break;
> +
> + default:
> + log_error(ls, "unknown message type %d", ms->m_type);
> + }
> +
> + unlock_recovery(ls);
> + out:
> + dlm_put_lockspace(ls);
> + dlm_astd_wake();
> + return 0;
> +}
> +
> +
> +/*
> + * Recovery related
> + */
> +
> +static int middle_conversion(struct dlm_lkb *lkb)
> +{
> + if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) ||
> + (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW))
> + return TRUE;
> + return FALSE;
> +}
> +
> +static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
> +{
> + if (middle_conversion(lkb)) {
> + hold_lkb(lkb);
> + ls->ls_stub_ms.m_result = -EINPROGRESS;
> + _remove_from_waiters(lkb);
> + _receive_convert_reply(ls, lkb, &ls->ls_stub_ms);
> +
> + /* Same special case as in receive_rcom_lock_args() */
> + lkb->lkb_grmode = DLM_LOCK_IV;
> + set_bit(RESFL_RECOVER_CONVERT, &lkb->lkb_resource->res_flags);
> + unhold_lkb(lkb);
> +
> + } else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
> + lkb->lkb_flags |= DLM_IFL_RESEND;
> +
> + } else if (lkb->lkb_rqmode < lkb->lkb_grmode) {
> + hold_lkb(lkb);
> + ls->ls_stub_ms.m_result = 0;
> + _remove_from_waiters(lkb);
> + _receive_convert_reply(ls, lkb, &ls->ls_stub_ms);
> + unhold_lkb(lkb);
> + }
> +}
> +
> +/* Recovery for locks that are waiting for replies from nodes that are now
> + gone. We can just complete unlocks and cancels by faking a reply from the
> + dead node. Requests and up-conversions we just flag to be resent after
> + recovery. Down-conversions can just be completed with a fake reply like
> + unlocks. Conversions between PR and CW need special attention. */
> +
> +void dlm_recover_waiters_pre(struct dlm_ls *ls)
> +{
> + struct dlm_lkb *lkb, *safe;
> +
> + down(&ls->ls_waiters_sem);
> +
> + list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
> + if (!dlm_is_removed(ls, lkb->lkb_nodeid))
> + continue;
> +
> + log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
> + lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);
> +
> + switch (lkb->lkb_wait_type) {
> +
> + case DLM_MSG_REQUEST:
> + lkb->lkb_flags |= DLM_IFL_RESEND;
> + break;
> +
> + case DLM_MSG_CONVERT:
> + recover_convert_waiter(ls, lkb);
> + break;
> +
> + case DLM_MSG_UNLOCK:
> + hold_lkb(lkb);
> + ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
> + _remove_from_waiters(lkb);
> + _receive_unlock_reply(ls, lkb, &ls->ls_stub_ms);
> + put_lkb(lkb);
> + break;
> +
> + case DLM_MSG_CANCEL:
> + hold_lkb(lkb);
> + ls->ls_stub_ms.m_result = -DLM_ECANCEL;
> + _remove_from_waiters(lkb);
> + _receive_cancel_reply(ls, lkb, &ls->ls_stub_ms);
> + put_lkb(lkb);
> + break;
> +
> + case DLM_MSG_LOOKUP:
> + /* all outstanding lookups, regardless of dest.
> + will be resent after recovery is done */
> + break;
> +
> + default:
> + log_error(ls, "invalid lkb wait_type %d",
> + lkb->lkb_wait_type);
> + }
> + }
> + up(&ls->ls_waiters_sem);
> +}
> +
> +static int remove_resend_waiter(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
> +{
> + struct dlm_lkb *lkb;
> + int rv = 0;
> +
> + down(&ls->ls_waiters_sem);
> + list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
> + if (lkb->lkb_flags & DLM_IFL_RESEND) {
> + rv = lkb->lkb_wait_type;
> + _remove_from_waiters(lkb);
> + lkb->lkb_flags &= ~DLM_IFL_RESEND;
> + break;
> + }
> + }
> + up(&ls->ls_waiters_sem);
> +
> + if (!rv)
> + lkb = NULL;
> + *lkb_ret = lkb;
> + return rv;
> +}
> +
> +/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the
> + master or dir-node for r. Processing the lkb may result in it being placed
> + back on waiters. */
> +
> +int dlm_recover_waiters_post(struct dlm_ls *ls)
> +{
> + struct dlm_lkb *lkb;
> + struct dlm_rsb *r;
> + int error = 0, mstype;
> +
> + while (1) {
> + if (!test_bit(LSFL_LS_RUN, &ls->ls_flags)) {
> + log_debug(ls, "recover_waiters_post aborted");
> + error = -EINTR;
> + break;
> + }
> +
> + mstype = remove_resend_waiter(ls, &lkb);
> + if (!mstype)
> + break;
> +
> + r = lkb->lkb_resource;
> +
> + log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
> + lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
> +
> + switch (mstype) {
> +
> + case DLM_MSG_LOOKUP:
> + case DLM_MSG_REQUEST:
> + hold_rsb(r);
> + lock_rsb(r);
> + _request_lock(r, lkb);
> + unlock_rsb(r);
> + put_rsb(r);
> + break;
> +
> + case DLM_MSG_CONVERT:
> + hold_rsb(r);
> + lock_rsb(r);
> + _convert_lock(r, lkb);
> + unlock_rsb(r);
> + put_rsb(r);
> + break;
> +
> + default:
> + log_error(ls, "recover_waiters_post type %d", mstype);
> + }
> + }
> +
> + return error;
> +}
> +
> +static int purge_queue(struct dlm_rsb *r, struct list_head *queue)
> +{
> + struct dlm_ls *ls = r->res_ls;
> + struct dlm_lkb *lkb, *safe;
> +
> + list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
> + if (!is_master_copy(lkb))
> + continue;
> +
> + if (dlm_is_removed(ls, lkb->lkb_nodeid)) {
> + del_lkb(r, lkb);
> + /* this put should free the lkb */
> + if (!put_lkb(lkb))
> + log_error(ls, "purged lkb not released");
> + }
> + }
> + return 0;
> +}
> +
> +/*
> + * Get rid of locks held by nodes that are gone.
> + */
> +
> +int dlm_purge_locks(struct dlm_ls *ls)
> +{
> + struct dlm_rsb *r;
> +
> + log_debug(ls, "dlm_purge_locks");
> +
> + down_write(&ls->ls_root_sem);
> + list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
> + hold_rsb(r);
> + lock_rsb(r);
> +
> + purge_queue(r, &r->res_grantqueue);
> + purge_queue(r, &r->res_convertqueue);
> + purge_queue(r, &r->res_waitqueue);
> +
> + unlock_rsb(r);
> + unhold_rsb(r);
> +
> + schedule();
> + }
> + up_write(&ls->ls_root_sem);
> +
> + return 0;
> +}
> +
> +int dlm_grant_after_purge(struct dlm_ls *ls)
> +{
> + struct dlm_rsb *r;
> + int i;
> +
> + for (i = 0; i < ls->ls_rsbtbl_size; i++) {
> + read_lock(&ls->ls_rsbtbl[i].lock);
> + list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
> + hold_rsb(r);
> + lock_rsb(r);
> + if (is_master(r))
> + grant_pending_locks(r);
> + unlock_rsb(r);
> + put_rsb(r);
> + }
> + read_unlock(&ls->ls_rsbtbl[i].lock);
> + }
> +
> + return 0;
> +}
> +
> +static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
> + uint32_t remid)
> +{
> + struct dlm_lkb *lkb;
> +
> + list_for_each_entry(lkb, head, lkb_statequeue) {
> + if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid)
> + return lkb;
> + }
> + return NULL;
> +}
> +
> +static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
> + uint32_t remid)
> +{
> + struct dlm_lkb *lkb;
> +
> + lkb = search_remid_list(&r->res_grantqueue, nodeid, remid);
> + if (lkb)
> + return lkb;
> + lkb = search_remid_list(&r->res_convertqueue, nodeid, remid);
> + if (lkb)
> + return lkb;
> + lkb = search_remid_list(&r->res_waitqueue, nodeid, remid);
> + if (lkb)
> + return lkb;
> + return NULL;
> +}
> +
> +static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
> + struct dlm_rsb *r, struct dlm_rcom *rc)
> +{
> + struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
> +
> + lkb->lkb_nodeid = rc->rc_header.h_nodeid;
> + lkb->lkb_ownpid = rl->rl_ownpid;
> + lkb->lkb_remid = rl->rl_lkid;
> + lkb->lkb_exflags = rl->rl_exflags;
> + lkb->lkb_flags = rl->rl_flags & 0x0000FFFF;
> + lkb->lkb_flags |= DLM_IFL_MSTCPY;
> + lkb->lkb_lvbseq = rl->rl_lvbseq;
> + lkb->lkb_rqmode = rl->rl_rqmode;
> + lkb->lkb_grmode = rl->rl_grmode;
> + /* don't set lkb_status because add_lkb wants to itself */
> +
> + lkb->lkb_bastaddr = (void *) (long) (rl->rl_asts & AST_BAST);
> + lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
> +
> + if (lkb->lkb_flags & DLM_IFL_RANGE) {
> + lkb->lkb_range = allocate_range(ls);
> + if (!lkb->lkb_range)
> + return -ENOMEM;
> + memcpy(lkb->lkb_range, rl->rl_range, 4*sizeof(uint64_t));
> + }
> +
> + if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
> + lkb->lkb_lvbptr = allocate_lvb(ls);
> + if (!lkb->lkb_lvbptr)
> + return -ENOMEM;
> + memcpy(lkb->lkb_lvbptr, rl->rl_lvb, DLM_LVB_LEN);
> + }
> +
> + /* Conversions between PR and CW (middle modes) need special handling.
> + The real granted mode of these converting locks cannot be determined
> + until all locks have been rebuilt on the rsb (recover_conversion) */
> +
> + if (rl->rl_wait_type == DLM_MSG_CONVERT && middle_conversion(lkb)) {
> + rl->rl_status = DLM_LKSTS_CONVERT;
> + lkb->lkb_grmode = DLM_LOCK_IV;
> + set_bit(RESFL_RECOVER_CONVERT, &r->res_flags);
> + }
> +
> + return 0;
> +}
> +
> +/* This lkb may have been recovered in a previous aborted recovery so we need
> + to check if the rsb already has an lkb with the given remote nodeid/lkid.
> + If so we just send back a standard reply. If not, we create a new lkb with
> + the given values and send back our lkid. We send back our lkid by sending
> + back the rcom_lock struct we got but with the remid field filled in. */
> +
> +int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
> +{
> + struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
> + struct dlm_rsb *r;
> + struct dlm_lkb *lkb;
> + int error;
> +
> + if (rl->rl_parent_lkid) {
> + error = -EOPNOTSUPP;
> + goto out;
> + }
> +
> + error = find_rsb(ls, rl->rl_name, rl->rl_namelen, R_MASTER, &r);
> + if (error)
> + goto out;
> +
> + lock_rsb(r);
> +
> + lkb = search_remid(r, rc->rc_header.h_nodeid, rl->rl_lkid);
> + if (lkb) {
> + error = -EEXIST;
> + goto out_remid;
> + }
> +
> + error = create_lkb(ls, &lkb);
> + if (error)
> + goto out_unlock;
> +
> + error = receive_rcom_lock_args(ls, lkb, r, rc);
> + if (error) {
> + put_lkb(lkb);
> + goto out_unlock;
> + }
> +
> + attach_lkb(r, lkb);
> + add_lkb(r, lkb, rl->rl_status);
> + error = 0;
> +
> + out_remid:
> + /* this is the new value returned to the lock holder for
> + saving in its process-copy lkb */
> + rl->rl_remid = lkb->lkb_id;
> +
> + out_unlock:
> + unlock_rsb(r);
> + put_rsb(r);
> + out:
> + rl->rl_result = error;
> + return error;
> +}
> +
> +int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
> +{
> + struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
> + struct dlm_rsb *r;
> + struct dlm_lkb *lkb;
> + int error;
> +
> + error = find_lkb(ls, rl->rl_lkid, &lkb);
> + if (error) {
> + log_error(ls, "recover_process_copy no lkid %x", rl->rl_lkid);
> + return error;
> + }
> +
> + DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
> +
> + error = rl->rl_result;
> +
> + r = lkb->lkb_resource;
> + hold_rsb(r);
> + lock_rsb(r);
> +
> + switch (error) {
> + case -EEXIST:
> + log_debug(ls, "master copy exists %x", lkb->lkb_id);
> + /* fall through */
> + case 0:
> + lkb->lkb_remid = rl->rl_remid;
> + break;
> + default:
> + log_error(ls, "dlm_recover_process_copy unknown error %d %x",
> + error, lkb->lkb_id);
> + }
> +
> + /* an ack for dlm_recover_locks() which waits for replies from
> + all the locks it sends to new masters */
> + dlm_recovered_lock(r);
> +
> + unlock_rsb(r);
> + put_rsb(r);
> + put_lkb(lkb);
> +
> + return 0;
> +}
> +
> +
Daniel Phillips writes:
> Hi Nikita,
>
> On Monday 25 April 2005 14:34, you wrote:
> > > +
> > > +static int is_remote(struct dlm_rsb *r)
> > > +{
> > > + DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
> > > + return r->res_nodeid ? TRUE : FALSE;
> > > +}
> >
> > This can be simply
> >
> > return r->res_nodeid;
>
> Not quite the same. Perhaps you meant:
>
> return !!r->res_nodeid;
Strictly speaking yes (assuming TRUE is defined as 1), but the name
is_remote() implies usages like
if (is_remote(r)) {
do_something();
}
in such contexts !! is not necessary.
>
> Regards,
>
> Daniel
Nikita.
On Monday 25 April 2005 18:27, Nikita Danilov wrote:
> > > > +
> > > > +static int is_remote(struct dlm_rsb *r)
> > > > +{
> > > > + DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
> > > > + return r->res_nodeid ? TRUE : FALSE;
> > > > +}
> > >
> > > This can be simply
> > >
> > > return r->res_nodeid;
> >
> > Not quite the same. Perhaps you meant:
> >
> > return !!r->res_nodeid;
>
> Strictly speaking yes (assuming TRUE is defined as 1), but the name
> is_remote() implies usages like
>
> if (is_remote(r)) {
> do_something();
> }
>
> in such contexts !! is not necessary.
Any objection to making it inline and letting the compiler delete the redundant
code? The principle is: it's better to spell out "!!" when that's intended,
rather than build in a nasty surprise for later. The inline code will be
smaller than a function call anyway.
Regards,
Daniel
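
A minimal sketch of the inline variant being discussed might look like the
following (hypothetical; not taken from the posted patch):

static inline int is_remote(struct dlm_rsb *r)
{
	/* normalize to 0/1 explicitly so callers comparing against TRUE
	   still behave; in plain boolean contexts the compiler can drop
	   the !! entirely once the call is inlined */
	DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
	return !!r->res_nodeid;
}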
On Monday 25 April 2005 16:41, Jesper Juhl wrote:
> > +	}
> > +
> > +	if (!lkb->lkb_lvbptr)
> > +		return;
>
> goto out;
>
> > +
> > +	if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
> > +		return;
>
> goto out;
>
> > +
> > +	if (!r->res_lvbptr)
> > +		r->res_lvbptr = allocate_lvb(r->res_ls);
> > +
> > +	memcpy(r->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
> > +	r->res_lvbseq++;
> > +	clear_bit(RESFL_VALNOTVALID, &r->res_flags);
>
> out:
> 	return;
>
> > +}
>
> A single function exit point (one return) instead of multiple reduces the risk
> of errors when code is later modified.
> Applies to many other functions besides this one (and this one may not
> even be the best example, but hey, I wanted to make that comment, and
> this function was at hand).
Great comments on the whole, but this one is really well into the "matter of
taste" zone. Naked return vs goto return... either way is ugly. I prefer
the style that is two lines shorter and does not make my eyes do an extra
hop.
Regards,
Daniel
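
For reference, the two styles under discussion, side by side (an illustrative
sketch with simplified bodies and made-up names, not code from the patch):

/* multiple early returns, as in the posted patch */
static void set_lvb_sketch_returns(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	if (!lkb->lkb_lvbptr)
		return;
	if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
		return;
	/* ... copy the LVB into r->res_lvbptr ... */
}

/* single exit through a goto label, as suggested in the review */
static void set_lvb_sketch_goto(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	if (!lkb->lkb_lvbptr)
		goto out;
	if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
		goto out;
	/* ... copy the LVB into r->res_lvbptr ... */
 out:
	return;
}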
On Mon, Apr 25, 2005 at 02:54:58PM -0700, Steven Dake wrote:
> On Mon, 2005-04-25 at 09:58, David Teigland wrote:
> > The core dlm functions. Processes dlm_lock() and dlm_unlock() requests.
> > Creates lockspaces which give applications separate contexts/namespaces in
> > which to do their locking. Manages locks on resources' grant/convert/wait
> > queues. Sends and receives high level locking operations between nodes.
> > Delivers completion and blocking callbacks (ast's) to lock holders.
> > Manages the distributed directory that tracks the current master node for
> > each resource.
> >
>
> David
>
> Very positive there are some submissions relating to cluster kernel work
> for lkml to review.. good job..
>
> I have some questions on the implementation:
>
> It appears as though a particular processor is identified as the "lock
> master" or processor that maintains the state of the lock. So for
> example, if a processor wants to acquire a lock, it sends a request to
> the lock master which either grants or rejects the request for the
> lock. What happens in the scenario that a lock master leaves the
> current configuration? This scenario is very likely in practice.
Of course, every time a node fails.
> How do you synchronize the membership events that occur with the kernel
> to kernel communication that takes place using SCTP?
SCTP isn't much different than TCP, so I'm not sure how that's relevant.
It's used primarily so we can take advantage of multi-homing when you have
redundant networks.
When the membership of a lockspace needs to change, whether adding or
removing a node, activity is suspended in that lockspace on all the nodes
using it. After all are suspended, the lockspace is then told (on all
lockspace members) what the new membership is. Recovery then takes place:
new masters are selected and waiting requests redirected.
> It appears from your patches there is some external (userland)
> application that maintains the current list of processors that qualify
> as "lock servers".
correct
> Is there then a dependence on external membership algorithms?
We simply require that the membership system is in agreement before the
lockspace is told what the new members are. The membership system
ultimately drives the lockspace membership and we can't have the
membership system on different nodes telling the dlm different stories
about who's in/out.
So, yes, the membership system ultimately needs to follow some algorithm
that guarantees agreement. There are rigorous, distributed ways of doing
that (your evs work which I look forward to using), and simpler methods,
e.g. driving it from some single point of control.
> What user application today works to configure the dlm services in the
> posted patch?
I've been using the command line program "dlm_tool" where I act as the
membership system myself. We're just putting together pieces that will
drive this from a membership system (like openais). Again, the pieces you
decide to use in userspace are flexible and depend on how you want to use
the dlm.
> With usage of SCTP protocol, there is now some idea of moving the
> protocol for cluster communication into the kernel and using SCTP as
> that protocol...
Neither SCTP nor the dlm are about cluster communication, they're both
about simple point-to-point messages. When you move up to userspace and
start talking about membership, then the issue of group communication
models comes up and your openais/evs work is very relevant. Might you be
misled about what SCTP does?
Dave
On Mon, 2005-04-25 at 22:49, David Teigland wrote:
> On Mon, Apr 25, 2005 at 02:54:58PM -0700, Steven Dake wrote:
> > On Mon, 2005-04-25 at 09:58, David Teigland wrote:
> > > The core dlm functions. Processes dlm_lock() and dlm_unlock() requests.
> > > Creates lockspaces which give applications separate contexts/namespaces in
> > > which to do their locking. Manages locks on resources' grant/convert/wait
> > > queues. Sends and receives high level locking operations between nodes.
> > > Delivers completion and blocking callbacks (ast's) to lock holders.
> > > Manages the distributed directory that tracks the current master node for
> > > each resource.
> > >
> >
> > David
> >
> > Very positive there are some submissions relating to cluster kernel work
> > for lkml to review.. good job..
> >
> > I have some questions on the implementation:
> >
> > It appears as though a particular processor is identified as the "lock
> > master" or processor that maintains the state of the lock. So for
> > example, if a processor wants to acquire a lock, it sends a request to
> > the lock master which either grants or rejects the request for the
> > lock. What happens in the scenario that a lock master leaves the
> > current configuration? This scenario is very likely in practice.
>
> Of course, every time a node fails.
>
> > How do you synchronize the membership events that occur with the kernel
> > to kernel communication that takes place using SCTP?
>
> SCTP isn't much different than TCP, so I'm not sure how that's relevant.
> It's used primarily so we can take advantage of multi-homing when you have
> redundant networks.
>
> When the membership of a lockspace needs to change, whether adding or
> removing a node, activity is suspended in that lockspace on all the nodes
> using it. After all are suspended, the lockspace is then told (on all
> lockspace members) what the new membership is. Recovery then takes place:
> new masters are selected and waiting requests redirected.
>
>
> > It appears from your patches there is some external (userland)
> > application that maintains the current list of processors that qualify
> > as "lock servers".
>
> correct
>
> > Is there then a dependence on external membership algorithms?
>
> We simply require that the membership system is in agreement before the
> lockspace is told what the new members are. The membership system
> ultimately drives the lockspace membership and we can't have the
> membership system on different nodes telling the dlm different stories
> about who's in/out.
>
> So, yes, the membership system ultimately needs to follow some algorithm
> that guarantees agreement. There are rigorous, distributed ways of doing
> that (your evs work which I look forward to using), and simpler methods,
> e.g. driving it from some single point of control.
>
>
> > What user application today works to configure the dlm services in the
> > posted patch?
>
> I've been using the command line program "dlm_tool" where I act as the
> membership system myself. We're just putting together pieces that will
> drive this from a membership system (like openais). Again, the pieces you
> decide to use in userspace are flexible and depend on how you want to use
> the dlm.
>
>
> > With usage of SCTP protocol, there is now some idea of moving the
> > protocol for cluster communication into the kernel and using SCTP as
> > that protocol...
>
> Neither SCTP nor the dlm are about cluster communication, they're both
> about simple point-to-point messages. When you move up to userspace and
> start talking about membership, then the issue of group communication
> models comes up and your openais/evs work is very relevant. Might you be
> misled about what SCTP does?
>
Hate to admit ignorance, but I'm not really sure what SCTP does.. I
guess point to point communication like tcp but with some other kind of
characteristics.. I wanted to have some idea of how locking messages
are related to the current membership. I think I understand the system
from your descriptions and reading the code. One scenario I could see
happening is that there are 2 processors A, B.
B drops out of membership
A sends lock to lock master B (but A doens't know B has dropped out of
membership yet)
B gets lock request, but has dropped out of membership or failed in some
way
In this case the order of lock messages with the membership changes is
important. This is the essential race that describes almost every issue
with distributed systems... virtual synchrony makes this scenario
impossible by ensuring that messages are ordered in relationship to
membership changes.
Do you intend to eventually move the point to point communication into
userspace, or keep it within kernel? I'd like to understand if there is
a general need for cluster communication as a kernel service, or the
intent is for all communication to be done in userspace...
You guys have done a good job here...
Can I ask a performance question.. How many locks per second can be
acquired and then released with a system of 2-3 processors? In the case
that the processor requesting the lock is on the lock server processor,
and in the case that the processor requesting the lock is not on the
lock server processor... (processor in this case is the system that
processes the lock operations). Assuming all locks are uncontended...
regards
-steve
> Dave
>
On Tuesday 26 April 2005 13:40, Steven Dake wrote:
> Hate to admit ignorance, but I'm not really sure what SCTP does.. I
> guess point to point communication like tcp but with some other kind of
> characteristics.. I wanted to have some idea of how locking messages
> are related to the current membership. I think I understand the system
> from your descriptions and reading the code. One scenario I could see
> happening is that there are 2 processors A, B.
>
> B drops out of membership
> A sends lock to lock master B (but A doens't know B has dropped out of
> membership yet)
> B gets lock request, but has dropped out of membership or failed in some
> way
>
> In this case the order of lock messages with the membership changes is
> important. This is the essential race that describes almost every issue
> with distributed systems... virtual synchrony makes this scenario
> impossible by ensuring that messages are ordered in relationship to
> membership changes.
It sounds great, but didn't somebody benchmark your virtual synchrony code and
find that it only manages to deliver some tiny dribble of messages/second? I
could be entirely wrong about that, but I got the impression that your
algorithm as implemented is not in the right performance ballpark for
handling the cfs lock traffic itself.
If I'm right about the performance, there still might be a niche in the
membership algorithms, but even there I'd be worried about performance.
Obviously, what you need to make your case with is a demo. If anybody is
going to write that, it will almost certainly be you. You might consider
using my csnap code for your demo, because it has a pluggable infrastructure
interface which takes the form of a relatively simple C program
("csnap-agent") of which an example is provided that uses the cman interface.
You could simply replace the cman calls by virtual synchrony calls and show
us how amazing this algorithm really is.
http://sourceware.org/cluster/csnap/
All dependencies on cluster infrastructure - (g)dlm and cman - are
concentrated in that one "agent" file. Apart from that, the code depends
only on what you would expect to find in a standard 2.6 setup. You can even
run a non-clustered filesystem like ext3 on the csnap block device, so you
don't have to worry about setting up GFS either, for the time being. This
should be pretty much an ideal test platform for your ideas.
Regards,
Daniel
On Tue, 2005-04-26 at 15:24, Daniel Phillips wrote:
> On Tuesday 26 April 2005 13:40, Steven Dake wrote:
> > Hate to admit ignorance, but I'm not really sure what SCTP does.. I
> > guess point to point communication like tcp but with some other kind of
> > characteristics.. I wanted to have some idea of how locking messages
> > are related to the current membership. I think I understand the system
> > from your descriptions and reading the code. One scenario I could see
> > happening is that there are 2 processors A, B.
> >
> > B drops out of membership
> > A sends lock to lock master B (but A doens't know B has dropped out of
> > membership yet)
> > B gets lock request, but has dropped out of membership or failed in some
> > way
> >
> > In this case the order of lock messages with the membership changes is
> > important. This is the essential race that describes almost every issue
> > with distributed systems... virtual synchrony makes this scenario
> > impossible by ensuring that messages are ordered in relationship to
> > membership changes.
>
> It sounds great, but didn't somebody benchmark your virtual synchrony code and
> find that it only manages to deliver some tiny dribble of messages/second? I
> could be entirely wrong about that, but I got the impression that your
> algorithm as implemented is not in the right performance ballpark for
> handling the cfs lock traffic itself.
Daniel,
Please point me at the benchmark. I am unaware of any claims that
virtual synchrony performs poorly... Your performance impressions may
be swayed by the benchmark results in this message...
We have been over this before... In September 2004, I posted benchmarks
to lkml (in a response to your questions about performance numbers)
which show messages per second of 7820 for 100 byte messages. I'd be
impressed to see any other protocol deliver that number of messages per
second (in and of itself), maintain self delivery, implicit
acknowledgement, agreed ordering, and virtual synchrony...
Here is the original response to your request for performance
information:
http://marc.theaimsgroup.com/?l=linux-kernel&m=109410546919884&w=2
The same benchmark run on the current code in bk (with turning off some
printk debug junk) is:
An Intel Xeon 2.4 GHz, in a 2-node cluster on 100 Mbit WITH encryption,
shows 15182 messages/sec. The improvement comes from code improvements,
and is also clearly a factor of the CPU, since messages are packed, which
consumes more CPU cycles. (Note: TP/s is messages per second, MB/s is
the megabytes per second of data delivered.) On my network at home of
3.2/3.4 GHz machines, I get about 30000 messages per second.
151825 Writes 100 bytes per write 10.000 Seconds runtime 15182.931 TP/s 1.518 MB/s.
140711 Writes 200 bytes per write 10.000 Seconds runtime 14071.252 TP/s 2.814 MB/s.
133149 Writes 300 bytes per write 10.000 Seconds runtime 13314.929 TP/s 3.994 MB/s.
120282 Writes 400 bytes per write 10.000 Seconds runtime 12028.057 TP/s 4.811 MB/s.
108876 Writes 500 bytes per write 10.000 Seconds runtime 10887.878 TP/s 5.444 MB/s.
99360 Writes 600 bytes per write 10.000 Seconds runtime 9936.053 TP/s 5.962 MB/s.
92615 Writes 700 bytes per write 10.000 Seconds runtime 9261.535 TP/s 6.483 MB/s.
85734 Writes 800 bytes per write 10.000 Seconds runtime 8573.459 TP/s 6.859 MB/s.
77132 Writes 900 bytes per write 10.000 Seconds runtime 7713.086 TP/s 6.942 MB/s.
71927 Writes 1000 bytes per write 10.000 Seconds runtime 7192.771 TP/s 7.193 MB/s.
68304 Writes 1100 bytes per write 10.000 Seconds runtime 6830.465 TP/s 7.514 MB/s.
65767 Writes 1200 bytes per write 10.000 Seconds runtime 6576.728 TP/s 7.892 MB/s.
64288 Writes 1300 bytes per write 10.000 Seconds runtime 6428.909 TP/s 8.358 MB/s.
When you consider that no reply is required to implement a lock service
with virtual synchrony, the performance is even better. This could,
essentially, provide 15182 lock acquisitions per second on a 2.4 GHz CPU
(if the lock request is 100 bytes).
Are you suggesting this is a dribble? What kind of performance would
you find acceptable?
Your suggestion, reworking redhat's cluster suite to use virtual
synchrony (as a demo?), sounds intriguing. However, I just don't have
the bandwidth at this time to take on any more projects (although I am
happy to support redhat's use of virtual synchrony). The community,
however, would very much benefit from redhat leading such an effort.
regards,
-steve
On Tuesday 26 April 2005 19:04, Steven Dake wrote:
> ...Your performance impressions may be swayed by the benchmark results in
> this message...
>
> We have been over this before... In September 2004, I posted benchmarks
> to lkml (in a response to your questions about performance numbers)
> which show messages per second of 7820 for 100 byte messages.
Hi Steven,
The source of the benchmark I alluded to is lost in the mists of my foggy
memory, however the numbers you just gave seem to be about the same as I
remember.
I get >>several hundred thousand<< synchronization messages per second in my
cluster block devices, using ordinary TCP sockets over 100 Mbit Ethernet.
This may help put things in perspective.
> I'd be
> impressed to see any other protocol deliver that number of messages per
> second (in and of itself), maintain self delivery, implicit
> acknowledgement, agreed ordering, and virtual synchrony...
Well, the way I do it is so much faster than what you're seeing that I can
easily justify putting in the extra effort to resolve issues that virtual
synchrony would apparently solve for me automagically.
Please let me save the details for a post tomorrow. Then I will wax poetic
about what we do, or plan to do, to solve the various nasty problems that
come up as a result of membership changes, so that nobody runs such risks as
receiving messages from a node that thinks it is in the cluster but actually
isn't.
> <benchmarks>
> Are you suggesting this is a dribble?
Sorry, in my world, that's a dribble ;-)
I stand by my statement that this is too slow to handle the heavy lifting, and
is marginal even for "slow path" cluster recovery. But if you think
otherwise, you can easily prove it, see below.
> Your suggestion, reworking redhat's cluster suite to use virtual
> synchrony (as a demo?), sounds intriguing. However, I just don't have
> the bandwidth at this time to take on any more projects (although I am
> happy to support redhat's use of virtual synchrony). The community,
> however, would very much benefit from redhat leading such an effort.
I did not suggest reworking Red Hat's cluster suite. I suggested reworking
_one file_ of my cluster snapshot device. This file was designed to be
reworked by someone such as yourself, even someone without an enormous amount
of time on their hands. This file (agent.c) does not handle the high-speed
block device synchronization, it only handles inter-node recovery messages
and other slow-path chores.
For your convenience, the cluster snapshot device can be operated entirely
independently of the rest of the cluster suite, and you don't even need a
cluster.
Regards,
Daniel
Forgive the double post
forgot to hit "reply all"
Daniel
The issue is virtual synchrony for dlm, not virtual synchrony for
synchronization of your block device work. However, since it appears
you would like to address it... While we could talk about your
particular design, virtual synchrony can synchronize at near wire speed
for larger messages with encryption. In this case, there would be
benefit to synchronizing larger blocks of data at once, instead of
individual (512 byte?) blocks of messages. I really doubt you would see
any performance improvement in the pure synchronization, although you
would get encryption and authentication of messages, and you would not
have those pesky race conditions. So, there would be security, at
little cost to performance.
Now on the topic of race conditions. Any system with race conditions
will eventually (in some short interval of operation) fail. Are you
suggesting that data is entrusted to a system with race conditions that
result in failures in short intervals?
But now back to the original point:
The context we are talking about here is dlm. In the dlm case, I find
it highly unlikely the posted patches can process 100,000 locks per
second (between processors) as your claim would seem to suggest. As the
benchmarks have not been posted, it's hard to see. If the benchmarks are
beyond the 15,000 locks per second that could easily be processed with
virtual synchrony with an average speed processor, then please, correct
me.
Post dlm benchmarks Daniel...
regards
-steve
On Tue, Apr 26, 2005 at 10:40:24AM -0700, Steven Dake wrote:
> Hate to admit ignorance, but I'm not really sure what SCTP does.. I
> guess point to point communication like tcp but with some other kind of
> characteristics.. I wanted to have some idea of how locking messages
> are related to the current membership. I think I understand the system
> from your descriptions and reading the code. One scenario I could see
> happeing is that there are 2 processors A, B.
>
> B drops out of membership
> A sends lock to lock master B (but A doens't know B has dropped out of
> membership yet)
> B gets lock request, but has dropped out of membership or failed in some
> way
>
> In this case the order of lock messages with the membership changes is
> important.
I think this might help clarify: no membership change is applied to the
lockspace on any nodes until the lockspace has first been suspended on
all. Suspending means no locking activity is processed. The lockspace on
all nodes is then told the new membership and does recovery. Locking is
then resumed.
If some lock message from a failed node is somehow still in the network
and arrives at one of the lockspace members now running again, it will
simply be ignored when they see it's from a non-member node.
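
A rough sketch of the kind of check implied here (illustrative only;
dlm_is_member() is a hypothetical stand-in for however the real code consults
the current lockspace membership):

static int message_from_member(struct dlm_ls *ls, int nodeid)
{
	/* drop stray messages from nodes that are no longer members */
	if (!dlm_is_member(ls, nodeid)) {
		log_print("ignoring message from non-member node %d", nodeid);
		return FALSE;
	}
	return TRUE;
}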
> This is the essential race that describes almost every issue with
> distributed systems... virtual synchrony makes this scenario impossible
> by ensuring that messages are ordered in relationship to membership
> changes.
Yes, but the messages discussed in that context are a part of group
multicasts. Specifically, one message from a particular processor needs
to be delivered on all other processors, and all processors must agree on
the order of that group message with respect to membership changes. To do
that you do indeed need to employ VS algorithms.
I know you're more familiar with those details than I am. What I keep
trying to explain is that the dlm is in a different, simpler category.
The dlm's point-to-point messages (think client-server on a per lock
basis) just don't require the same rigorous approach used for _group_
communication.
> Do you intend to eventually move the point to point communication into
> userspace, or keep it within kernel?
In the kernel, there's nothing to it.
> I'd like to understand if there is a general need for cluster
> communication as a kernel service, or the intent is for all
> communication to be done in userspace...
I see no need for group communication in the kernel.
> Can I ask a performance question.. How many locks per second can be
> acquired and then released with a system of 2-3 processors?
We haven't done any measurements.
> In the case that the processor requesting the lock is on the lock server
> processor,
When a lock is requested on the master node, there's no communication, no
messages, it's all local processing. No one else in the cluster even
knows about the lock.
> and in the case that the processor requesting the lock is not
> on the lock server processor... (processor in this case is the system
> that processes the lock operations). Assuming all locks are
> uncontended...
OK, we'll try to do some measurements at some point.
Thanks,
Dave
On Tuesday 26 April 2005 21:50, Steven Dake wrote:
> Daniel
>
> The issue is virtual synchrony for dlm, not virtual synchrony for
> synchronization of your block device work.
Honestly, at 250 uSec/message you don't have a hope of convincing anybody to
use virtual synchrony in a dlm for anything other than slow path recovery.
And even the performance requirements for recovery are more stringent than
you would think.
> However, since it appears you would like to address it...
Actually, I just pointed out a practical opportunity for you to demonstrate
the advantages of virtual synchrony in this context.
> While we could talk about your
> particular design, virtual synchrony can synchronize at near wire speed
> for larger messages with encryption.
Maybe so, but the short message case is vastly more important. And you would
actually have to work at it to fall much below wire speed on long messages.
> In this case, there would be
> benefit to synchronizing larger blocks of data at once, instead of
> individual (512 byte?) blocks of messages.
That is exactly what I do, of course. More specifically, I let Linux do it
for me. My average synchronization message size is 20 bytes including
header. I also have very low latency because the wrapper on the raw tcp is
thin. Maybe I should strip away the last layer and ride right on the
ethernet transport, to win another 10%. Not that I really need it. And not
that not really needing it means I am prepared to give any up!
> I really doubt you would see
> any performance improvement in the pure synchronization, although you
> would get encryption and authentication of messages, and you would not
> have those pesky race conditions.
What race conditions? If you can spot any unhandled races, please let me know
as soon as possible.
> So, there would be security, at little cost to performance.
Security has not yet become an issue for shared-disk filesystems, since any
node with root basically owns the shared disk. Hopefully, one day we will do
something about that[1]. Even then chances are, only the packet headers need
encrypting (or signing, more likely).
> Now on the topic of race conditions. Any system with race conditions
> will eventually (in some short interval of operation) fail. Are you
> suggesting that data is entrusted to a system with race conditions that
> result in failures in short intervals?
Surely you mean unhandled races? Please show me any in my code, and I will be
eternally indebted.
> But now back to the original point:
>
> The context we are talking about here is dlm. In the dlm case, I find
> it highly unlikely the posted patches can process 100,000 locks per
> second (between processors) as your claim would seem to suggest.
Read the code and prepare to be amazed ;-)
> As the benchmarks have not been posted, it's hard to see.
I posted some results for ddraid earlier, here and on linux-cluster. There
will be charts and graphs as I get time. Anybody who is impatient can grab
the code and make charts for themselves. (But beware: the cluster snapshot
has a nasty bottleneck in the server, due to no parallel disk IO. The
cluster raid is ok.)
> If the benchmarks
> are beyond the 15,000 locks per second that could easily be processed with
> virtual synchrony with an average speed processor, then please, correct
> me.
Consider yourself corrected.
> Post dlm benchmarks Daniel...
I'm dancing as fast as I can ;-)
[1] I already lost a rather expensive bet re cluster security becoming a
checkbox item within the past year.
Regards,
Daniel
One thing I noticed (besides some trailing whitespace):
On 26/04/05 00:58 +0800, David Teigland wrote:
> + if (!r->res_lvbptr)
> + r->res_lvbptr = allocate_lvb(r->res_ls);
> +
> + if (!r->res_lvbptr)
> + return;
This suggests allocate_lvb can fail.
> +
> + memcpy(r->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
...
However...
> +
> + if (!r->res_lvbptr)
> + r->res_lvbptr = allocate_lvb(r->res_ls);
> +
> + memcpy(r->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
So... can it fail?
Domen
On Wed, Apr 27, 2005 at 02:33:40PM +0200, Domen Puncer wrote:
> > + if (!r->res_lvbptr)
> > + r->res_lvbptr = allocate_lvb(r->res_ls);
> > +
> > + memcpy(r->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);
>
> So... can it fail?
Yes, fixed now, thanks
Dave
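
The fix presumably ends up looking something like the following (a sketch of
the obvious NULL check, not the actual follow-up patch):

	if (!r->res_lvbptr)
		r->res_lvbptr = allocate_lvb(r->res_ls);
	if (!r->res_lvbptr)
		/* allocation failed; skip the LVB copy rather than oops */
		return;

	memcpy(r->res_lvbptr, lkb->lkb_lvbptr, DLM_LVB_LEN);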
On 2005-04-27T11:02:17, David Teigland <[email protected]> wrote:
Let me chime in here, because defining the properties of the membership
events delivered to the DLM is really important to figure out if/how it
can be integrated with other stacks.
> > In this case the order of lock messages with the membership changes is
> > important.
> I think this might help clarify: no membership change is applied to the
> lockspace on any nodes until the lockspace has first been suspended on
> all. Suspending means no locking activity is processed. The lockspace on
> all nodes is then told the new membership and does recovery. Locking is
> then resumed.
So in effect, the delivery of the suspend/membership distribution/resume
events are three cluster-wide barriers?
I can see how that simplifies the recovery algorithm.
And, I assume that the delivery of a "node down" membership event
implies that said node also has been fenced.
So we can't deliver it raw membership events. Noted.
> I know you're more familiar with those details than I am. What I keep
> trying to explain is that the dlm is in a different, simpler category.
Agreed. This is something I noticed when I looked at how the DLM fits
into the global cluster resource management architecture, too.
For example, if you talk to Stephen ;), you'll be told that every
cluster resource is essentially a lock. But, our resources have complex
dependencies, start/stop ordering etc; a DLM which tried to map these
would blow up completely.
So, we have the "top-level" "lock manager", our CRM, which manages these
complex "locks". However, it's also worth noting that there's rather few
of them to manage, and they don't change very often.
Now, the DLM has simpler locking semantics, but it manages magnitudes
more of them, and faster so.
If you want to think about this in terms of locking hierarchy, it's the
high-level feature rich sophisticated aka bloated lock manager which
controls the "lower level" faster and more scalable "sublockspace" and
coordinates it in terms of the other complex objects (like fencing,
applications, filesystems etc).
Just some food for thought how this all fits together rather neatly.
Sincerely,
Lars Marowsky-Brée <[email protected]>
--
High Availability & Clustering
SUSE Labs, Research and Development
SUSE LINUX Products GmbH - A Novell Business
On Wed, Apr 27, 2005 at 03:41:42PM +0200, Lars Marowsky-Bree wrote:
> So in effect, the delivery of the suspend/membership distribution/resume
> events are three cluster-wide barriers?
>
> I can see how that simplifies the recovery algorithm.
Correct. I actually consider it two external barriers: the first after
the lockspace has been suspended, the second after lockspace recovery is
completed.
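
In pseudo-code, the ordering being described is roughly as follows (a sketch of
the sequence only; every function name here except dlm_recover_waiters_pre/post
is hypothetical, and the real work is spread across dlm_recoverd and the
recovery paths in the patch):

static void lockspace_recovery_sketch(struct dlm_ls *ls)
{
	suspend_locking(ls);             /* stop processing lock requests */
	wait_all_nodes_suspended(ls);    /* external barrier 1 */

	apply_new_membership(ls);        /* same member list on every node */
	dlm_recover_waiters_pre(ls);     /* fake/flag replies from dead nodes */
	rebuild_masters_and_locks(ls);   /* remaster rsbs, rebuild lkbs */
	dlm_recover_waiters_post(ls);    /* resend flagged requests */

	wait_all_nodes_recovered(ls);    /* external barrier 2 */
	resume_locking(ls);
}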
> And, I assume that the delivery of a "node down" membership event
> implies that said node also has been fenced.
Typically it does if you're combining the dlm with something that requires
fencing (like a file system). Fencing isn't relevant to the dlm itself,
though, since the dlm software isn't touching any storage.
> So we can't deliver it raw membership events. Noted.
That's right, it requires more intelligence on the part of the external
management system in userspace.
> If you want to think about this in terms of locking hierarchy, it's the
> high-level feature rich sophisticated aka bloated lock manager which
> controls the "lower level" faster and more scalable "sublockspace" and
> coordinates it in terms of the other complex objects (like fencing,
> applications, filesystems etc).
>
> Just some food for thought how this all fits together rather neatly.
Interesting, and sounds correct. I must admit that using the word "lock"
to describe these CRM-level inter-dependent objects is new to me.
Dave
On Wednesday 27 April 2005 09:41, Lars Marowsky-Bree wrote:
> ...I assume that the delivery of a "node down" membership event
> implies that said node also has been fenced.
>
> So we can't deliver it raw membership events. Noted.
Just to pick a nit: there is no way to be sure a membership event might not
still be on the way to the dead node, however the rest of the cluster knows
the node is dead and can ignore it, in theory. (In practice, only (g)dlm and
gfs are well glued into the cman membership protocol, and other components,
e.g., cluster block devices and applications, need to be looked at with
squinty eyes.)
Regards,
Daniel
On Wednesday 27 April 2005 09:41, Lars Marowsky-Bree wrote:
> If you want to think about this in terms of locking hierarchy, it's the
> high-level feature rich sophisticated aka bloated lock manager which
> controls the "lower level" faster and more scalable "sublockspace" and
> coordinates it in terms of the other complex objects (like fencing,
> applications, filesystems etc).
>
> Just some food for thought how this all fits together rather neatly.
It's actually the membership system that glues it all together. The dlm is
just another service.
Regards,
Daniel
On 2005-04-27T22:52:55, Daniel Phillips <[email protected]> wrote:
> > So we can't deliver it raw membership events. Noted.
>
> Just to pick a nit: there is no way to be sure a membership event might not
> still be on the way to the dead node, however the rest of the cluster knows
> the node is dead and can ignore it, in theory. (In practice, only (g)dlm and
> gfs are well glued into the cman membership protocol, and other components,
> e.g., cluster block devices and applications, need to be looked at with
> squinty eyes.)
I'm sorry, I don't get what you are saying here. Could you please
clarify?
"Membership even on the way to the dead node"? ie, you mean that the
(now dead) node hasn't acknowledged a previous membership which still
included it, because it died in between? Well, sure, membership is never
certain at all; it's always in transition, essentially, because we can
only detect faults some time after the fact.
(It'd be cool if we could mandate nodes to pre-announce failures by a
couple of seconds, alas I think that's a feature you'll only find in an
OSDL requirement document, rated as "prio 1" ;-)
I also don't understand what you're saying in the second part. How are
gdlm/gfs "well glued" into the CMAN membership protocol, and what are we
looking for when we turn our squinty eyes to applications...?
Sincerely,
Lars Marowsky-Brée <[email protected]>
--
High Availability & Clustering
SUSE Labs, Research and Development
SUSE LINUX Products GmbH - A Novell Business
On 2005-04-27T22:26:38, David Teigland <[email protected]> wrote:
> > So in effect, the delivery of the suspend/membership distribution/resume
> > events are three cluster-wide barriers?
> >
> > I can see how that simplifies the recovery algorithm.
> Correct. I actually consider it two external barriers: the first after
> the lockspace has been suspended, the second after lockspace recovery is
> completed.
Hmmm. This is actually slightly different from what I thought you were
doing.
Actually, is that multi-phase procedure really necessary? Given that
failures can occur at any point even then - during each barrier
and in between - couldn't we just as well deliver the membership event
directly, and proceed with recovery as if that were the "final" state?
We always have to deal with nodes failing, rejoining etc at any given
step and eventually restarting the algorithm if needed.
(Well, a node joining you could serialize and only do that after you
have completed the recovery steps for the event before. But a failing
node during recovery seems to imply the need to restart the algorithm
anyway, and that's just what would happen if a new membership event was
delivered.)
Does it really simplify the recovery, or does it just obscure the
complexity, ie, snake oil?
That said, the model can be mapped as-is quite directly to how
heartbeat 2.x handles resources which can be active more than once. The
first barrier would be the "we lost a node and are about to fence one of
your incarnations" (or "a node joined and we're about to start one"),
and the second one would be the "we fenced node X" or "we started you on
node X".
However, there's one property here: We assume that those notifications
_can never fail_; they are delivered (and guaranteed to be before we
commence the main operation), and that's it. Can a node in your model
choose to reject the suspend/resume operation?
> > And, I assume that the delivery of a "node down" membership event
> > implies that said node also has been fenced.
> Typically it does if you're combining the dlm with something that requires
> fencing (like a file system). Fencing isn't relevant to the dlm itself,
> though, since the dlm software isn't touching any storage.
Ack. Good point, I was thinking too much in terms of GFS/OCFS2 here ;-)
> > Just some food for thought how this all fits together rather neatly.
> Interesting, and sounds correct. I must admit that using the word "lock"
> to describe these CRM-level inter-dependent objects is new to me.
It's locks with dependencies, instead of one "lock" per resource group.
That's been mulling on the back of my mind ever since Stephen gave me
the DLM-centric-clustering-world talking to at Linux Kongress 98, I
think ;-) By now I think the model fits.
Sincerely,
Lars Marowsky-Brée <[email protected]>
--
High Availability & Clustering
SUSE Labs, Research and Development
SUSE LINUX Products GmbH - A Novell Business
On 2005-04-28T02:49:04, Daniel Phillips <[email protected]> wrote:
> > Just some food for thought how this all fits together rather
> > neatly.
>
> It's actually the membership system that glues it all together. The
> dlm is just another service.
Membership is one of the lowest-level and most privileged inputs to the
whole picture, of course.
However, "membership" is already a pretty broad term, and one must
clearly state what one is talking about. So we're clearly focused on
node membership here, which is a special case of group membership; the
top-level, sort of.
Then every node has its local view of node membership, constructed
typically from observing node heartbeats.
Then the nodes communicate to reach consensus on the coordinated
membership, which will usually be a set of nodes with full N:N
connectivity (via the cluster messaging mechanism); and they'll also
usually aim to identify the largest possible set.
Eventually, there'll be a membership view which also implies certain
shared data integrity guarantees if appropriate (ie, fencing in case a
node didn't go down cleanly, and granting access on a clean join).
All of these steps but the last usually happen completely internally to
the membership layer; the last one already requires coordination, because
the fencing layer itself might need recovery before it can fence
something after a node failure.
And then there's quorum computation.
Certainly you could also try looking at it from a membership-centric
angle, but the piece which coordinates the recovery of the various
components, making sure the right kind of membership events are
delivered in the proper order and that errors during component recovery
are appropriately handled, is, I think, pretty much distinct from the
"membership" and is a higher-level component.
So I'm not sure I'd buy "the membership is what glues it all together"
on eBay even for a low starting bid.
Sincerely,
Lars Marowsky-Brée <[email protected]>
--
High Availability & Clustering
SUSE Labs, Research and Development
SUSE LINUX Products GmbH - A Novell Business
On Thu, 2005-04-28 at 05:33, Lars Marowsky-Bree wrote:
> On 2005-04-27T22:26:38, David Teigland <[email protected]> wrote:
<snip>
>
> > > And, I assume that the delivery of a "node down" membership event
> > > implies that said node also has been fenced.
> > Typically it does if you're combining the dlm with something that requires
> > fencing (like a file system). Fencing isn't relevant to the dlm itself,
> > though, since the dlm software isn't touching any storage.
>
> Ack. Good point, I was thinking too much in terms of GFS/OCFS2 here ;-)
>
Since a DLM is a distributed lock manager, its usage is entirely for
locking some shared resource (might not be storage, might be shared
state, shared data, etc). If the DLM can grant a lock, but not
guarantee that other nodes (including the ones that have been kicked
out of the cluster membership) do not have a conflicting DLM lock, then
any applications that depend on the DLM for protection/coordination
will be in trouble. Doesn't the GFS code depend on the DLM not being
recovered until after fencing of dead nodes?
Is there an existing DLM that does not depend on fencing? (you said
yours was modeled after the VMS DLM, didn't they depend on fencing?)
How would an application use a DLM that does not depend on fencing?
Thanks,
Daniel
On 2005-04-28T09:39:22, Daniel McNeil <[email protected]> wrote:
> Since a DLM is a distributed lock manager, its usage is entirely for
> locking some shared resource (might not be storage, might be shared
> state, shared data, etc). If the DLM can grant a lock, but not
> guarantee that other nodes (including the ones that have been kicked
> out of the cluster membership) do not have a conflicting DLM lock, then
> any applications that depend on the DLM for protection/coordination
> be in trouble. Doesn't the GFS code depend on the DLM not being
> recovered until after fencing of dead nodes?
It makes a whole lot of sense to combine a DLM with (appropriate)
fencing so that the shared resources are protected. I understood David's
comment to rather imply that fencing is assumed to happen outside the
DLM's world in a different component; ie more of a comment on sane
modularization instead of sane real-world configuration.
Sincerely,
Lars Marowsky-Brée <[email protected]>
--
High Availability & Clustering
SUSE Labs, Research and Development
SUSE LINUX Products GmbH - A Novell Business
On Thursday 28 April 2005 08:37, Lars Marowsky-Bree wrote:
> On 2005-04-27T22:52:55, Daniel Phillips <[email protected]> wrote:
> > > So we can't deliver it raw membership events. Noted.
> >
> > Just to pick a nit: there is no way to be sure a membership event might
> > not still be on the way to the dead node, however the rest of the cluster
> > knows the node is dead and can ignore it, in theory. (In practice, only
> > (g)dlm and gfs are well glued into the cman membership protocol, and
> > other components, e.g., cluster block devices and applications, need to
> > be looked at with squinty eyes.)
>
> I'm sorry, I don't get what you are saying here. Could you please
> clarify?
>
> "Membership even on the way to the dead node"? ie, you mean that the
> (now dead) node hasn't acknowledged a previous membership which still
> included it, because it died inbetween? Well, sure, membership is never
> certain at all; it's always in transition, essentially, because we can
> only detect faults some time after the fact.
Exactly, and that is what the barriers are for. I like the concept of
barriers a whole lot. We should put this interface on a pedestal and really
dig into how to use it, or even better, how to optimize it.
But for now, as I understand it, a cluster client's view of the cluster world
is entirely via cman events, which include things like other nodes joining
and leaving service groups. (Service groups are another interface we need to
put on a pedestal, and start working on, because right now it's a clean idea,
not thought all the way through.)
> (It'd be cool if we could mandate nodes to pre-announce failures by a
> couple of seconds, alas I think that's a feature you'll only find in an
> OSDL requirement document, rated as "prio 1" ;-)
Heh, I generally think about failing over in less than a second, preferably
much, much less. Maybe you just have to scale your heuristic a little?
> I also don't understand what you're saying in the second part. How are
> gdlm/gfs "well glued" into the CMAN membership protocol, and what are we
> looking for when we turn our squinty eyes to applications...?
Gdlm and gfs are well-glued because Dave and Patrick have been working on it
for years. Other components barely know about the interfaces, let alone use
them correctly. In the end _every component_ of the cluster stack has to do
the dance correctly on every node. We've really only just started on that
path. Hopefully we'll be able to move down it much more quickly now that the
code is coming out of the cathedral.
Regards,
Daniel
On Thursday 28 April 2005 08:55, Lars Marowsky-Bree wrote:
> On 2005-04-28T02:49:04, Daniel Phillips <[email protected]> wrote:
> > > Just some food for thought how this all fits together rather
> > > neatly.
> >
> > It's actually the membership system that glues it all together. The
> > dlm is just another service.
>
> Membership is one of the lowest level and high privileged inputs to the
> whole picture, of course.
>
> However, "membership" is already a pretty broad term, and one must
> clearly state what one is talking about. So we're clearly focused on
> node membership here, which is a special case of group membership; the
> top-level, sort of.
Indeed, you caught me being imprecise. By "membership system" I mean cman,
which includes basic cluster membership, service groups, socket interface,
event messages, PF_CLUSTER, and a few other odds and ends. Really, it _is_
our cluster infrastructure. And it has warts, some really giant ones. At
least it did the last time I used it. There is apparently a new,
much-improved version I haven't seen yet. I have heard that the re-rolled
cman is in cvs somewhere. Patrick? Dave?
> Then every node has it's local view of node membership, constructed
> typically from observing node heartbeats.
Actually, it is constructed from observing cman events over the socket.
I see that some fantastical /sys/ filesystem has wormed itself into the
machinery. I need to check that this hasn't compromised the basic beauty of
the event messaging model.
Fencing is a whole nuther issue. It's sort of unclear how it is actually
supposed to work, and judging from the number of complaints I see about it on
mailing lists, it doesn't work very well. We need to take a good look at
that.
> Then the nodes communicate to reach concensus on the coordinated
> membership, which will usually be a set of nodes with full N:N
> connectivity (via the cluster messaging mechanism); and they'll also
> usually aim to identify the largest possible set.
Yes. "Reaching consensus" is signalled to each node by cman sending a
"finish" event, as in "finish recovering". (To be sure, this is misleading
terminology. We should kill it before it has a chance to reproduce.)
> Eventually, there'll be a membership view which also implies certain
> shared data integrity guarantees if appropriate (ie, fencing in case a
> node didn't go down cleanly, and granting access on a clean join).
Each node's membership view is simply the cumulative state implied by the cman
events. Necessarily, this view will suffer some skew across the cluster.
All cluster algorithms _must_ recognize and accommodate that. This is where
barriers come into play, though that mechanism is buried inside cman, and
each node's view of barrier operations consists of cman events. (The way
this is actually implemented smells a little scary to me, but it seems to
work ok for small numbers of nodes.)
> These steps but the last one usually happen completely internal to the
> membership layer; the last one requires coordination already, because
> the fencing layer itself might need recovery before it can fence
> something after a node failure.
Right, we need to do a lot more work on the fencing interface. For example, I
haven't even begun to analyze it from the point of view of memory inversion
deadlock. My spider sense tells me there is some of that in there. Fencing
is currently done via bash scripts, which alone sucks nearly beyond belief.
> And then there's quorum computation.
Aha! There is a beautiful solution in the case of ddraid, i.e., any cluster
with (m of n) redundant shared disks resident on the nodes themselves:
http://sourceware.org/cluster/ddraid/
For ddraid order 1 and higher, there is no quorum ambiguity at all, because
you _require_ a quorum of data nodes in order for any node to access the
cluster filesystem data. For example, for a five node ddraid distributed
data cluster, you need four data nodes active or the cluster will only be
able to sit there stupidly doing nothing. Four data nodes is therefore the
quorum group ordained by God. Non-data nodes can come and go as they please,
without ever worrying about split brain or other nasty quorum-related
diseases.
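
As a worked example of that arithmetic (on the assumption that "order k"
here means the array survives the loss of up to k data nodes):

/* Hypothetical sketch: minimum data nodes that must be present before
 * any node may access the data of an order-k ddraid array of n nodes. */
int ddraid_required_nodes(int data_nodes, int order)
{
        return data_nodes - order;      /* e.g. 5 data nodes, order 1 -> 4 */
}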
> Certainly you could also try looking at it from a membership-centric
> angle, but the piece which coordinates the recovery of the various
> components which makes sure the right kind of membership events are
> delivered in the proper order, and errors during component recovery are
> appropriately handled, is, I think, pretty much distinct from the
> "membership" and a higher level component.
Sorry for the red herring. Where I wrote "membership" I meant to write
"cman", that is, cluster management.
> So I'm not sure I'd buy "the membership is what glues it all together"
> on eBay even for a low starting bid.
Though I'm not sure the concept is for sale, your buy-in will be appreciated
nonetheless, no matter how many limp jokes we need to put up with on the way
there.
Regards,
Daniel
On Thu, Apr 28, 2005 at 08:26:35PM -0400, Daniel Phillips wrote:
> Indeed, you caught me being imprecise. By "membership system" I mean cman,
> which includes basic cluster membership, service groups, socket interface,
> event messages, PF_CLUSTER, and a few other odds and ends. Really, it _is_
> our cluster infrastructure. And it has warts, some really giant ones. At
> least it did the last time I used it. There is apparently a new,
> much-improved version I haven't seen yet. I have heard that the re-rolled
> cman is in cvs somewhere. Patrick? Dave?
Daniel, please stop commenting on all this. Nothing you have said is
remotely correct. Anyone who's bothered to listen to this stuff, please
erase everything you've heard or you'll be dreadfully misled. And that's
as kind as I can put it.
Dave
On Thursday 28 April 2005 22:52, David Teigland wrote:
> On Thu, Apr 28, 2005 at 08:26:35PM -0400, Daniel Phillips wrote:
> > Indeed, you caught me being imprecise. By "membership system" I mean
> > cman, which includes basic cluster membership, service groups, socket
> > interface, event messages, PF_CLUSTER, and a few other odds and ends.
> > Really, it _is_ our cluster infrastructure. And it has warts, some
> > really giant ones. At least it did the last time I used it. There is
> > apparently a new, much-improved version I haven't seen yet. I have heard
> > that the re-rolled cman is in cvs somewhere. Patrick? Dave?
>
> ...Nothing you have said is remotely correct.
Please provide corrections, if you wish Dave.
Regards,
Daniel
On Thu, Apr 28, 2005 at 09:39:22AM -0700, Daniel McNeil wrote:
> Since a DLM is a distributed lock manager, its usage is entirely for
> locking some shared resource (might not be storage, might be shared
> state, shared data, etc). If the DLM can grant a lock, but not
> guarantee that other nodes (including the ones that have been kicked
> out of the cluster membership) do not have a conflicting DLM lock, then
> any applications that depend on the DLM for protection/coordination
> be in trouble. Doesn't the GFS code depend on the DLM not being
> recovered until after fencing of dead nodes?
No, it doesn't. GFS depends on _GFS_ not being recovered until failed
nodes are fenced. Recovering GFS is an entirely different thing from
recovering the DLM. GFS actually writes to shared storage.
> Is there a existing DLM that does not depend on fencing? (you said
> yours was modeled after the VMS DLM, didn't they depend on fencing?)
I've never heard of a DLM that depends on fencing.
> How would an application use a DLM that does not depend on fencing?
Go back to the definition of i/o fencing: i/o fencing simply prevents a
machine from modifying shared storage. This is often done by disabling
the victim's connection to the shared storage. Notice shared storage is
part of the definition, without it, fencing is irrelevant.
Fencing is not mainly about the node, it's mainly about the storage. When
a fencing victim is disconnected from storage, it usually means its SAN
port has been turned off on a switch. Notice that this doesn't touch or
affect the node at all -- it simply blocks any i/o from the node before it
reaches the storage.
Any distributed app using the DLM that writes only to its own local
storage will not need fencing, there's nothing to fence.
Dave
On Thursday 28 April 2005 12:45, Lars Marowsky-Bree wrote:
> On 2005-04-28T09:39:22, Daniel McNeil <[email protected]> wrote:
> > Since a DLM is a distributed lock manager, its usage is entirely for
> > locking some shared resource (might not be storage, might be shared
> > state, shared data, etc). If the DLM can grant a lock, but not
> > guarantee that other nodes (including the ones that have been kicked
> > out of the cluster membership) do not have a conflicting DLM lock, then
> > any applications that depend on the DLM for protection/coordination
> > be in trouble. Doesn't the GFS code depend on the DLM not being
> > recovered until after fencing of dead nodes?
>
> It makes a whole lot of sense to combine a DLM with (appropriate)
> fencing so that the shared resources are protected. I understood David's
> comment to rather imply that fencing is assumed to happen outside the
> DLM's world in a different component; ie more of a comment on sane
> modularization instead of sane real-world configuration.
But just because fencing is supposed to happen in an external component,
we can't wave our hands at it and skip the analysis. We _must_ identify the
fencing assumptions and trace the fencing paths with respect to every
recovery algorithm in every cluster component, including the dlm.
I suspect that when we do get around to properly scrutinizing fencing
requirements of specific recovery algorithms, we will find that the fencing
system currently on offer for gfs needs a little work.
Regards,
Daniel
On Thu, 2005-04-28 at 21:01, David Teigland wrote:
> On Thu, Apr 28, 2005 at 09:39:22AM -0700, Daniel McNeil wrote:
>
> > Since a DLM is a distributed lock manager, its usage is entirely for
> > locking some shared resource (might not be storage, might be shared
> > state, shared data, etc). If the DLM can grant a lock, but not
> > guarantee that other nodes (including the ones that have been kicked
> > out of the cluster membership) do not have a conflicting DLM lock, then
> > any applications that depend on the DLM for protection/coordination
> > be in trouble. Doesn't the GFS code depend on the DLM not being
> > recovered until after fencing of dead nodes?
>
> No, it doesn't. GFS depends on _GFS_ not being recovered until failed
> nodes are fenced. Recovering GFS is an entirely different thing from
> recovering the DLM. GFS actually writes to shared storage.
>
> > Is there a existing DLM that does not depend on fencing? (you said
> > yours was modeled after the VMS DLM, didn't they depend on fencing?)
>
> I've never heard of a DLM that depends on fencing.
>
> > How would an application use a DLM that does not depend on fencing?
>
> Go back to the definition of i/o fencing: i/o fencing simply prevents a
> machine from modifying shared storage. This is often done by disabling
> the victim's connection to the shared storage. Notice shared storage is
> part of the definition, without it, fencing is irrelevant.
>
> Fencing is not mainly about the node, it's mainly about the storage. When
> a fencing victim is disconnected from storage, it usually means its SAN
> port has been turned off on a switch. Notice that this doesn't touch or
> effect the node at all -- it simply blocks any i/o from the node before it
> reaches the storage.
>
> Any distributed app using the DLM that writes only to its own local
> storage will not need fencing, there's nothing to fence.
Dave,
I have always thought about clustering based on the VMS design.
I googled around and found some OpenVMS slides that described it like
this -- text only :)
------
The Connection Manager is code within OpenVMS that coordinates
cluster membership across events such as:
- Forming a cluster initially
- Allowing a node to join the cluster
- Cleaning up after a node which has failed or left the cluster
all the while protecting against uncoordinated access to shared
resources such as disks
------
The Connection Manager enforces the Quorum Scheme to ensure that all
access to shared resources is coordinated
Basic idea: A majority of the potential cluster systems must be
present in the cluster before any access to shared resources
(i.e. disks) is allowed
------
If a cluster member is not part of a cluster with quorum, OpenVMS
keeps it from doing any harm by:
- Putting all disks into Mount Verify state, thus stalling all
disk I/O operations
- Requiring that all processes can only be scheduled to run on
a CPU with the QUORUM capability bit set
- Clearing the QUORUM capability bit on all CPUs in the system,
thus preventing any process from being scheduled to run on a
CPU and doing any work
------
So the Connection Manager controlled membership, quorum, and fencing
(stalling all disk i/o, etc). AFAIR, the DLM would get a membership
event and do recovery after quorum and fencing. From the description
above, nodes not part of the membership with quorum could not do
anything.
I have always thought that a distributed application could use the DLM
alone to protect access to shared storage. The DLM would coordinate
access between the distributed application running on the nodes
in the cluster AND DLM locks would not be recovered and possibly
granted to applications running on the nodes still in the membership
until after nodes that are no longer a member of the cluster are safely
prevented from doing any harm.
So, when I said that the DLM was dependent on fencing, I was thinking
of the membership, quorum, prevention of harm (stalling of i/o to
prevent corrupting shared resource) as described above.
So, if an application was using your DLM to protect shared storage,
I think you are saying it is possible the DLM lock could be granted
before the node that was previously holding the lock and now is not
part of the cluster is fenced. Is that right?
If it is, what prevents GFS from getting a DLM lock granted and writing
to the shared storage before the node that previously had it is fenced?
Daniel
PS if an application is writing to local storage, what does it need a
DLM for?
On Fri, Apr 29, 2005 at 03:58:29PM -0700, Daniel McNeil wrote:
> So the Connection Manager controlled membership, quorum, and fencing
> (stalling all disk i/o, etc). AFAIR, the DLM would get a membership
> event and do recovery after quorum and fencing.
Right
> From the description above, nodes not part of the membership with quorum
> could not do anything.
Right
> I have always thought that an distributed application could use the DLM
> alone to protect access to shared storage. The DLM would coordinate
> access between the distributed application running on the nodes
> in the cluster AND DLM locks would not be recovered and possibly
> granted to applications running on the nodes still in the membership
> until after nodes that are no longer a member of the cluster are safely
> prevented from doing any harm.
>
> So, when I said that the DLM was dependent on fencing, I was thinking
> of the membership, quorum, prevention of harm (stalling of i/o to
> prevent corrupting shared resource) as described above.
>
> So, if an application was using your DLM to protect shared storage,
> I think you are saying it possible the DLM lock could be granted
> before the node that was previously holding the lock and now is not
> part of the cluster is fenced. Is that right?
It depends on how the clustering infrastructure coordinates the various
aspects of recovery. The dlm doesn't specify how that's done because
there's no universal answer. If it's important to your application that
fencing happens before the dlm grants locks from failed nodes, then you
need to be sure that's how the infrastructure coordinates recovery of
fencing, the dlm and your application.
I can talk about GFS's requirements on how fencing and dlm recovery
happen, but other apps will be different.
GFS requires that a gfs fs has been "suspended" (told that recovery will
be happening) on all nodes _before_ the dlm grants locks from failed nodes
(i.e. before the dlm starts recovery). Because the dlm grants locks
previously held by failed nodes when its recovery completes, gfs has a
much stricter standard for using dlm locks while it's in this suspended
state: the gfs recovery process can acquire new locks, but no one else.
That leaves GFS's fencing requirement. GFS requires that a failed node be
fenced prior to gfs being told to begin recovery for that node (which
involves recovering the journal of the failed node.)
So for gfs, it's important that fencing and dlm recovery both happen
before gfs recovery, but the order of fencing and dlm recovery (with
respect to each other) doesn't matter. As I said, the dlm doesn't require
that fencing happen first, but as you suggest, an application may want it
that way.
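
A minimal sketch of just those ordering constraints; every helper name below
is a hypothetical placeholder for the real fencing/recovery infrastructure,
not an actual API:

/* Hypothetical sketch only: illustrates the ordering GFS needs, not how
 * the recovery manager is really implemented. */

void suspend_gfs_everywhere(void);
void fence_node(int nodeid);
void recover_dlm_lockspaces(void);
void replay_gfs_journal(int nodeid);
void resume_gfs_everywhere(void);

void recover_after_failure(int dead_nodeid)
{
        suspend_gfs_everywhere();       /* must happen before dlm recovery */

        /* These two may run in either order with respect to each other. */
        fence_node(dead_nodeid);
        recover_dlm_lockspaces();       /* dlm may now grant the dead node's locks */

        /* GFS recovery only after both fencing and dlm recovery are done. */
        replay_gfs_journal(dead_nodeid);
        resume_gfs_everywhere();
}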
> PS if an application is writing to local storage, what does it need a
> DLM for?
My experience is pretty limited, but I suspect there are distributed
applications, requiring synchronization, that don't write to shared
storage.
Thanks for the info and good questions
Dave
On Friday 29 April 2005 18:58, Daniel McNeil wrote:
> I have always thought that an distributed application could use the DLM
> alone to protect access to shared storage. The DLM would coordinate
> access between the distributed application running on the nodes
> in the cluster AND DLM locks would not be recovered and possibly
> granted to applications running on the nodes still in the membership
> until after nodes that are no longer a member of the cluster are safely
> prevented from doing any harm.
As you know, this is how I currently determine ownership of such resources as
cluster snapshot metadata and ddraid dirty log. I find the approach
distinctly unsatisfactory. The (g)dlm is rather verbose to use, particularly
taking into account the need to have two different state machine paths,
depending on whether a lock happens to be mastered locally, and the need to
coordinate a number of loosely coupled elements: lock status blocks, asts,
the calls themselves. The result is quite a _long_ and opaque program to do
a very simple thing. It is full of long chains of reasoning, connected with
the behavior of lvbs, asynchronous lock event flow, error behavior, myriad
other details. This just _feels wrong_ and the code looks ugly, no matter
how much I try to dress it up.
And indeed, instinct turns out to be correct: there is a far simpler way to
handle this: let the oldest member of the cluster decide who owns the
metadata resources. This is simple, unambiguous, fast, efficient, easy to
implement and obviously correct. And it has nothing to do with the dlm, it
relies only on cman. Or it would, if cman supported a stable ordering of
cluster node longevity, which I do not think it does. (Please correct me if
I'm wrong, Patrick.)
So this is easy: fix cman so that it does support a stable ordering of cluster
node membership age, if it does not already.
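
For illustration, here is how trivial the consumer side would be if cman did
expose its member list in stable join order; the struct and the ordering
assumption below are hypothetical, not an existing cman interface:

/* Hypothetical sketch only: assumes the membership layer hands every node
 * the same member list sorted by join age, oldest first. */

struct member {
        int nodeid;
        int is_up;
};

int pick_resource_owner(const struct member *members, int count)
{
        int i;

        /* Every node runs the same scan over the same ordered list and
         * therefore arrives at the same answer without any messaging. */
        for (i = 0; i < count; i++)
                if (members[i].is_up)
                        return members[i].nodeid;

        return -1;      /* no live members: nothing to own */
}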
> So, when I said that the DLM was dependent on fencing, I was thinking
> of the membership, quorum, prevention of harm (stalling of i/o to
> prevent corrupting shared resource) as described above.
>
> So, if an application was using your DLM to protect shared storage,
> I think you are saying it possible the DLM lock could be granted
> before the node that was previously holding the lock and now is not
> part of the cluster is fenced. Is that right?
>
> If it is, what prevents GFS from getting a DLM lock granted and writing
> to the shared storage before the node that previously had it is fenced?
In my opinion, using the dlm to protect the shared storage resource
constitutes tackling the problem far too high up on the food chain.
> PS if an application is writing to local storage, what does it need a
> DLM for?
Good instinct. In fact, as I've said before, you don't necessarily need a dlm
in a cluster application at all. What you need is _global synchronization_,
however that is accomplished. For example, I have found it simpler and more
efficient to use network messaging for the cluster applications I've tackled
so far. This suggests to me that the dlm is going to end up pretty much as
a service needed only by a cfs, and not much else. The corollary of that is,
we should concentrate on making the dlm work well for the cfs, and not get
too wrapped up in trying to make it solve every global synchronization
problem in the world.
Regards,
Daniel
On 2005-04-30T05:09:24, Daniel Phillips <[email protected]> wrote:
> And indeed, instinct turns out to be correct: there is a far simpler
> way to handle this: let the oldest member of the cluster decide who
> owns the metadata resources. This is simple, unambiguous, fast,
> efficient, easy to implement and obviously correct. And it has
> nothing to do with the dlm, it relies only on cman. Or it would, if
> cman supported a stable ordering of cluster node longevity, which I do
> not think it does. (Please correct me if I'm wrong, Patrick.)
>
> So this is easy: fix cman so that it does support a stable ordering of
> cluster node membership age, if it does not already.
This is actually a property the Consensus Cluster Membership layer in
heartbeat provides.
Also, this was how I originally intended the master to coordinate
resource recovery: rely on the global ordering of nodes provided by the
CCM layer and just pick the first one. So, I'm with you on the general
direction ;-)
However, further thinking has shown this not to be a really good idea.
The principle is sound, but the oldest node might not be running the
newest version of the software, and the fact that a node shows up in the
node membership doesn't imply it's running the piece of software in
question at all (in the absence of group membership/services). And,
after a merge, one of the two (or more) masters had better stop doing what
it's doing right now and hand over to the new one, so there's the need
for a barrier.
So, CRM now employs a voting algorithm as part of the application level
join protocol, which is designed to make sure that a) only nodes which
actually run our piece of software participate, and b) the node with the
most recent software release wins. (Plus some other parameters, ie, if
several nodes run that software, the "age", intactness and uptodateness
of local configuration copies etc also play in.)
Property b) is particularly interesting: it simplifies version
compatibility a whole lot, because we only need to implement backwards
compatibility for the master-to-client (as for that particular
transition) code path, not for the client-to-master path.
While we obviously don't have much experience with backwards
compatibility yet (I was thinking about introducing a deliberate
non-backwards compatible change somewhere in the development series just
to test that ;-), the other properties have been very good so far.
> > If it is, what prevents GFS from getting a DLM lock granted and writing
> > to the shared storage before the node that previously had it is fenced?
> In my opinion, using the dlm to protect the shared storage resource
> constitutes tackling the problem far too high up on the food chain.
Au contraire. Something pretty high up the food chain needs to decide
resource ownership / access permissions.
> > PS if an application is writing to local storage, what does it need
> > a DLM for?
> Good instinct. In fact, as I've said before, you don't necessarily
> need a dlm in a cluster application at all. What you need is _global
> synchronization_, however that is accomplished. For example, I have
> found it simpler and more efficient to use network messaging for the
> cluster applications I've tackled so far. This suggests to me that
> the dlm is going to end up pretty much as a service needed only by a
> cfs, and not much else. The corollary of that is, we should
> concentrate on making the dlm work well for the cfs, and not get too
> wrapped up in trying to make it solve every global synchronization
> problem in the world.
This isn't quite true. It _is_ true that essentially every locking /
coordination mechanism can be mapped to every other one: locks can be
implemented via barriers, barriers via locks, locks via messages, global
ordering via barriers, voting via locks, locks via voting, and then
there's all the different possible kinds of locks... And internally,
they might well share certain code paths. (Ie, a DLM electing a
transition master for lockspace recovery internally via voting, because
obviously _it_ can't use itself to pull itself out of the mess ;-)
However, for most problems, there's one approach which makes the problem
you're dealing with much easier to express, and potentially more
efficient.
A good cluster suite should offer support for barriers, locking, voting
and (group) messaging with extended virtual synchrony, so that the
problems can be addressed correctly. Compare how the kernel offers more
than one form for synchronization on SMP systems: semaphores,
spinlocks, RCU, atomic operations...
Sincerely,
Lars Marowsky-Brée <[email protected]>
--
High Availability & Clustering
SUSE Labs, Research and Development
SUSE LINUX Products GmbH - A Novell Business
On Saturday 30 April 2005 06:32, Lars Marowsky-Bree wrote:
> > > If it is, what prevents GFS from getting a DLM lock granted and writing
> > > to the shared storage before the node that previously had it is fenced?
> >
> > In my opinion, using the dlm to protect the shared storage resource
> > constitutes tackling the problem far too high up on the food chain.
>
> Au contraire. Something pretty high up the food chain needs to decide
> resource ownership / access permissions.
You confused "high up the food chain" with "being intelligent". Nothing
prevents your oldest cluster node from making complex decisions, or calling
into play other services in order to make those decisions. It could, for
example, invoke some server on some other node to make the actual decision.
The important thing is to always have an easy _starting point_ for making
decisions. I do not think that dlm-based algorithms are suitable in that
regard, because of the unavoidable code complexity just to kick off the
process. And obviously, there already is some reliable starting point or
cman would not work. So let's just expose that and have a better cluster
stack.
> > > PS if an application is writing to local storage, what does it need
> > > a DLM for?
> >
> > Good instinct. In fact, as I've said before, you don't necessarily
> > need a dlm in a cluster application at all. What you need is _global
> > synchronization_, however that is accomplished. For example, I have
> > found it simpler and more efficient to use network messaging for the
> > cluster applications I've tackled so far. This suggests to me that
> > the dlm is going to end up pretty much as a service needed only by a
> > cfs, and not much else. The corollary of that is, we should
> > concentrate on making the dlm work well for the cfs, and not get too
> > wrapped up in trying to make it solve every global synchronization
> > problem in the world.
>
> This isn't quite true. It _is_ true that essentially every locking /
> coordination mechanism can be mapped to every other one: locks can be
> implemented via barriers, barriers via locks, locks via messages, global
> ordering via barriers, voting via locks, locks via voting, and then
> there's all the different possible kinds of locks... And internally,
> they might well share certain code paths. (Ie, a DLM electing a
> transition master for lockspace recovery internally via voting, because
> obviously _it_ can't use itself to pull itself out of the mess ;-)
But note that it _can_ use the oldest cluster member as a recovery master, or
to designate a recovery master. It can, and should - there is no excuse for
making this any more complex than it needs to be.
> However, for most problems, there's one approach which makes the problem
> you're dealing with much easier to express, and potentially more
> efficient.
True. My point is that the sweet spot for cluster synchronization is, in my
experience, usually _not_ the dlm.
> A good cluster suite should offer support for barriers, locking, voting
> and (group) messaging with extended virtual synchrony, so that the
> problems can be addressed correctly. Compare how the kernel offers more
> than one form for synchronization on SMP systems: semaphores,
> spinlocks, RCU, atomic operations...
All agreed, except that voting and group messaging should be provided as
separate services in the style of plugins, on which the base cluster
infrastructure does not depend.
OK, once we have found a way to plug in voting and whiz-bang messaging, I
think we have discovered the mythical "vcs", the virtual cluster switch. It
doesn't need to be in-kernel though.
Regards,
Daniel
On 2005-04-29T04:25:24, Daniel Phillips <[email protected]> wrote:
> > It makes a whole lot of sense to combine a DLM with (appropriate)
> > fencing so that the shared resources are protected. I understood David's
> > comment to rather imply that fencing is assumed to happen outside the
> > DLM's world in a different component; ie more of a comment on sane
> > modularization instead of sane real-world configuration.
>
> But just because fencing is supposed to happen in an external component,
> we can't wave our hands at it and skip the analysis. We _must_ identify the
> fencing assumptions and trace the fencing paths with respect to every
> recovery algorithm in every cluster component, including the dlm.
"A fenced node no longer has access to any shared resource".
Is there any other assumption you have in mind?
Sincerely,
Lars Marowsky-Brée <[email protected]>
--
High Availability & Clustering
SUSE Labs, Research and Development
SUSE LINUX Products GmbH - A Novell Business
"Ignorance more frequently begets confidence than does knowledge"
 -- Charles Darwin
On 2005-04-30T07:12:46, Daniel Phillips <[email protected]> wrote:
> process. And obviously, there already is some reliable starting point or
> cman would not work. So let's just expose that and have a better cluster
> stack.
Most memberships internally construct such a fixed starting point from
voting or other 'chatty' techniques.
This is exposed by the membership (providing all nodes in the same order
on all nodes); however, the node-level membership does not necessarily
reflect the service/application-level membership. So to get it right,
you essentially have to run such an algorithm at that level again too.
True enough it would be helpful if the group membership service provided
such, but here we're at the node level.
> But note that it _can_ use the oldest cluster member as a recovery
> master, or to designate a recovery master. It can, and should - there
> is no excuse for making this any more complex than it needs to be.
The oldest node might not be running that particular service, or it
might not be healthy. To figure that out, you need to vote.
This is straying a bit from LKML issues, maybe it ought to be moved to
one of the clustering lists.
Sincerely,
Lars Marowsky-Brée <[email protected]>
--
High Availability & Clustering
SUSE Labs, Research and Development
SUSE LINUX Products GmbH - A Novell Business
"Ignorance more frequently begets confidence than does knowledge"
 -- Charles Darwin
On 2005-04-28T23:49:09, Daniel Phillips <[email protected]> wrote:
> > ...Nothing you have said is remotely correct.
> Please provide corrections, if you wish Dave.
I'd really like to see those. Because right now some parts of the
discussion seem to contradict each other (sometimes subtle, so it's hard
to point the finger at it and ask; sometimes glaringly obvious), and I
thought you both worked on the same project. ;-)
That makes it kind-of-hard to understand where you're coming from and
where you're headed. I know that people even on the same project have
different opinions, but before a review, let's have a barrier and
distributed synchronization, ok? ;-)
Sincerely,
Lars Marowsky-Brée <[email protected]>
--
High Availability & Clustering
SUSE Labs, Research and Development
SUSE LINUX Products GmbH - A Novell Business
"Ignorance more frequently begets confidence than does knowledge"
 -- Charles Darwin
On Monday 02 May 2005 16:51, Lars Marowsky-Bree wrote:
> On 2005-04-30T07:12:46, Daniel Phillips <[email protected]> wrote:
> > process. And obviously, there already is some reliable starting point or
> > cman would not work. So let's just expose that and have a better cluster
> > stack.
>
> Most memberships internally construct such a fixed starting point from
> voting or other 'chatty' techniques.
But running a whole voting algorithm from square one makes no sense at all,
because cman has already taken care of the first step. Cman just fails to
expose the result in the obvious way. (I believe this remains the case in
the current code - Patrick, could you confirm or deny please, and could we
please have a pointer to the latest incarnation of cman?).
Now, please actually take a look at one of those voting schemes and chances
are, you'll just see a perverse way of picking the lowest-numbered node. But
cman already knows which one that is, and even better, it knows the exact
order each node joined the cluster. So does every other node!
So we can just allow the oldest cluster node to supervise a full-fancy
election (if indeed anything fancy is needed) or if it is too lazy for that,
merely to designate the actual election master and then go back to whatever
it was doing. In this way, we compress dozens of lines of
hard-to-read-and-maintain boilerplate cluster code running on multiple nodes
and taking up valuable recovery time... into... _nothing_.
See?
So let's lose the "chatty" code and use the sensible, obvious approach that
cman already uses for itself.
> This is exposed by the membership (providing all nodes in the same order
> on all nodes), however the node level membership does not necessarily
> reflect the service/application level membership. So to get it right,
> you essentially have to run such an algorithm at that level again too.
Yessirree! But please let's get the easy base-level thing above out of the way
first, then we can take a good hard look at how service groups need to work
in order to be simple, sane, etc. Note: what we want is not so different
from how cman _already_ handles service groups. Basically: take the oldest
node concept (aka stable node enumeration) and apply it to service groups as
well. Then we need events from the service groups, just like the main
cluster membership (which is in effect, an anonymous service group that all
cluster nodes must join before they can join any other service group). To be
sure, cman is more-or-less designed _and documented_ this way already - we
just need to do a few iterative improvements to turn it into a truly sensible
gizmo.
> True enough it would be helpful if the group membership service provided
> such, but here we're at the node level.
It does, we just need to extract the diamond from the, ehm, rough ground.
> > But note that it _can_ use the oldest cluster member as a recovery
> > master, or to designate a recovery master. It can, and should - there
> > is no excuse for making this any more complex than it needs to be.
>
> The oldest node might not be running that particular service, or it
> might not be healthy. To figure that out, you need to vote.
Not necessary! Remember, we also have service groups. Membership in each
service group can (read: should) follow the same rules as cluster membership,
and offers a similar, stable enumeration. That is, the oldest member of each
service group is, by default, the boss. Therefore, except for certain
recovery intervals that have to be handled by barriers, every node in the
cluster always knows the identity of the boss of a particular service group.
> This is straying a bit from LKML issues, maybe it ought to be moved to
> one of the clustering lists.
It is very much on-topic here, and thank you for driving it.
The reason this infrastructure track is on topic is, without this background,
no core maintainer has the context they need to know why we think things
should be done one way versus another in (g)dlm, let alone the incoming gfs
code.
In the end, we will hatch a lovely kernel API, but not if we cluster mavens
are the only ones who actually have a sense of direction. Left to discuss
the issues only amongst ourselves, we would likely end up with little more
than eternal out-of-tree status.
Regards,
Daniel
On Monday 02 May 2005 16:45, Lars Marowsky-Bree wrote:
> On 2005-04-29T04:25:24, Daniel Phillips <[email protected]> wrote:
> > > It makes a whole lot of sense to combine a DLM with (appropriate)
> > > fencing so that the shared resources are protected. I understood
> > > David's comment to rather imply that fencing is assumed to happen
> > > outside the DLM's world in a different component; ie more of a comment
> > > on sane modularization instead of sane real-world configuration.
> >
> > But just because fencing is supposed to happen in an external component,
> > we can't wave our hands at it and skip the analysis. We _must_ identify
> > the fencing assumptions and trace the fencing paths with respect to every
> > recovery algorithm in every cluster component, including the dlm.
>
> "A fenced node no longer has access to any shared resource".
>
> Is there any other assumption you have in mind?
Nice problem statement. Now we just need to see the proof that we satisfy
this requirement for every cluster service, application, block device, etc
for every possible cluster configuration and normal or failure state.
My assumption is that we will achieve this in a way that is efficient, easy to
configure and not prone to deadlock, with emphasis on the "will".
Regards,
Daniel
On Mon, May 02, 2005 at 11:00:15PM +0200, Lars Marowsky-Bree wrote:
> > > ...Nothing you have said is remotely correct.
> > Please provide corrections, if you wish Dave.
>
> I'd really like to see those. Because right now some parts of the
> discussion seem to contradict eachother (sometimes subtle so it's hard
> to point the finger at it and ask, sometimes glaringly obvious), and I
> thought you both worked on the same project. ;-)
>
> That makes it kind-of-hard to understand where you're coming from and
> where you're headed. I know that people even on the same project have
> different opinions, but before a review, let's have a barrier and
> distributed synchronization, ok? ;-)
If you're still reading what Daniel sends then I can't help much, I was
completely serious before. I've wasted too many hours in the past
rebutting and correcting everything he says and gave up long ago, sorry.
No, he doesn't work on the same project. He has nothing to do with this
software and doesn't understand the first thing about it.
If you have questions, please ask. If you don't want conflicting replies
then I simply suggest ignoring either me or Daniel Phillips; it's
completely up to you.
Dave
Hi,
On Sat, 2005-04-30 at 10:09, Daniel Phillips wrote:
> As you know, this is how I currently determine ownership of such resources as
> cluster snapshot metadata and ddraid dirty log. I find the approach
> distinctly unsatisfactory. The (g)dlm is rather verbose to use, particularly
> taking into the account the need to have two different state machine paths,
> depending on whether a lock happens to master locally or not, and the need to
> coordinate a number of loosely coupled elements: lock status blocks, asts,
> the calls themselves. The result is quite a _long_ and opaque program to do
> a very simple thing.
Why on earth do you need to care where a lock is mastered? Use of ASTs
etc. should be optional, too --- you can just use blocking variants of
the lock primitives if you want. There's a status block, sure, but you
can call the lock grant function synchronously and the status block is
guaranteed unambiguously to be filled on return.
So the easy way to use the DLM for metadata ownership is simply to have
a thread which tries, synchronously, to acquire an EX lock on a
resource. You get it or you stay waiting; when you get it, you own the
metadata. Pretty simple. (The only real complication in this
particular case is how to deal with the resource going away while you
wait, eg. unmount.)
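
Sketched out, under the assumption of some synchronous lock call, that thread
is only a few lines; dlm_lock_sync(), the resource name and the status-block
type here are hypothetical placeholders, not the actual gdlm/libdlm interface:

/* Hypothetical sketch only: dlm_lock_sync() stands in for whichever
 * blocking lock variant the DLM exposes; MODE_EX and the status block
 * are likewise illustrative, not the real definitions. */

#include <pthread.h>    /* the function below has a pthread entry-point signature */

#define MODE_EX 5       /* exclusive mode; value illustrative only */

struct status_block { int status; };

int dlm_lock_sync(const char *resource, int mode, struct status_block *sb);
void become_metadata_owner(void);

void *metadata_owner_thread(void *arg)
{
        struct status_block sb;

        (void)arg;

        /* Block until we are granted EX on the ownership resource; when the
         * call returns successfully, this node owns the metadata. */
        if (dlm_lock_sync("metadata-owner", MODE_EX, &sb) != 0)
                return NULL;    /* e.g. the lockspace went away on unmount */

        become_metadata_owner();
        return NULL;
}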
> And indeed, instinct turns out to be correct: there is a far simpler way to
> handle this: let the oldest member of the cluster decide who owns the
> metadata resources.
Deciding who owns it is one thing. You still need the smarts to work
out if recovery is *already* in progress somewhere, and to coordinate
wakeup of whoever you've granted the new metadata ownership to, etc.
Using a lock effectively gets you much of that for free, once you've
done the work to acquire the EX lock in the first place.
> Good instinct. In fact, as I've said before, you don't necessarily need a dlm
> in a cluster application at all. What you need is _global synchronization_,
> however that is accomplished. For example, I have found it simpler and more
> efficient to use network messaging for the cluster applications I've tackled
> so far.
Yes, there is definitely room for both. In particular, the more your
application looks like client/server, the less appropriate a DLM is.
> This suggests to me that the dlm is going to end up pretty much as
> a service needed only by a cfs, and not much else.
But once you've got a CFS, it suddenly becomes possible to do so much
more in user-space in a properly distributed fashion, rather than via
client/server. Cluster-wide /etc/passwd? Just lock for read, access
the file, unlock. Things like shared batch/print queues become easier.
And using messaging is often completely the wrong model for such things,
simply because there's no server to send the message to. A DLM will
often be a far better fit for such applications.
> The corollary of that is,
> we should concentrate on making the dlm work well for the cfs, and not get
> too wrapped up in trying to make it solve every global synchronization
> problem in the world.
Trouble is, I think you're mixing problems here. There are two
different problems: whether the DLM locking model is a good primitive to
use for a given case; and whether the specific DLM API in question is a
good fit for the model itself.
And your initial complaints about needing to know local vs. remote
master, dealing with ASTs etc. are really complaints about the API, not
the model. Using blocking, interruptible APIs gets rid of the AST issue
entirely for applications that don't need that level of complexity. And
you obviously want to have an API variant that doesn't care where the
lock gets mastered --- for one thing, a remotely mastered lock can turn
into a locally mastered one after a cluster membership transition.
So let's keep the two separate. Sure, there will be cases where a DLM
model is more or less appropriate; but given that there are cases where
the model does work, what are the particular unnecessary complications
that the current API forces on us? Remove those and you've made the DLM
model a lot more attractive to use for non-CFS applications.
--Stephen
Hi Stephen,
On Thursday 05 May 2005 08:25, Stephen C. Tweedie wrote:
> Hi,
>
> On Sat, 2005-04-30 at 10:09, Daniel Phillips wrote:
> > As you know, this is how I currently determine ownership of such
> > resources as cluster snapshot metadata and ddraid dirty log. I find the
> > approach distinctly unsatisfactory. The (g)dlm is rather verbose to use,
> > particularly taking into the account the need to have two different state
> > machine paths, depending on whether a lock happens to master locally or
> > not, and the need to coordinate a number of loosely coupled elements:
> > lock status blocks, asts, the calls themselves. The result is quite a
> > _long_ and opaque program to do a very simple thing.
>
> Why on earth do you need to care where a lock is mastered?
That is just my point. I wish I did not have to care. But gdlm behaves
differently - returns status in different ways - depending on whether a lock
is mastered locally or not.
> Use of ASTs
> etc. should be optional, too --- you can just use blocking variants of
> the lock primitives if you want. There's a status block, sure, but you
> can call the lock grant function synchronously and the status block is
> guaranteed unambiguously to be filled on return.
Writing non-trivial code that is supposed to perform well under parallel loads
is practically impossible with the blocking variants. As far as I can see,
for non-trivial applications, the blocking calls are just "training wheels"
for the real API.
> So the easy way to use the DLM for metadata ownership is simply to have
> a thread which tries, synchronously, to acquire an EX lock on a
> resource. You get it or you stay waiting; when you get it, you own the
> metadata. Pretty simple. (The only real complication in this
> particular case is how to deal with the resource going away while you
> wait, eg. unmount.)
The complication arises from the fact that you then need to advise the rest of
the cluster that you own the metadata. How? LVB, obviously. But then you
run smack into the whole culture of LVB semantics and oddball limitations.
For example, when the owner of an LVB dies, what is the value of
the LVB then? Can we prove that our metadata ownership scheme is still
raceless?
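Just to pin down what we are comparing, the owner-thread pattern quoted
above comes out roughly like the sketch below (using a blocking helper
along the lines of the sync_lock() sketched earlier; the lockspace handle,
resource name and hooks are illustrative).  The part it glosses over is
exactly the LVB question.

/* Sketch only: whoever is granted the EX lock owns the metadata. */
extern void run_metadata_recovery(void);	/* hypothetical hook */

static int metadata_owner_thread(void *data)
{
	dlm_lockspace_t *ls = data;
	struct dlm_lksb lksb;
	int error;

	/* Blocks until this node is granted exclusive ownership. */
	error = sync_lock(ls, "md-owner", strlen("md-owner"),
			  DLM_LOCK_EX, &lksb);
	if (error)
		return error;

	/* We are now the owner: run recovery, then service requests.
	 * Advertising ownership to the other nodes would go through the
	 * LVB (or some other channel), and what that LVB holds after the
	 * previous owner dies is the part that has to be pinned down for
	 * the scheme to stay raceless. */
	run_metadata_recovery();
	return 0;
}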
Honestly, using the dlm is a perverse way to establish metadata ownership when
we have a _far_ more direct way to do it. As a fringe benefit, we lose the
dlm dependency from a whole batch of cluster components, for example, all the
cluster block devices. This is clearly the Right Thing to Do[tm].
> > And indeed, instinct turns out to be correct: there is a far simpler way
> > to handle this: let the oldest member of the cluster decide who owns the
> > metadata resources.
>
> Deciding who owns it is one thing. You still need the smarts to work
> out if recovery is *already* in progress somewhere, and to coordinate
> wakeup of whoever you've granted the new metadata ownership to, etc.
That is part of the service group recovery protocol ((re)start recovery/halt
recovery/recovery success).
> Using a lock effectively gets you much of that for free, once you've
> done the work to acquire the EX lock in the first place.
I'm afraid that "free" is an illusion here. The metadata ownership code is
very, very ugly using the dlm approach, but is very nice using the cman event
interface, and will become even nicer after we fix cman a little.
The reason for this is, cman membership events fit the problem better. The
dlm api just isn't suited to this. It can be made to fit, but the result is
predictably ugly. We should not lose sight of the fact that the dlm is
actually just implemented on top of a set of cluster synchronization
messages, as is cman. In fact, cman provides the messaging layer that dlm
uses (or it does in the code that I am running here, I have not seen the
rumoured new version of cman yet). So the reason for not using cman directly
is, what exactly?
Or putting it another way, what value does the dlm add to the metadata
ownership code? It sure does not simplify it. And in my opinion, it does
not make it more obviously correct either, quite the contrary.
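To show what I mean by a better fit, here is the rough shape of the
event-driven version.  The member structure, the callback and the two
hooks are invented for illustration (the real cman service group
interface differs in detail), and "the oldest member decides" is
simplified to "every node derives the same answer from the same
membership view".

/* Hypothetical sketch: ownership decided from membership events alone. */
struct member {
	int nodeid;
	unsigned int incarnation;	/* lower = joined the cluster earlier */
};

extern void become_metadata_owner(void);	/* hypothetical hooks */
extern void note_metadata_owner(int nodeid);

static int oldest_member(struct member *m, int count)
{
	int i, oldest = 0;

	for (i = 1; i < count; i++)
		if (m[i].incarnation < m[oldest].incarnation)
			oldest = i;
	return m[oldest].nodeid;
}

/* Called on every membership transition: join, leave, node death.
 * Every node sees the same member list, so every node reaches the same
 * decision; no separate announcement (and no LVB) is needed. */
static void membership_event(struct member *members, int count, int our_nodeid)
{
	int owner = oldest_member(members, count);

	if (owner == our_nodeid)
		become_metadata_owner();	/* start/continue recovery */
	else
		note_metadata_owner(owner);
}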
> > Good instinct. In fact, as I've said before, you don't necessarily need
> > a dlm in a cluster application at all. What you need is _global
> > synchronization_, however that is accomplished. For example, I have
> > found it simpler and more efficient to use network messaging for the
> > cluster applications I've tackled so far.
>
> Yes, there is definitely room for both. In particular, the more your
> application looks like client/server, the less appropriate a DLM is.
True, very true. And metadata ownership problems tend to look just like
client/server problems, even if we break the metadata up into multiple parts
and distribute it around the cluster.
Anyway, it turns out that a significant number of the cluster components have
ended up looking like client/server architectures. For the time being, the
single - and major - exception is the distributed filesystem itself. Oh, and
gdlm and cman, which are self-fulfilling prophecies.
So we have, from the bottom up:
- cman: distributed
- block device export: client/server
- cluster raid: client/server
- cluster snapshot: client/server
- gdlm: distributed
- gfs: distributed
- applications: need more data
We can be fairly sure that the current crop of gfs applications is _not_ using
the dlm, for the simple reason that the dlm has not existed long enough. How
do existing applications synchronize, then?  The truth is, I do not know,
because I have not conducted any survey. However, the one major application
I have looked at porting to gfs already has its synchronization working fine,
using point-to-point socket connections. The only major bit it needs to
become a distributed cluster application is a distributed filesystem. In the
end, there will be no dlm anywhere to be seen at the application level. The
application is, in my humble opinion, better because of this.
> > This suggests to me that the dlm is going to end up pretty much as
> > a service needed only by a cfs, and not much else.
>
> But once you've got a CFS, it suddenly becomes possible to do so much
> more in user-space in a properly distributed fashion, rather than via
> client/server. Cluster-wide /etc/passwd? Just lock for read, access
> the file, unlock. Things like shared batch/print queues become easier.
> And using messaging is often completely the wrong model for such things,
> simply because there's no server to send the message to. A DLM will
> often be a far better fit for such applications.
I think I can do a much better, cleaner job of distributed passwords by
directly using cman service group events. Do you want to see some code for
that?
> > The corollary of that is,
> > we should concentrate on making the dlm work well for the cfs, and not
> > get too wrapped up in trying to make it solve every global
> > synchronization problem in the world.
>
> Trouble is, I think you're mixing problems here. There are two
> different problems: whether the DLM locking model is a good primitive to
> use for a given case; and whether the specific DLM API in question is a
> good fit for the model itself.
Yes.
> And your initial complaints about needing to know local vs. remote
> master, dealing with ASTs etc. are really complaints about the API, not
> the model.
Those complaints were about the api. Other complaints are about the model,
and I have more complaints about the model than the ones I have already
mentioned. For example: the amount of data you can pass around together with
a lock grant is pathetically limited. (Other complaints can wait for the
appropriate thread to pop up.)
> Using blocking, interruptible APIs gets rid of the AST issue
> entirely for applications that don't need that level of complexity. And
> you obviously want to have an API variant that doesn't care where the
> lock gets mastered --- for one thing, a remotely mastered lock can turn
> into a locally mastered one after a cluster membership transition.
We should just lose the variant that cares. There is no efficiency argument
for including that brokenness in the api.
> So let's keep the two separate. Sure, there will be cases where a DLM
> model is more or less appropriate; but given that there are cases where
> the model does work, what are the particular unnecessary complications
> that the current API forces on us? Remove those and you've made the DLM
> model a lot more attractive to use for non-CFS applications.
Yes. I would be perfectly happy to put aside the "alternatives to dlm" thread
and concentrate purely on fixing the dlm api. Please do not misinterpret my
position: we do need a dlm in the cluster stack. Now please let us ensure
that _our_ dlm is a really, really nice dlm with a really, really nice api.
Regards,
Daniel