Date: Tue, 20 Apr 2004 21:04:21 -0700
From: inaky.perez-gonzalez@intel.com
To: linux-kernel@vger.kernel.org
Cc: inaky.perez-gonzalez@intel.com
Subject: [RFC/PATCH] FUSYN 11/11: user space fulocks
Message-ID: <0404202104.xbfaDbVcjdMdubmcGdHb9boc7cYbOcbd1457@intel.com>
In-Reply-To: <0404202104.Ya9btbMbKc5dKcmalc4a8dQccalaAdSa1457@intel.com>
X-Mailer: patchbomb 0.0.1

 ufulock.c | 1044 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 1044 insertions(+)

--- /dev/null	Thu Apr 15 00:58:26 2004
+++ kernel/ufulock.c	Thu Apr 8 00:17:49 2004
@@ -0,0 +1,1044 @@
+
+/*
+ * Fast User real-time/pi/pp/robust/deadlock SYNchronization
+ * (C) 2002-2003 Intel Corp
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>.
+ *
+ * Licensed under the FSF's GNU Public License v2 or later.
+ *
+ * Based on normal futexes (futex.c), (C) Rusty Russell.
+ * Please refer to Documentation/fusyn.txt for more info.
+ *
+ *
+ * The ufulocks are fulocks with a vlocator to help pinpoint them from
+ * a user space address.
+ *
+ * The vfulock is the value in an address in user space that is
+ * associated to the ufulock. The trick here is to keep them in sync
+ * [__vfulock_sync()]. I think it is doing the right thing when we
+ * know about a ufulock in the kernel and something wipes it to some
+ * crappy value in user space (wild pointer)--however, it cannot be
+ * perfect.
+ *
+ * Mapping the address in user space to the ufulock is done through a
+ * hash table kept by the vlocator.h code - ufulock_locate() does the
+ * full work; give it an address, you get a ufulock. No need to worry
+ * about anything else.
+ *
+ * QUICK ROADMAP:
+ *
+ * sys_ufulock_lock(): [try]lock a fulock from user space
+ * sys_ufulock_unlock(): unlock a fulock from user space
+ * sys_ufulock_ctl(): control the state of a ufulock
+ * __ufulock_exit(): called from fulock.c:exit_fulocks() when a
+ *   process exits and still holds fulocks.
+ * ufulock_gc(): called from ufu-common.c:ufu_gc() to do garbage
+ *   collection.
+ *
+ * TODOs:
+ *
+ * - All the #warning FIXME's need to be sorted out
+ *
+ * - There is no need to change the flags of a mutex once it has been
+ *   initialized [POSIX doesn't provide tools for it anyway], except
+ *   for the prioceiling. We still need to provide a way for user space
+ *   to tell kernel space that we are relinquishing a fulock forever
+ *   (for pthread_mutex_destroy()) so we can clean it up and reinit it
+ *   with different flags; as well, a call in from pthread_mutex_init()
+ *   so we can cache the fulock?
+ */
+
+#warning FIXME: page unpinning still not done
+/*
+ * The problem is we need to access the page to modify the value on it
+ * when a ufulock dies. We could do it on the exit path, but we don't
+ * have a way to locate the page when it is shared on
+ * __ufulock_exit().
We could have the sys_ufulock_lock() return path + * do it, but then we don't update the vfulock to VFULOCK_DEAD when + * there are no waiters. + * + * So, until we find a solution for that, pages are still pinned. It + * also means we have to pass the page all the way down to + * fulock->ops->owner_set(). A solution would get rid of that + * too. It's plain ugly and messy. + */ + +#include +#include +#include +#include /* copy_from_user() */ +#include + +#include /* futex keys */ +#include +#include +#include + +#warning DEBUG: remove me +//#include + +#if DEBUG > 5 +#define __DEBUG_get_value(result, _vfulock) \ +do { \ + unsigned val = *_vfulock; \ + debug ("result %d * _vfulock %u/%x\n", result, val, val); \ +} while (0) +#define DEBUG_get_value(result, _vfulock) \ +do { \ + unsigned val; \ + get_user (val, _vfulock); \ + debug ("result %d * _vfulock %u/%x\n", result, val, val); \ +} while (0) +#else +#define __DEBUG_get_value(a,b) do {} while (0) +#define DEBUG_get_value(a,b) do {} while (0) +#endif + +extern signed long ufu_timespec2jiffies (const struct timespec __user *); + +/** Slab for allocation of 'struct ufulock's. */ +static kmem_cache_t *ufulock_slab; + + +struct fulock_ops ufulock_ops; + + +/** Set an ufulock ownership and increment its use count. */ +static inline +unsigned __ufulock_op_owner_set (struct fulock *fulock, struct task_struct *task) +{ + unsigned prio_changed; + struct ufulock *ufulock; + + ftrace ("(%p, %p [%d])\n", fulock, task, task->pid); + ufulock = container_of (fulock, struct ufulock, fulock); + prio_changed = __fulock_op_owner_set (fulock, task); + vl_get (&ufulock->vlocator); + return prio_changed; +} + + +/** Reset an ufulock ownership and drop its use count. */ +static inline +unsigned __ufulock_op_owner_reset (struct fulock *fulock) +{ + unsigned prio_changed; + struct ufulock *ufulock; + + ufulock = container_of (fulock, struct ufulock, fulock); + ftrace ("(%p [ufulock %p])\n", fulock, ufulock); + prio_changed = __fulock_op_owner_reset (fulock); + vl_put (&ufulock->vlocator); + return prio_changed; +} + + +/** Initialize an @ufulock (for slab object construction). */ +static +void ufulock_ctor (void *obj, kmem_cache_t *slab, unsigned long flags) +{ + struct ufulock *ufulock = obj; + __fulock_init (&ufulock->fulock, &ufulock_ops); + vlocator_init (&ufulock->vlocator); +} + + +/** Allocate an ufulock maintaining debug-allocation counts. */ +static __inline__ +struct vlocator * ufulock_alloc (void) +{ + struct ufulock *ufulock; + ufulock = kmem_cache_alloc (ufulock_slab, GFP_KERNEL); + if (DEBUG > 0 && ufulock != NULL) + ldebug (2, "ufulock %p allocated, total: %d\n", + ufulock, allocated_inc()); + return &ufulock->vlocator; +} + + +/** + * Create a ufulock that is going to be inserted to the hash + * [called from ufu-common.c:ufu_locate()]. Reference its resources + * and initialize what is not initialized by the cache constructor. + */ +static +void ufulock_create (struct vlocator *vl, struct page *page, + const union futex_key *key, unsigned flags) +{ + struct ufulock *ufulock = container_of (vl, struct ufulock, vlocator); + ufulock->fulock.flags = (flags & FULOCK_FL_USER_MK) + | FULOCK_FL_NEEDS_SYNC; +#warning FIXME: if PP, transform flags from POLICY+PRIO to RAWPRIO. + memcpy (&vl->key, key, sizeof (*key)); + get_key_refs (&vl->key); + get_page (page); + ufulock->page = page; +} + + +/** Release @ufulock's resources. 
*/ +static __inline__ +void ufulock_release (struct vlocator *vl) +{ + struct ufulock *ufulock = container_of (vl, struct ufulock, vlocator); + BUG_ON (ufulock->page == NULL); + put_page (ufulock->page); + drop_key_refs (&vl->key); +} + + +/** Free an ufulock maintaining debug-allocation counts. */ +static __inline__ +void ufulock_free (struct vlocator *vl) +{ + struct ufulock *ufulock = container_of (vl, struct ufulock, vlocator); + kmem_cache_free (ufulock_slab, ufulock); + if (DEBUG > 0 && ufulock != NULL) + ldebug (2, "ufulock %p freed, total: %d\n", + ufulock, allocated_dec()); +} + + +struct vlocator_ops ufulock_vops = { + .alloc = ufulock_alloc, + .create = ufulock_create, + .release = ufulock_release, + .free = ufulock_free +}; + + + +/** + * Set @vfulock, maintaining consistency with @fulock. + * + * We just make sure that whichever value we put doesn't override the + * VFULOCK_DEAD or VFULOCK_NR indicators. + */ +static __inline__ +void __vfulock_update (volatile unsigned *vfulock, + const struct fulock *fulock, unsigned value) +{ + ftrace ("(%p, %p, %u/%x)\n", vfulock, fulock, value, value); + + if (fulock->flags & FULOCK_FL_DEAD) + value = VFULOCK_DEAD; + else if (fulock->flags & FULOCK_FL_NR) + value = VFULOCK_NR; + vfulock_set (vfulock, value); + return; +} + + +/** @returns expected value of the vfulock given the state of the + * @fulock. */ +static inline +unsigned __vfulock_expected_nonkco (struct fulock *fulock) +{ + if (fulock->flags & FULOCK_FL_DEAD) + return VFULOCK_DEAD; + else if (fulock->flags & FULOCK_FL_NR) + return VFULOCK_NR; + else if (fulock->owner == NULL) + return VFULOCK_UNLOCKED; + else if (__fulock_empty (fulock) && fulock->owner == NULL) + return fulock->owner->pid; + else + return VFULOCK_WP; +} + + +/** + * Perform owner identification chores for __vfulock_sync() + * + * Probably one of the dirtiest functions I've ever written. The thing + * that makes it complex is that after we assign ownership, somebody + * the target task might start to exit, and we don't know if it has + * gone through exit_fulocks() after we added the fulock to its + * ownership list, so we have to be paranoid. + * + * FIXME: false positives if the PID is reused are the rule. Remedies: + * + * - The chance of that happening will depend on what the reuse rate is, + * how frequently are we assigning new PIDs and the probability of + * the sucker dying in the middle of it (so a well-coded program + * should be fine, except for the OOM killer, or a crazy process + * killing -9 randomly...). + * + * Let's say the probability is P (one in X) + * + * - Limit PIDs to 64Ki - 3 (to account for _NR, _DEAD and _WP), create + * a 16 bit hash for each task with immutable elements (eg: pointer, + * start time...); compose the hash and the 16 bit pid into a 32 bit + * thing that the user space uses for fast-locking. When id'ing, + * locate the task by PID, hash it out and compare it; if it fails, + * the PID was reused, if not, keep on: + * + * (there are still chances of collision, but I think we have cut + * P down by 64Ki) + * + * - Check the mm mappings. 
If the task doesn't have the mapping where
+ *   the fulock is, well, that was a collision (and that was pure
+ *   chance, probably like shooting up into the sky in the middle of
+ *   the desert, the bullet falling back to the ground, hitting this
+ *   man [who happened to be wandering around, all lost] in the
+ *   forehead, killing him, and it happens he is some rotten-rich guy
+ *   who had randomly chosen somebody in the phone book to pass on all
+ *   his wealth, and that one happens to be you).
+ *
+ * - Now, if the task has the same mapping, we assume it is good.
+ *
+ * I think I cannot be more of a pain in the ass, and probably the
+ * mapping thing is overkill; the hash thing looks like more than
+ * enough, and it is pretty cheap (wasn't computing an exact science?
+ * my flaming bullas).
+ *
+ * If that is still not good enough for you, use KCO mode, sacrificing
+ * speed.
+ */
+static
+int __fulock_id_owner (struct fulock *fulock, volatile unsigned *vfulock,
+                       unsigned owner_pid, int do_lock)
+{
+        int result = EBUSY;
+        struct task_struct *task;
+
+        ftrace ("(%p, %p, %u, %d)\n", fulock, vfulock, owner_pid, do_lock);
+        CHECK_IRQs();
+
+        BUG_ON (fulock->owner != NULL);
+        _raw_read_lock (&tasklist_lock);
+        task = find_task_by_pid (owner_pid);
+        if (task == NULL || task->flags & PF_EXITING)
+                goto dead_unlock;
+        get_task_struct (task);
+        _raw_read_unlock (&tasklist_lock);
+        __ufulock_op_owner_set (fulock, task);
+        if (unlikely (task->flags & PF_EXITING)) {
+                __ufulock_op_owner_reset (fulock);
+                put_task_struct (task);
+                goto dead;
+        }
+        put_task_struct (task);
+        return result;
+
+dead_unlock:
+        _raw_read_unlock (&tasklist_lock);
+dead:
+        result = -EOWNERDEAD;
+        vfulock_set (vfulock, VFULOCK_DEAD);
+        fulock->flags |= FULOCK_FL_DEAD;
+        if (do_lock)
+                __ufulock_op_owner_set (fulock, current);
+        return result;
+}
+
+
+/**
+ * Sync up a ufulock that supports fast user-space locking with its
+ * associated vfulock, and maybe fast-lock it.
+ *
+ * @ufulock: The ufulock to sync up.
+ * @vfulock: pointer to a kmapped associated value.
+ * @returns: < 0 errno code on error [we did not acquire it]
+ *           -EOWNERDEAD [previous owner died, we got it]
+ *           0 if we locked it to a PID (and thus 'current' is owner)
+ *           > 0 if we didn't lock it, need to proceed to fulock
+ *
+ * Call with ufulock->fulock.fuqueue.lock held!
+ *
+ * This syncs up the vfulock and the ufulock (if the ufulock is new,
+ * sync from vfulock->ufulock, or the opposite). It tries to leave the
+ * ufulock as if it were a kernel fulock, for the fulock layer to
+ * operate on it.
+ *
+ * When we see VFULOCK_UNLOCKED and move to PID, we don't set the
+ * owner, because there will be nobody coming down to the kernel to
+ * reset it [in effect, it is as if we had done a user space
+ * fast-lock operation].
+ *
+ * The vfulock can only be modified in the following ways:
+ * - In user space:
+ *   - when fast-locking: VFULOCK_UNLOCKED to PID
+ *   - when fast-unlocking: PID to VFULOCK_UNLOCKED
+ *   - when reinitializing from not-recoverable: VFULOCK_NR to VFULOCK_UNLOCKED
+ *   - Illegally: any other transition [mistake, error, wild pointer]
+ * - In kernel space: When we have the lock over the ufulock
+ *   associated to that vfulock.
+ *
+ * That means that as we are called with the lock held, we can modify
+ * the vfulock at will knowing that nobody is going to touch it as
+ * long as it is not VFULOCK_UNLOCKED (fast-lock) or a PID
+ * (fast-unlock)--the not-recoverable case is not of concern because
+ * as soon as anyone sees it, they bail out, so we never get to modify
+ * it.
+ * + * Now, for those legal-from-user-space modifications, we operate + * atomic; the while(1) loop and vfulock_acas() protect against this. + */ +static +int __vfulock_sync_nonkco (struct ufulock *ufulock, volatile unsigned *vfulock, + unsigned do_lock) +{ + int result = 0; + unsigned val, expected = 0, sync_k2u = 0; + + ftrace ("(%p, %p, %u)\n", ufulock, vfulock, do_lock); + + if ((ufulock->fulock.flags & FULOCK_FL_NEEDS_SYNC) == 0) { + sync_k2u = 1; /* synch kernel-to-user */ + expected = __vfulock_expected_nonkco (&ufulock->fulock); + } + while (1) { + val = *vfulock; + // kgdb_ts ((unsigned long) ufulock, *vfulock); + if (sync_k2u && val != expected) { /* Unexpected changes? */ + printk (KERN_WARNING "pid %d (%s) (ufulock %p" + "): out of sync, val %u/0x%x, " + "expected %u/0x%x. Fixing.\n", + current->pid, current->comm, + ufulock, val, val, expected, expected); + if (expected == VFULOCK_UNLOCKED && val < VFULOCK_WP) + goto make_kco; /* Legal: locked? */ + if (expected < VFULOCK_WP && val == VFULOCK_UNLOCKED) + goto fast_lock; /* Legal: unlocked? */ + if (!vfulock_acas (vfulock, val, expected)) + continue; /* Illegal: reset it */ + val = expected; + } + switch (val) { + case VFULOCK_UNLOCKED: /* unlocked, fast-lock it */ +fast_lock: + // kgdb_ts ((unsigned long) uf, *vfulock); + if (do_lock == 0) + goto out; + // kgdb_ts ((unsigned long) uf, *vfulock); + if (__fulock_empty (&ufulock->fulock)) { + if (vfulock_acas (vfulock, val, current->pid)) + goto out; + } + else if (vfulock_acas (vfulock, val, VFULOCK_WP)) { + result = EBUSY; + goto out_synced; + } + break; + case VFULOCK_DEAD: /* idem, but dead owner */ + ufulock->fulock.flags |= FULOCK_FL_DEAD; + case VFULOCK_WP: /* kernel controlled */ + result = EBUSY; + goto out_synced; + case VFULOCK_NR: /* gee...gone completely */ + ufulock->fulock.flags |= FULOCK_FL_NR; + result = -ENOTRECOVERABLE; + goto out_synced; + default: /* This is a PID(locked) no waiters */ +make_kco: + if (vfulock_acas (vfulock, val, VFULOCK_WP)) + goto id_owner; + } + } +id_owner: /* Id owner, mark as dead if it died */ + result = __fulock_id_owner (&ufulock->fulock, vfulock, val, do_lock); +out_synced: + ufulock->fulock.flags &= ~FULOCK_FL_NEEDS_SYNC; +out: /* We are the sole owners of the lock */ + __DEBUG_get_value (result, vfulock); +#warning DEBUG: remove me + // if (result == 0) + // kgdb_ts ((unsigned long) ufulock, *vfulock); + return result; +} + + +/** + * @returns expected value of the vfulock given the state of the + * @fulock for KCO mode fulocks. + */ +static inline +unsigned __vfulock_expected_kco (struct fulock *fulock) +{ + if (fulock->flags & FULOCK_FL_DEAD) + return VFULOCK_DEAD; + else if (fulock->flags & FULOCK_FL_NR) + return VFULOCK_NR; + return VFULOCK_HEALTHY; +} + + +/** + * Synchronize a KCO ufulock with the KCO vfulock in user space. + * + * KCO ufulocks exist only in kernel space when someone owns them (and + * when nobody does, for a while in the cache). When it dissapears, we + * need to keep somewhere the state of the fulock (if it was healthy, + * dead or not-recoverable). We use the vfulock, and VFULOCK_HEALTHY + * (for convenience, the same as VFULOCK_UNLOCKED), VFULOCK_DEAD and + * VFULOCK_NR. + * + * The checks are just to be a PITA and catch errors; they will also + * fix up stuff (being the information in the kernel the authoritative + * reference). 
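For orientation, this is roughly how a KCO fulock looks from user space; a minimal sketch only, not taken from the patch: the numeric encodings, the wrapper name and its kernel-style return convention are all assumptions. In KCO mode the vfulock never carries a PID, so every lock and unlock enters the kernel; the word is only peeked at to bail out early on a not-recoverable fulock.

#include <errno.h>
#include <time.h>

#define VFULOCK_HEALTHY 0x00000000u  /* assumed; same value as VFULOCK_UNLOCKED */
#define VFULOCK_DEAD    0xfffffffeu  /* assumed encoding */
#define VFULOCK_NR      0xffffffffu  /* assumed encoding */

/* Assumed thin wrapper over sys_ufulock_lock(); returns 0 or a
 * negative errno, kernel style. */
extern int ufulock_lock_syscall (volatile unsigned *vfulock, unsigned flags,
                                 const struct timespec *timeout);

static int kco_fulock_lock (volatile unsigned *vfulock, unsigned flags)
{
        /* A not-recoverable fulock is dead for good: bail out early,
         * much as the kernel does when it sees VFULOCK_NR. */
        if (*vfulock == VFULOCK_NR)
                return -ENOTRECOVERABLE;
        /* Healthy or dead-owner: the kernel decides; (timespec *)~0
         * means "wait for ever" (see ufu_timespec2jiffies() below). */
        return ufulock_lock_syscall (vfulock, flags,
                                     (const struct timespec *) ~0UL);
}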
+ */ +static +int __vfulock_sync_kco (struct ufulock *ufulock, volatile unsigned *vfulock) +{ + struct fulock *fulock; + int result = EBUSY; + unsigned val, new_flags, expected; + + ftrace ("(%p, %p)\n", ufulock, vfulock); + + fulock = &ufulock->fulock; + val = *vfulock; + new_flags = fulock->flags & ~FULOCK_FL_NEEDS_SYNC; + if ((fulock->flags & FULOCK_FL_NEEDS_SYNC) == 0) { + expected = __vfulock_expected_kco (fulock); + if (val != expected) { + printk (KERN_WARNING "pid %d (%s) (ufulock %p" + "): out of sync, val %u/0x%x, " + "expected %u/0x%x. Fixing.\n", + current->pid, current->comm, + ufulock, val, val, expected, expected); + __vfulock_update (vfulock, fulock, expected); + } + } + switch (val) { + case VFULOCK_HEALTHY: + break; + case VFULOCK_DEAD: + new_flags |= FULOCK_FL_DEAD; + break; + case VFULOCK_NR: + result = -ENOTRECOVERABLE; + new_flags |= FULOCK_FL_NR; + break; + /* Why default to not-recoverable? well somebody + * screwed up the value of this lock, so we can't be + * certain of what was its previous state, so the + * safest thing to do is to kill it left and right. + */ + default: + printk (KERN_WARNING "pid %d (%s) (KCO ufulock %p): " + "invalid value 0x%x. Defaulting to not-recoverable.\n", + current->pid, current->comm, ufulock, val); + __vfulock_update (vfulock, fulock, VFULOCK_NR); + new_flags |= FULOCK_FL_NR; + __fulock_unlock (fulock, (size_t)~0, -ENOTRECOVERABLE); + result = -ENOTRECOVERABLE; + } + fulock->flags = new_flags; + return result; +} + + +/** + * Sync up a ufulock with the user space vfulock. + * + * Depending on the type of locking, we go one way or another. + */ +static inline +int __vfulock_sync (struct ufulock *ufulock, volatile unsigned *vfulock, + unsigned do_lock) +{ + ftrace ("(%p, %p, %u)\n", ufulock, vfulock, do_lock); + + return ufulock->fulock.flags & FULOCK_FL_KCO? + __vfulock_sync_kco (ufulock, vfulock) : + __vfulock_sync_nonkco (ufulock, vfulock, do_lock); +} + + +/** Helper for kmapping the user's vfulock to the kernel. */ +static inline +volatile unsigned * vfulock_kmap (struct ufulock *ufulock) +{ + ftrace ("(%p)\n", ufulock); + BUG_ON (ufulock->page == NULL); + return (volatile unsigned *) + (kmap_atomic (ufulock->page, KM_IRQ0) + + (ufulock->vlocator.key.both.offset & ~1)); +} + +/** Unmap the user's vfulock from the kernel. */ +static inline +volatile unsigned * vfulock_kunmap (volatile unsigned *vfulock) +{ + ftrace ("(%p)\n", vfulock); + kunmap_atomic ((void *) (PAGE_MASK & (unsigned long)vfulock), KM_IRQ0); +} + + +/** + * Lock a ufulock, maybe wait for it to be available. + * + * @returns: 0 if acquired, < 0 errno code on error. + * -EAGAIN Not acquired, try again [change this, conflicst + * with POSIX] + * -EBUSY Not acquired, is locked, didn't block + * -EBADR Not acquired, fulock is not recoverable + * -ESRCH Acquired, previous owner died, data might be + * inconsistent + * * Not acquired, error. + * + * USER CONTEXT ONLY + * + * What we do is the minimal operations to obtain a fulock that cannot + * be distinguished from a kernel-only fulock and then call the fulock + * layer for doing the locking. + * + * For obtaining that fulock, we use the address [ufulock_locate()], + * lock it and sync up the vfulock and ufulock [__vfulock_sync()], + * then ask the fulock layer to lock for us [__fulock_lock()], if we + * have to]. 
+ */ +int ufulock_lock (struct ufulock *ufulock, unsigned flags, + signed long timeout) +{ + volatile unsigned *vfulock; + int result; + + ftrace ("(%p, %x, %ld)\n", ufulock, flags, timeout); + might_sleep(); + + /* Check flags */ + spin_lock_irq (&ufulock->fulock.fuqueue.lock); + result = -EINVAL; + if (flags != (ufulock->fulock.flags & FULOCK_FL_USER_MK)) + goto out_unlock; + /* sync up kernel and user space, maybe fast-lock */ +try_again: + vfulock = vfulock_kmap (ufulock); + result = __vfulock_sync (ufulock, vfulock, 1); + // kgdb_ts ((unsigned long) ufulock, result); + vfulock_kunmap (vfulock); + if (result <= 0) /* Error or fast-locked */ + goto out_unlock; + result = __fulock_lock (&ufulock->fulock, timeout); + if (result == -EAGAIN) { + spin_lock_irq (&ufulock->fulock.fuqueue.lock); + goto try_again; /* Non-serialized wake up */ + } + // kgdb_ts ((unsigned long) ufulock, result); + return result; + +out_unlock: + spin_unlock_irq (&ufulock->fulock.fuqueue.lock); + // kgdb_ts ((unsigned long) ufulock, result); + return result; +} + + +/** + * Lock a ufulock, maybe wait for it to be available. + * + * @returns: 0 if acquired, < 0 errno code on error. + * -EAGAIN Not acquired, try again [change this, conflicst + * with POSIX] + * -EBUSY Not acquired, is locked, didn't block + * -EBADR Not acquired, fulock is not recoverable + * -ESRCH Acquired, previous owner died, data might be + * inconsistent + * * Not acquired, error. + * + * USER CONTEXT ONLY + * + * Massage everything, locate an ufulock and lock it with uflock_lock(). + */ +asmlinkage +int sys_ufulock_lock (volatile unsigned __user *_vfulock, unsigned flags, + struct timespec __user *_timeout) +{ + struct ufulock *ufulock; + struct vlocator *vl; + signed long timeout; + int result = -EINVAL; + + ftrace ("(%p, 0x%x, %p)\n", _vfulock, flags, _timeout); + + /* Pin page, get key, look up/allocate the ufulock, refcount it */ + result = vl_locate (NULL, &vl, &ufulock_vops, _vfulock, flags); + if (unlikely (result < 0)) + goto out; + ufulock = container_of (vl, struct ufulock, vlocator); + timeout = ufu_timespec2jiffies (_timeout); + result = (int) timeout; + if (unlikely (timeout < 0)) + goto out_put; + // kgdb_ts ((unsigned long) _vfulock, (unsigned long) ufulock); + result = ufulock_lock (ufulock, flags, timeout); +out_put: + vl_put (vl); +out: + DEBUG_get_value (result, _vfulock); + return result; +} + + +/** + * Sync up an unfulock, then unlock it. + * + * Note the gimmick: when it is an unserialized unlock (howmany > 0), + * we need to declare the fulock FULOCK_FL_NEEDS_SYNC so the next time + * somebody gets into __vfulock_sync_nonkco(), it will properly sync + * from userspace. + * + * @ufulock: ufulock structure. + * @flags: flags from users pace for the ufulock + * @howmany: Wake up this many waiters; if 0, wake up only one, + * forcing a serialization in the acquisition of the + * lock, so that no other task (in user or kernel space) + * can acquire it in the meantime. + * @returns: 0 if ok, < 0 errno code on error. 
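The matching user-space release, under the same assumptions as the locking sketch above. The howmany argument is the interesting part: 0 asks for a serialized hand-off to exactly one waiter, while a positive count wakes that many waiters and lets them re-compete, which is why the fulock then has to be re-synced (FULOCK_FL_NEEDS_SYNC) on its next trip through the kernel.

#include <stdatomic.h>
#include <stddef.h>
#include <unistd.h>
#include <sys/syscall.h>

#define VFULOCK_UNLOCKED 0u     /* assumed encoding */

/* Assumed wrapper over sys_ufulock_unlock(); 0 or negative errno. */
extern int ufulock_unlock_syscall (volatile unsigned *vfulock,
                                   unsigned flags, size_t howmany);

static int fulock_unlock (_Atomic unsigned *vfulock, unsigned flags)
{
        unsigned self = (unsigned) syscall (SYS_gettid);

        /* Fast-unlock: PID -> VFULOCK_UNLOCKED.  Only legal while the
         * word still holds our own id, i.e. there are no waiters and
         * the kernel knows nothing about this fulock. */
        if (atomic_compare_exchange_strong (vfulock, &self, VFULOCK_UNLOCKED))
                return 0;
        /* VFULOCK_WP/_DEAD/_NR: the kernel owns the state.  howmany 0
         * keeps the acquisition serialized by handing the fulock to a
         * single waiter. */
        return ufulock_unlock_syscall ((volatile unsigned *) vfulock, flags, 0);
}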
+ */ +int __ufulock_unlock (struct ufulock *ufulock, unsigned flags, + size_t howmany) +{ + volatile unsigned *vfulock; + int result = -EINVAL; + struct fulock *fulock = &ufulock->fulock; + + ftrace ("(%p, %x, %zu)\n", ufulock, flags, howmany); + CHECK_IRQs(); + + if (flags != (fulock->flags & FULOCK_FL_USER_MK)) + goto out; + vfulock = vfulock_kmap (ufulock); + result = __vfulock_sync (ufulock, vfulock, 0); + if (result < 0) + goto out_kunmap; + /* Now do a proper unlock and update the vfulock */ + if (howmany > 0) + __vfulock_update (vfulock, fulock, VFULOCK_UNLOCKED); + result = __fulock_unlock (fulock, howmany, 0); + if (result < 0) + goto out_kunmap; + else if (howmany == 0) { + unsigned new_val; + if (fulock->flags & FULOCK_FL_KCO) + new_val = VFULOCK_HEALTHY; + else if (result == 0) + new_val = VFULOCK_UNLOCKED; + else if (result == 1) { /* enable fast-unlock */ + new_val = fulock->owner->pid; + fulock->flags |= FULOCK_FL_NEEDS_SYNC; + __ufulock_op_owner_reset (fulock); + } + else + new_val = VFULOCK_WP; + __vfulock_update (vfulock, &ufulock->fulock, new_val); + } + else + fulock->flags |= FULOCK_FL_NEEDS_SYNC; + result = 0; +out_kunmap: + vfulock_kunmap (vfulock); +out: + return result; +} + + +/** + * Unlock an ufulock, waking up waiter(s) + * + * @_vfulock: Address for the fulock (user space). + * @howmany: Number of waiters to wake up [see ufulock_unlock()] + * @returns: 0 if ok, < 0 errno code on error. + * + * USER CONTEXT ONLY + * + * There is a gloomy thing here. We should be just looking for a + * ufulock associated to _vfulock and if not found, assume there are + * no waiters and just set *_vfulock to unlocked. + * + * But by doing that, we can provoke a race condition; if another + * thread B just came in, saw it locked (vfulock is locker's pid), + * went into the kernel, saw it unlocked and acquired it, then it + * would set the vfulock to VFULOCK_WP and the original thread A in + * this moment sets the vfulock to unlocked -- havoc happens. + * + * So we force to do exactly the same thing as lock() do, we locate(); + * this way, we also force the caching of the fulock to be + * refreshed. Then, the setting of the *_vfulock [from inside the + * kernel] is protected by the spinlock. + */ +asmlinkage +int sys_ufulock_unlock (volatile unsigned __user *_vfulock, unsigned flags, + size_t howmany) +{ + struct vlocator *vl; + struct ufulock *ufulock; + int result = -ENOMEM; + unsigned long cpu_flags; + + ftrace ("(%p, 0x%x, %zu)\n", _vfulock, flags, howmany); + + /* Pin page, get key, look up/allocate the ufulock, refcount it */ + result = vl_locate (NULL, &vl, &ufulock_vops, _vfulock, flags); + if (unlikely (result < 0)) + goto out; + ufulock = container_of (vl, struct ufulock, vlocator); + spin_lock_irqsave (&ufulock->fulock.fuqueue.lock, cpu_flags); + result = __ufulock_unlock (ufulock, flags, howmany); + spin_unlock_irqrestore (&ufulock->fulock.fuqueue.lock, cpu_flags); + vl_put (vl); +out: + DEBUG_get_value (result, _vfulock); + return result; +} + + +/** + * (re)Initialize an ufulock. + * + * @ufulock: who to reinitialize. + * @flags: with thiiiis flags. + */ +static +int __ufulock_init (struct ufulock *ufulock, unsigned flags) +{ + volatile unsigned *vfulock; + + ftrace ("(%p, 0x%x)\n", ufulock, flags); + /* Initialization will not fail... 
*/ + __fulock_ctl (&ufulock->fulock, fulock_ctl_init); + ufulock->fulock.flags = (flags & FULOCK_FL_USER_MK) + | FULOCK_FL_NEEDS_SYNC; + vfulock = vfulock_kmap (ufulock); + vfulock_set (vfulock, VFULOCK_UNLOCKED); + vfulock_kunmap (vfulock); + return 0; +} + + +/** + * Run a control command on a fulock. + * + * @_vfulock: Location of the fulock in user space + * @flags: flags for the fulock + * @ctl: Command to run (enum fulock_ctl). + * @returns: >= 0 previous consistency before the change (enum fulock_st). + * < 0 errno code on error. + * + * USER CONTEXT ONLY + */ +asmlinkage +int sys_ufulock_ctl (unsigned __user *_vfulock, unsigned flags, unsigned ctl) +{ + struct vlocator *vl; + struct fulock *fulock; + struct ufulock *ufulock; + int result; + unsigned new_value; + volatile unsigned *vfulock; + + ftrace ("(%p, 0x%x, %u)\n", _vfulock, flags, ctl); + might_sleep(); + + /* Ugly special case numero uno: destruction; can't really + * destroy it (somebody might be using it still), but we can + * disconnect it from the hash until the gc destroys it. */ + if (ctl == fulock_ctl_destroy) { + vl_dispose (&ufulock_vops, _vfulock); + return 0; + } + /* Pin page, get key, look up/allocate the ufulock, refcount it */ + result = vl_locate (NULL, &vl, &ufulock_vops, _vfulock, flags); + if (unlikely (result < 0)) + goto out; + ufulock = container_of (vl, struct ufulock, vlocator); + fulock = &ufulock->fulock; + spin_lock_irq (&fulock->fuqueue.lock); + /* Ugly special case numero two: Initialization */ + if (ctl == fulock_ctl_init) { + result = __ufulock_init (ufulock, flags); + goto out_unlock; + } + /* make sure flags are consistent */ + result = -EINVAL; + if (flags != (fulock->flags & FULOCK_FL_USER_MK)) + goto out_unlock; + /* Ugly special case number three: do we have waiters? */ + if (ctl == fulock_ctl_waiters) { + result = __fulock_empty (fulock)? 0 : 1; + goto out_unlock; + } + /* Ugly special case number four: is it locked? */ + if (ctl == fulock_ctl_locked) { + result = fulock->owner == NULL? 0 : 1; + goto out_unlock; + } + /* Map the user space area */ + vfulock = vfulock_kmap (ufulock); + result = __vfulock_sync (ufulock, vfulock, 0); + if (result < 0) + goto out_kunmap; + /* Set the consistency */ + result = __fulock_ctl (fulock, ctl); + if (result < 0) + goto out_kunmap; + /* Ok, update the vfulock */ + switch (result) { + case fulock_st_healthy: + if (unlikely (fulock->flags & FULOCK_FL_KCO)) + new_value = VFULOCK_HEALTHY; + else if (__fulock_empty (fulock)) { + new_value = current->pid; + fulock->flags |= FULOCK_FL_NEEDS_SYNC; + __ufulock_op_owner_reset (fulock); + } + else + new_value = VFULOCK_WP; + break; + case fulock_st_dead: + new_value = VFULOCK_DEAD; + break; + case fulock_st_nr: + new_value = VFULOCK_NR; + break; + default: + WARN_ON(1); + new_value = 0; /* shut gcc up */ + } + vfulock_set (vfulock, new_value); + __DEBUG_get_value (result, vfulock); + result = 0; +out_kunmap: + vfulock_kunmap (vfulock); +out_unlock: + spin_unlock_irq (&fulock->fuqueue.lock); + vl_put (vl); +out: + return result; +} + + +/** Release as dead @ufulock because the owner is exiting. 
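Stepping back to sys_ufulock_ctl() for a moment: a user-space mutex library would drive it roughly as below. This is a sketch only; the my_mutex_* names and the syscall wrapper are hypothetical, and enum fulock_ctl (fulock_ctl_init, fulock_ctl_destroy, fulock_ctl_waiters, fulock_ctl_locked) comes from the FUSYN headers elsewhere in this patch series, so the snippet is not compilable on its own.

/* Assumed wrapper over sys_ufulock_ctl(); 0/1 or negative errno. */
extern int ufulock_ctl_syscall (unsigned *vfulock, unsigned flags,
                                unsigned ctl);

/* pthread_mutex_init()-style setup: reset the flags and set the word
 * back to VFULOCK_UNLOCKED (see __ufulock_init() above). */
static int my_mutex_init (unsigned *word, unsigned flags)
{
        return ufulock_ctl_syscall (word, flags, fulock_ctl_init);
}

/* pthread_mutex_destroy()-style teardown: unhash the ufulock and let
 * the garbage collector reap it. */
static int my_mutex_destroy (unsigned *word, unsigned flags)
{
        return ufulock_ctl_syscall (word, flags, fulock_ctl_destroy);
}

/* Query: 1 if the fulock has waiters, 0 if not. */
static int my_mutex_has_waiters (unsigned *word, unsigned flags)
{
        return ufulock_ctl_syscall (word, flags, fulock_ctl_waiters);
}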
*/ +void __ufulock_op_exit (struct fulock *fulock) +{ + struct ufulock *ufulock; + volatile unsigned *vfulock; + + ftrace ("(%p)\n", fulock); + + ufulock = container_of (fulock, struct ufulock, fulock); + vfulock = vfulock_kmap (ufulock); + /* Update the vfulock to VFULOCK_DEAD */ + __vfulock_update (vfulock, fulock, VFULOCK_DEAD); + vfulock_kunmap (vfulock); + __fulock_op_exit (fulock); +} + + +/** Cancel @task's wait on the ufulock */ +unsigned __ufulock_op_waiter_cancel (struct fuqueue *fuqueue, + struct fuqueue_waiter *w) +{ + unsigned prio_changed; + struct fulock *fulock = + container_of (fuqueue, struct fulock, fuqueue); + struct ufulock *ufulock = + container_of (fulock, struct ufulock, fulock); + + ftrace ("(%p, %p [%d], %p)\n", fuqueue, w, w->task->pid, w); + + prio_changed = __fulock_op_waiter_cancel (fuqueue, w); + if ((fulock->flags & FULOCK_FL_KCO) == 0 + && __fulock_empty (fulock)) { + /* Re-enable fast unlock */ + volatile unsigned *vfulock; + unsigned value = VFULOCK_UNLOCKED; + if (fulock->owner) { + value = fulock->owner->pid; + ufulock->fulock.flags |= FULOCK_FL_NEEDS_SYNC; + __ufulock_op_owner_reset (fulock); + } + vfulock = vfulock_kmap (ufulock); + __vfulock_update (vfulock, fulock, value); + vfulock_kunmap (vfulock); + } + return prio_changed; +} + + +/* Adaptors for fulock operations */ +static +void ufulock_put (struct fuqueue *fuqueue) +{ + struct fulock *fulock = + container_of (fuqueue, struct fulock, fuqueue); + struct ufulock *ufulock = + container_of (fulock, struct ufulock, fulock); + vl_put (&ufulock->vlocator); +} + +static +void ufulock_get (struct fuqueue *fuqueue) +{ + struct fulock *fulock = + container_of (fuqueue, struct fulock, fuqueue); + struct ufulock *ufulock = + container_of (fulock, struct ufulock, fulock); + vl_get (&ufulock->vlocator); +} + +/** ufulock fulock operations */ +struct fulock_ops ufulock_ops = { + .fuqueue = { + .get = ufulock_get, + .put = ufulock_put, + .waiter_cancel = __ufulock_op_waiter_cancel, + .waiter_chprio = __fulock_op_waiter_chprio + }, + .owner_set = __ufulock_op_owner_set, + .owner_reset = __ufulock_op_owner_reset, + .exit = __ufulock_op_exit, +}; + + +/** + * Initialize the ufulock subsystem. + */ +static +int __init subsys_ufulock_init (void) +{ + ufulock_slab = kmem_cache_create ("ufulock", sizeof (struct ufulock), + 0, 0, ufulock_ctor, NULL); + if (ufulock_slab == NULL) + panic ("ufulock_init(): " + "Unable to initialize ufulock slab allocator.\n"); + return 0; +} +__initcall (subsys_ufulock_init); + + +/** + * Convert a user 'struct timespec *' to jiffies with a twist + * + * @_timespec: pointer to user space 'struct timespec'. + * + * @returns: MAX_SCHEDULE_TIMEOUT + * Wait for ever, if @_timespec is (void *) -1. + * 0 Don't wait at all (if @_timespec is NULL). + * Other: Number of jiffies to wait as specified in + * @_timespec. + * + * I kind of think this is still a little bit twisted -- raise your + * hand if you can think of a better arrangement. + * + * WARNING: might sleep! 
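Concretely, the _timeout argument of sys_ufulock_lock() has three accepted forms; a small user-space illustration (the wrapper name is the same assumption as in the earlier sketches):

#include <time.h>

extern int ufulock_lock_syscall (volatile unsigned *vfulock, unsigned flags,
                                 const struct timespec *timeout); /* assumed */

static int fulock_trylock (volatile unsigned *vf, unsigned flags)
{
        /* NULL: don't wait at all, i.e. a trylock. */
        return ufulock_lock_syscall (vf, flags, NULL);
}

static int fulock_lock_forever (volatile unsigned *vf, unsigned flags)
{
        /* (struct timespec *) ~0: wait for ever. */
        return ufulock_lock_syscall (vf, flags, (const struct timespec *) ~0UL);
}

static int fulock_lock_100ms (volatile unsigned *vf, unsigned flags)
{
        /* Anything else: a relative timeout, converted to jiffies. */
        struct timespec ts = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 };
        return ufulock_lock_syscall (vf, flags, &ts);
}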
+ */
+signed long ufu_timespec2jiffies (const struct timespec __user *_timespec)
+{
+        struct timespec timespec;
+        signed long j;
+
+        might_sleep();
+        if (_timespec == NULL)
+                j = 0;
+        else if (_timespec == (struct timespec *) ~0)
+                j = MAX_SCHEDULE_TIMEOUT;
+        else {
+                if (copy_from_user (&timespec, _timespec, sizeof (timespec)))
+                        return -EFAULT;
+                j = timespec_to_jiffies (&timespec);
+        }
+        return j;
+}
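To close the loop on robustness, here is how the return codes above combine from a user-space caller's point of view. Again a hedged sketch: the wrappers are the assumed ones from the previous snippets, and the patch itself documents owner-death both as -ESRCH (ufulock_lock() comment) and as -EOWNERDEAD (__fulock_id_owner()), so both are handled.

#include <errno.h>
#include <time.h>

extern int ufulock_lock_syscall (volatile unsigned *vfulock, unsigned flags,
                                 const struct timespec *timeout); /* assumed */

static int robust_lock (unsigned *word, unsigned flags)
{
        int r = ufulock_lock_syscall ((volatile unsigned *) word, flags,
                                      (const struct timespec *) ~0UL);
        switch (r) {
        case 0:
                return 0;               /* clean acquisition */
        case -EOWNERDEAD:               /* __fulock_id_owner() convention */
        case -ESRCH:                    /* ufulock_lock() comment convention */
                /* We do own the fulock, but the previous owner died:
                 * the protected data may be inconsistent.  Repair it
                 * here; if it cannot be repaired, the fulock can be
                 * declared not-recoverable through sys_ufulock_ctl(),
                 * after which only a reinitialization
                 * (fulock_ctl_init) brings it back. */
                return 0;
        case -ENOTRECOVERABLE:
        case -EBADR:
                /* Dead for good: destroy/reinit is the only way out. */
                return r;
        default:
                return r;               /* -EINVAL, -EFAULT, ... */
        }
}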