2002-02-12 23:09:35

by bwatson

[permalink] [raw]
Subject: [PATCH] 2.4.18-pre9, trylock for read/write semaphores

Marcelo-

Attached is a patch that adds trylock routines for read/write
semaphores. David Howells saw it last August and thought it was ready
to be submitted then, but I became distracted and haven't taken the
time to submit it until now. My motivation is that Christoph Hellwig
says he needs it for JFS.

I hammered on this patch with a test kernel based on 2.4.18-pre7. The
test kernel put a read/write semaphore under heavy contention with a
mix of down_read(), down_write(), down_read_trylock(), and
down_write_trylock() calls. I made sure no process got a lock when
it shouldn't, and that no deadlocks occurred. I also made sure it
applies cleanly against 2.4.18-pre9. Considering that it does not
affect the rest of the kernel (since no one calls it), I think it's
safe to include in 2.4.18.


Brian Watson | "Now I don't know, but I been told it's
Linux Kernel Developer | hard to run with the weight of gold,
Open SSI Clustering Project | Other hand I heard it said, it's
Compaq Computer Corp | just as hard with the weight of lead."
Los Angeles, CA | -Robert Hunter, 1970

mailto:[email protected]
http://opensource.compaq.com/


diff -Naur linux-2.4.18-pre9/include/asm-i386/rwsem.h trylock-2.4.18-pre9/include/asm-i386/rwsem.h
--- linux-2.4.18-pre9/include/asm-i386/rwsem.h Tue Feb 12 12:14:14 2002
+++ trylock-2.4.18-pre9/include/asm-i386/rwsem.h Tue Feb 12 12:14:55 2002
@@ -121,6 +121,24 @@
}

/*
+ * trylock for reading -- returns 1 if successful, 0 if contention
+ */
+static inline int __down_read_trylock(struct rw_semaphore *sem)
+{
+ signed long old, new;
+
+repeat:
+ old = (volatile signed long)sem->count;
+ if (old < RWSEM_UNLOCKED_VALUE)
+ return 0;
+ new = old + RWSEM_ACTIVE_READ_BIAS;
+ if (cmpxchg(&sem->count, old, new) == old)
+ return 1;
+ else
+ goto repeat;
+}
+
+/*
* lock for writing
*/
static inline void __down_write(struct rw_semaphore *sem)
@@ -148,6 +166,19 @@
: "+d"(tmp), "+m"(sem->count)
: "a"(sem)
: "memory", "cc");
+}
+
+/*
+ * trylock for writing -- returns 1 if successful, 0 if contention
+ */
+static inline int __down_write_trylock(struct rw_semaphore *sem)
+{
+ signed long ret = cmpxchg(&sem->count,
+ RWSEM_UNLOCKED_VALUE,
+ RWSEM_ACTIVE_WRITE_BIAS);
+ if (ret == RWSEM_UNLOCKED_VALUE)
+ return 1;
+ return 0;
}

/*
diff -Naur linux-2.4.18-pre9/include/linux/rwsem-spinlock.h trylock-2.4.18-pre9/include/linux/rwsem-spinlock.h
--- linux-2.4.18-pre9/include/linux/rwsem-spinlock.h Thu Nov 22 11:46:19 2001
+++ trylock-2.4.18-pre9/include/linux/rwsem-spinlock.h Tue Feb 12 12:14:55 2002
@@ -54,7 +54,9 @@

extern void FASTCALL(init_rwsem(struct rw_semaphore *sem));
extern void FASTCALL(__down_read(struct rw_semaphore *sem));
+extern int FASTCALL(__down_read_trylock(struct rw_semaphore *sem));
extern void FASTCALL(__down_write(struct rw_semaphore *sem));
+extern int FASTCALL(__down_write_trylock(struct rw_semaphore *sem));
extern void FASTCALL(__up_read(struct rw_semaphore *sem));
extern void FASTCALL(__up_write(struct rw_semaphore *sem));

diff -Naur linux-2.4.18-pre9/include/linux/rwsem.h trylock-2.4.18-pre9/include/linux/rwsem.h
--- linux-2.4.18-pre9/include/linux/rwsem.h Thu Nov 22 11:46:19 2001
+++ trylock-2.4.18-pre9/include/linux/rwsem.h Tue Feb 12 12:14:55 2002
@@ -46,6 +46,18 @@
}

/*
+ * trylock for reading -- returns 1 if successful, 0 if contention
+ */
+static inline int down_read_trylock(struct rw_semaphore *sem)
+{
+ int ret;
+ rwsemtrace(sem,"Entering down_read_trylock");
+ ret = __down_read_trylock(sem);
+ rwsemtrace(sem,"Leaving down_read_trylock");
+ return ret;
+}
+
+/*
* lock for writing
*/
static inline void down_write(struct rw_semaphore *sem)
@@ -53,6 +65,18 @@
rwsemtrace(sem,"Entering down_write");
__down_write(sem);
rwsemtrace(sem,"Leaving down_write");
+}
+
+/*
+ * trylock for writing -- returns 1 if successful, 0 if contention
+ */
+static inline int down_write_trylock(struct rw_semaphore *sem)
+{
+ int ret;
+ rwsemtrace(sem,"Entering down_write_trylock");
+ ret = __down_write_trylock(sem);
+ rwsemtrace(sem,"Leaving down_write_trylock");
+ return ret;
}

/*
diff -Naur linux-2.4.18-pre9/lib/rwsem-spinlock.c trylock-2.4.18-pre9/lib/rwsem-spinlock.c
--- linux-2.4.18-pre9/lib/rwsem-spinlock.c Wed Apr 25 13:31:03 2001
+++ trylock-2.4.18-pre9/lib/rwsem-spinlock.c Tue Feb 12 12:14:55 2002
@@ -149,6 +149,28 @@
}

/*
+ * trylock for reading -- returns 1 if successful, 0 if contention
+ */
+int __down_read_trylock(struct rw_semaphore *sem)
+{
+ int ret = 0;
+ rwsemtrace(sem,"Entering __down_read_trylock");
+
+ spin_lock(&sem->wait_lock);
+
+ if (sem->activity>=0 && list_empty(&sem->wait_list)) {
+ /* granted */
+ sem->activity++;
+ ret = 1;
+ }
+
+ spin_unlock(&sem->wait_lock);
+
+ rwsemtrace(sem,"Leaving __down_read_trylock");
+ return ret;
+}
+
+/*
* get a write lock on the semaphore
* - note that we increment the waiting count anyway to indicate an exclusive lock
*/
@@ -192,6 +214,28 @@

out:
rwsemtrace(sem,"Leaving __down_write");
+}
+
+/*
+ * trylock for writing -- returns 1 if successful, 0 if contention
+ */
+int __down_write_trylock(struct rw_semaphore *sem)
+{
+ int ret = 0;
+ rwsemtrace(sem,"Entering __down_write_trylock");
+
+ spin_lock(&sem->wait_lock);
+
+ if (sem->activity==0 && list_empty(&sem->wait_list)) {
+ /* granted */
+ sem->activity = -1;
+ ret = 1;
+ }
+
+ spin_unlock(&sem->wait_lock);
+
+ rwsemtrace(sem,"Leaving __down_write_trylock");
+ return ret;
}

/*


2002-02-12 23:19:56

by Alan

[permalink] [raw]
Subject: Re: [PATCH] 2.4.18-pre9, trylock for read/write semaphores

> + new = old + RWSEM_ACTIVE_READ_BIAS;
> + if (cmpxchg(&sem->count, old, new) == old)
> + return 1;

cmpxchg isnt present on i386

2002-02-13 02:07:32

by Brian J. Watson

[permalink] [raw]
Subject: Re: [PATCH] 2.4.18-pre9, trylock for read/write semaphores

Alan Cox wrote:
>
> > + new = old + RWSEM_ACTIVE_READ_BIAS;
> > + if (cmpxchg(&sem->count, old, new) == old)
> > + return 1;
>
> cmpxchg isnt present on i386


According to arch/i386/config.in, the generic spinlock version would be
used for vintage 386 chips. I think that's due to similar concerns about
the xadd instruction.

--
Brian Watson | "Now I don't know, but I been told it's
Linux Kernel Developer | hard to run with the weight of gold,
Open SSI Clustering Project | Other hand I heard it said, it's
Compaq Computer Corp | just as hard with the weight of lead."
Los Angeles, CA | -Robert Hunter, 1970

mailto:[email protected]
http://opensource.compaq.com/

2002-02-13 07:48:06

by David Howells

[permalink] [raw]
Subject: Re: [PATCH] 2.4.18-pre9, trylock for read/write semaphores


> > + new = old + RWSEM_ACTIVE_READ_BIAS;
> > + if (cmpxchg(&sem->count, old, new) == old)
> > + return 1;
>
> cmpxchg isnt present on i386

This isn't actually relevant... <asm/rwsem.h> isn't actually used unless
CONFIG_RWSEM_GENERIC_SPINLOCK is turned off, which it isn't if the CPU type is
configured to "i386":

[arch/i386/config.in]
if [ "$CONFIG_M386" = "y" ]; then
define_bool CONFIG_X86_CMPXCHG n
define_bool CONFIG_X86_XADD n
define_int CONFIG_X86_L1_CACHE_SHIFT 4
define_bool CONFIG_RWSEM_GENERIC_SPINLOCK y
define_bool CONFIG_RWSEM_XCHGADD_ALGORITHM n
else
define_bool CONFIG_X86_WP_WORKS_OK y
define_bool CONFIG_X86_INVLPG y
define_bool CONFIG_X86_CMPXCHG y
define_bool CONFIG_X86_XADD y
define_bool CONFIG_X86_BSWAP y
define_bool CONFIG_X86_POPAD_OK y
define_bool CONFIG_RWSEM_GENERIC_SPINLOCK n
define_bool CONFIG_RWSEM_XCHGADD_ALGORITHM y
fi

[include/linux/rwsem.h]
#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
#include <linux/rwsem-spinlock.h> /* use a generic implementation */
#else
#include <asm/rwsem.h> /* use an arch-specific implementation */
#endif

David

2002-02-13 08:20:15

by David Howells

[permalink] [raw]
Subject: Re: [PATCH] 2.4.18-pre9, trylock for read/write semaphores


Hi Brian,

Having examined your code again, I've got some comments on it:

I think the following would be more elegant:

static inline int __down_read_trylock(struct rw_semaphore *sem)
{
signed long old, new;

do {
old = (volatile signed long)sem->count;
if (old < RWSEM_UNLOCKED_VALUE)
return 0;
new = old + RWSEM_ACTIVE_READ_BIAS;
} while (cmpxchg(&sem->count, old, new) != old);

return 1;
}

I'm also not sure that the cast has any effect in the following excerpt from
the above:

old = (volatile signed long)sem->count;

What you may actually want is:

static inline int __down_read_trylock(struct rw_semaphore *sem)
{
volatile signed long *pcount;
signed long old, new;

pcount = &sem->count;
do {
old = *pcount;
if (old < RWSEM_UNLOCKED_VALUE)
return 0;
new = old + RWSEM_ACTIVE_READ_BIAS;
} while (cmpxchg(pcount, old, new) != old);

return 1;
}

Or maybe even:

static inline int __down_read_trylock(struct rw_semaphore *sem)
{
__s32 result, tmp;
__asm__ __volatile__(
"# beginning __down_read_trylock\n\t"
" movl %0,%1\n\t"
"1:\n\t"
" movl %1,%2\n\t"
" addl %3,%2\n\t"
" jle 2f\n\t"
LOCK_PREFIX " cmpxchgl %2,%0\n\t"
" jnz 1b\n\t"
"# ending __down_read_trylock\n\t"
: "+m"(sem->count), "=&a"(result), "+&r"(tmp)
: "i"(RWSEM_ACTIVE_READ_BIAS)
: "memory", "cc");
return result>=0 ? 1 : 0;
}

Using this inline assembly has three advantages over mixing lots of C into it:

(1) If the sign/zero flags are set as the result of calculating the new
value, then the old value was negative, and so the trylock has failed.

(2) CMPXCHG sets the zero flag on success and clears it on failure (something
that C can't test with the current inline asm setup).

(3) CMPXCHG fetches the current value of sem->count into EAX if it fails to
swap (so we don't need to get it again - hence why "1:" is where it is).

Point (3) could be incorporated into the C version. It may also be desirable
to include a rep;nop in whichever version is used, but I don't think it's
going to spin enough for that.


David

2002-02-14 00:36:58

by Brian J. Watson

[permalink] [raw]
Subject: Re: [PATCH] 2.4.18-pre9, trylock for read/write semaphores

David Howells wrote:
> I think the following would be more elegant:
>
> [snip]

I agree.


> I'm also not sure that the cast has any effect in the following excerpt from
> the above:
>
> old = (volatile signed long)sem->count;
>

You're right. I looked at the generated assembly, and the volatile cast
makes no difference.


> What you may actually want is:
> [snip]

Although you're right that a volatile pointer is the proper way to do
it, it turns out that a volatile declaration isn't necessary at all. The
cmpxchg() function is a memory barrier that forces the count to be
refetched the next time through the loop.


> Using this inline assembly has three advantages over mixing lots of C into it:
> [snip]

I'm not much of an assembly programmer, so implementing it this way
never crossed my mind. It looks much more efficient than the code
generated from the C version. A drawback is that it is not as easy to
port to other architectures, particularly those that already have a
cmpxchg() function.

It's up to you whether you prefer C or assembly. Let me know, and I'll
test that version and regenerate the patch.

Brian

2002-02-21 18:04:57

by Kendrick M. Smith

[permalink] [raw]
Subject: Re: [PATCH] 2.4.18-pre9, trylock for read/write semaphores


On Tue Feb 12 2002, 17:45:00 EST, Brian Watson wrote:

> Marcelo-
>
> Attached is a patch that adds trylock routines for read/write
> semaphores. David Howells saw it last August and thought it was ready
> to be submitted then, but I became distracted and haven't taken the
> time to submit it until now. My motivation is that Christoph Hellwig
> says he needs it for JFS.

I just returned from vacation and saw this thread. I also need trylock()
routines for read-write semaphores for NFS version 4, but you're way ahead
of me: I hadn't even started to implement them yet, and have been working
around the deficiency. So I would really like to see some variant of this
patch go into the 2.5.x series eventually. Anything I can do to help out?

Cheers,
Kendrick Smith
Center for Information Technology and Integration, University of Michigan

2002-02-21 22:21:00

by Brian J. Watson

[permalink] [raw]
Subject: Re: [PATCH] 2.4.18-pre9, trylock for read/write semaphores

"Kendrick M. Smith" wrote:
> I just returned from vacation and saw this thread. I also need trylock()
> routines for read-write semaphores for NFS version 4, but you're way ahead
> of me: I hadn't even started to implement them yet, and have been working
> around the deficiency. So I would really like to see some variant of this
> patch go into the 2.5.x series eventually. Anything I can do to help out?


Can you test it on 2.5? It applies cleanly and builds with 2.5.3, but I
was having trouble booting the 2.5 kernel on RedHat 7.2. Not needing to
work on 2.5 just yet, I gave up rather quickly and just tested on 2.4.

Anyway, I can send you my test patch and a brief description of what I
looked for to make sure it was working on 2.4.

--
Brian Watson | "Now I don't know, but I been told it's
Linux Kernel Developer | hard to run with the weight of gold,
Open SSI Clustering Project | Other hand I heard it said, it's
Compaq Computer Corp | just as hard with the weight of lead."
Los Angeles, CA | -Robert Hunter, 1970

mailto:[email protected]
http://opensource.compaq.com/

2002-02-21 23:03:05

by Kendrick M. Smith

[permalink] [raw]
Subject: Re: [PATCH] 2.4.18-pre9, trylock for read/write semaphores


On Thu, 21 Feb 2002, Brian J. Watson wrote:

> "Kendrick M. Smith" wrote:
> > I just returned from vacation and saw this thread. I also need trylock()
> > routines for read-write semaphores for NFS version 4, but you're way ahead
> > of me: I hadn't even started to implement them yet, and have been working
> > around the deficiency. So I would really like to see some variant of this
> > patch go into the 2.5.x series eventually. Anything I can do to help out?
>
>
> Can you test it on 2.5? It applies cleanly and builds with 2.5.3, but I
> was having trouble booting the 2.5 kernel on RedHat 7.2. Not needing to
> work on 2.5 just yet, I gave up rather quickly and just tested on 2.4.
>
> Anyway, I can send you my test patch and a brief description of what I
> looked for to make sure it was working on 2.4.

I have the patch from your original post and will give it a try with
2.5. In that post, you also mentioned having some sort of testsuite
which would place the semaphore under heavy contention, while also
testing basic semantics of the semaphore. If you send this along, I
will give it a try as well...

Cheers,
Kendrick

>
> --
> Brian Watson | "Now I don't know, but I been told it's
> Linux Kernel Developer | hard to run with the weight of gold,
> Open SSI Clustering Project | Other hand I heard it said, it's
> Compaq Computer Corp | just as hard with the weight of lead."
> Los Angeles, CA | -Robert Hunter, 1970
>
> mailto:[email protected]
> http://opensource.compaq.com/
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>

2002-02-21 23:53:38

by Brian J. Watson

[permalink] [raw]
Subject: Re: [PATCH] 2.4.18-pre9, trylock for read/write semaphores

diff -Naur rwsem/include/asm-i386/rwsem.h rwsem-test/include/asm-i386/rwsem.h
--- rwsem/include/asm-i386/rwsem.h Mon Feb 18 18:46:18 2002
+++ rwsem-test/include/asm-i386/rwsem.h Mon Feb 18 18:18:02 2002
@@ -120,9 +120,30 @@
: "memory", "cc");
}

+void rst_debug_print(signed long ctr);
+
/*
* trylock for reading -- returns 1 if successful, 0 if contention
*/
+#if 0
+static inline int __down_read_trylock(struct rw_semaphore *sem)
+{
+ signed long old, new, result;
+
+repeat:
+ old = (volatile signed long)sem->count;
+ if (old < RWSEM_UNLOCKED_VALUE)
+ return 0;
+ new = old + RWSEM_ACTIVE_READ_BIAS;
+ result = cmpxchg(&sem->count, old, new);
+ rst_debug_print(result);
+ if (result == old)
+ return 1;
+ else
+ goto repeat;
+}
+#endif
+
static inline int __down_read_trylock(struct rw_semaphore *sem)
{
__s32 result, tmp;
@@ -140,6 +161,7 @@
: "+m"(sem->count), "=&a"(result), "+&r"(tmp)
: "i"(RWSEM_ACTIVE_READ_BIAS)
: "memory", "cc");
+ rst_debug_print(result);
return result>=0 ? 1 : 0;
}

@@ -181,6 +203,7 @@
signed long ret = cmpxchg(&sem->count,
RWSEM_UNLOCKED_VALUE,
RWSEM_ACTIVE_WRITE_BIAS);
+ rst_debug_print(ret);
if (ret == RWSEM_UNLOCKED_VALUE)
return 1;
return 0;
diff -Naur rwsem/include/linux/rwsem.h rwsem-test/include/linux/rwsem.h
--- rwsem/include/linux/rwsem.h Mon Feb 18 18:44:36 2002
+++ rwsem-test/include/linux/rwsem.h Mon Feb 18 18:18:07 2002
@@ -9,7 +9,7 @@

#include <linux/linkage.h>

-#define RWSEM_DEBUG 0
+#define RWSEM_DEBUG 1

#ifdef __KERNEL__

diff -Naur rwsem/include/rwsem_test.h rwsem-test/include/rwsem_test.h
--- rwsem/include/rwsem_test.h Wed Dec 31 16:00:00 1969
+++ rwsem-test/include/rwsem_test.h Mon Feb 18 15:36:44 2002
@@ -0,0 +1,100 @@
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/random.h>
+#include <linux/rwsem.h>
+#include <linux/spinlock.h>
+
+DECLARE_RWSEM(rst_lock);
+
+spinlock_t rst_ctrlock = SPIN_LOCK_UNLOCKED;
+unsigned int rst_ctr = 1000;
+unsigned int getctr(void)
+{
+ unsigned long ret;
+ spin_lock(&rst_ctrlock);
+ ret = ++rst_ctr;
+ spin_unlock(&rst_ctrlock);
+ return ret;
+}
+
+void rst_debug_print(signed long ctr)
+{
+ printk("[rst %d:%d] rwsem counter: %x\n", current->pid, getctr(), (unsigned int)ctr);
+}
+
+int
+rst_daemon(void *ignore)
+{
+ while (1) {
+ unsigned char sleep, style, ctr = 0;
+
+ get_random_bytes(&sleep, 1);
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout((signed long)sleep);
+
+ for (; ctr < 20; ++ctr) {
+ get_random_bytes(&style, 1);
+
+ if (style > 191) {
+ down_write(&rst_lock);
+ printk("[rst %d:%d] down_write\n", current->pid, getctr());
+ } else if (style > 127) {
+ if (!down_write_trylock(&rst_lock)) {
+ printk("[rst %d:%d] down_write_trylock "
+ "failed\n",
+ current->pid, getctr());
+ break;
+ }
+ printk("[rst %d:%d] down_write_trylock "
+ "succeeded\n", current->pid, getctr());
+ } else if (style > 63) {
+ down_read(&rst_lock);
+ printk("[rst %d:%d] down_read\n", current->pid, getctr());
+ } else {
+ if (!down_read_trylock(&rst_lock)) {
+ printk("[rst %d:%d] down_read_trylock "
+ "failed\n",
+ current->pid, getctr());
+ break;
+ }
+ printk("[rst %d:%d] down_read_trylock "
+ "succeeded\n", current->pid, getctr());
+ }
+
+ if (style > 127)
+ printk("[rst %d:%d] writing... [1]\n",
+ current->pid, getctr());
+ else
+ printk("[rst %d:%d] reading... [1]\n",
+ current->pid, getctr());
+
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(5);
+
+ if (style > 127)
+ printk("[rst %d:%d] writing... [2]\n",
+ current->pid, getctr());
+ else
+ printk("[rst %d:%d] reading... [2]\n",
+ current->pid, getctr());
+
+ if (style > 127) {
+ printk("[rst %d:%d] up_write\n", current->pid, getctr());
+ up_write(&rst_lock);
+ } else {
+ printk("[rst %d:%d] up_read\n", current->pid, getctr());
+ up_read(&rst_lock);
+ }
+ }
+ }
+ return 0;
+}
+
+void
+rst_init(void)
+{
+ int ctr = 0;
+ rst_lock.debug = 1;
+ for (; ctr < 10; ++ctr)
+ kernel_thread(rst_daemon, NULL, 0);
+}
diff -Naur rwsem/init/main.c rwsem-test/init/main.c
--- rwsem/init/main.c Mon Feb 18 18:45:30 2002
+++ rwsem-test/init/main.c Mon Feb 18 14:11:23 2002
@@ -31,6 +31,8 @@
#include <asm/io.h>
#include <asm/bugs.h>

+#include <rwsem_test.h>
+
#if defined(CONFIG_ARCH_S390)
#include <asm/s390mach.h>
#include <asm/ccwcache.h>
@@ -829,6 +831,8 @@
* The Bourne shell can be used instead of init if we are
* trying to recover a really broken machine.
*/
+
+ rst_init();

if (execute_command)
execve(execute_command,argv_init,envp_init);


Attachments:
patch-test-2.4.18-rc1 (4.60 kB)