I've attached two slightly different bits of i386 assembly that achieve the
same end in slightly different ways. Can someone tell me why Case 1 is
faster than Case 2, even though Case 1 involves an extra CALL instruction?
* Case 1 has a little wrapper function that saves ECX and EDX before
calling rwsem_wake().
* Case 2 merges the contents of the wrapper with the caller.
Case 1 is what's generated by the rw-semaphore inline assembly code as of
2.4.4-pre5. Case 2 looks like it ought to be a faster version of the same
thing.
David
###############################################################################
#
# CASE 1: registers saved in the rwsem_wake register saving stub
#
.text
	.align	16

#
# void test_up_read(struct rw_semaphore *sem)
# {
#	up_read(sem);
# }
#
# Syntax: GNU as, AT&T operand order, i386.
# In:     4(%esp) = sem (first stack argument)
# Uses:   %eax = sem, %edx = value fetched from *sem by xadd, flags
#
# Fast path: add -1 to the 32-bit word at (sem) and return straight away
# unless the sum is negative.  A negative sum diverts to the out-of-line
# handler test_up_read_contention, which comes back in through
# test_up_read_done.
#
# NOTE(review): the xadd carries no lock prefix, so the read-modify-write
# is not atomic with respect to other CPUs -- presumably acceptable for
# this timing test; confirm before reusing the sequence elsewhere.
#
	.globl	test_up_read
	.type	test_up_read,@function
test_up_read:
	movl	4(%esp), %eax			# eax = sem
	movl	$-1, %edx			# addend
	xaddl	%edx, (%eax)			# *sem += -1; edx = previous *sem
	js	test_up_read_contention		# sum went negative: slow path
test_up_read_done:
	ret
#
# __rwsem_wake: register-saving stub wrapped around rwsem_wake
#
# Pushes %edx and %ecx, calls rwsem_wake, then pops them back in reverse
# order, so both registers hold their entry values again when the stub
# returns.  The stub itself never touches %eax; whatever the caller left
# there is what rwsem_wake sees, and whatever rwsem_wake leaves there is
# what the caller gets back.
#
# Stack: two 4-byte register slots on top of the caller's return address.
#
	.globl	__rwsem_wake
__rwsem_wake:
	pushl	%edx
	pushl	%ecx
	call	rwsem_wake			# defined outside this listing
	popl	%ecx				# reverse of the push order
	popl	%edx
	ret
#
# Out-of-line slow path for up_read, reached from the js in test_up_read
#
# On entry: %eax = sem, %edx = value *sem held before the xadd.
# Leaves by jumping back to test_up_read_done (the ret in test_up_read).
#
# Lives in its own section (.text.lock) so that it is kept out of the
# straight-line .text path.
#
	.section .text.lock,"ax"
test_up_read_contention:
	decl	%edx				# edx = old value - 1, i.e. the sum xadd stored
	testl	$0xffff, %edx			# look at the low 16 bits only
	jnz	test_up_read_done		# low half non-zero: no wake-up call
						# (presumably still-active holders -- confirm)
	call	__rwsem_wake			# stub saves/restores edx and ecx itself
	jmp	test_up_read_done
###############################################################################
#
# CASE 2: registers saved in the contention handler stub
#
.text
	.align	16

#
# void test_up_read(struct rw_semaphore *sem)
# {
#	up_read(sem);
# }
#
# Syntax: GNU as, AT&T operand order, i386.
# In:     4(%esp) = sem (first stack argument)
# Uses:   %eax = sem, %edx = value fetched from *sem by xadd, flags
#
# Fast path: add -1 to the 32-bit word at (sem); if the sum is not
# negative, return immediately.  Otherwise branch to the out-of-line
# handler test_up_read_contention, which rejoins at test_up_read_done.
#
# NOTE(review): the xadd carries no lock prefix, so the read-modify-write
# is not atomic with respect to other CPUs -- presumably acceptable for
# this timing test; confirm before reusing the sequence elsewhere.
#
	.globl	test_up_read
	.type	test_up_read,@function
test_up_read:
	movl	4(%esp), %eax			# eax = sem
	movl	$-1, %edx			# addend
	xaddl	%edx, (%eax)			# *sem += -1; edx = previous *sem
	js	test_up_read_contention		# sum went negative: slow path
test_up_read_done:
	ret
#
# Contention handler stub for up_read, with the register saving inlined
#
# On entry: %eax = sem, %edx = value *sem held before the xadd.
# Leaves by jumping back to test_up_read_done (the ret in test_up_read).
#
# This variant saves and restores %edx/%ecx itself, so it calls rwsem_wake
# directly.  Going through the __rwsem_wake wrapper from here would push
# and pop both registers a second time and add back the CALL/RET pair
# this variant is meant to remove.
#
# Stack: two 4-byte register slots pushed around the call.
#
	.section .text.lock,"ax"
test_up_read_contention:
	decl	%edx				# edx = old value - 1, i.e. the sum xadd stored
	testl	$0xffff, %edx			# look at the low 16 bits only
	jnz	test_up_read_done		# low half non-zero: no wake-up call
						# (presumably still-active holders -- confirm)
	pushl	%edx				# keep edx/ecx intact across the call
	pushl	%ecx
	call	rwsem_wake			# direct call; eax still holds sem
	popl	%ecx				# reverse of the push order
	popl	%edx
	jmp	test_up_read_done