When r3 is not modified, reload it from regs->orig_gpr3 to free
volatile registers. This avoids a stack frame for the likely part
of system_call_exception().
Before : 353 cycles on null_syscall
After : 347 cycles on null_syscall
Before the patch:
c000b4d4 <system_call_exception>:
c000b4d4: 7c 08 02 a6 mflr r0
c000b4d8: 94 21 ff e0 stwu r1,-32(r1)
c000b4dc: 93 e1 00 1c stw r31,28(r1)
c000b4e0: 90 01 00 24 stw r0,36(r1)
c000b4e4: 90 6a 00 88 stw r3,136(r10)
c000b4e8: 81 6a 00 84 lwz r11,132(r10)
c000b4ec: 69 6b 00 02 xori r11,r11,2
c000b4f0: 55 6b ff fe rlwinm r11,r11,31,31,31
c000b4f4: 0f 0b 00 00 twnei r11,0
c000b4f8: 81 6a 00 a0 lwz r11,160(r10)
c000b4fc: 55 6b 07 fe clrlwi r11,r11,31
c000b500: 0f 0b 00 00 twnei r11,0
c000b504: 7c 0c 42 e6 mftb r0
c000b508: 83 e2 00 08 lwz r31,8(r2)
c000b50c: 81 82 00 28 lwz r12,40(r2)
c000b510: 90 02 00 24 stw r0,36(r2)
c000b514: 7d 8c f8 50 subf r12,r12,r31
c000b518: 7c 0c 02 14 add r0,r12,r0
c000b51c: 90 02 00 08 stw r0,8(r2)
c000b520: 7c 10 13 a6 mtspr 80,r0
c000b524: 81 62 00 70 lwz r11,112(r2)
c000b528: 71 60 86 91 andi. r0,r11,34449
c000b52c: 40 82 00 34 bne c000b560 <system_call_exception+0x8c>
c000b530: 2b 89 01 b6 cmplwi cr7,r9,438
c000b534: 41 9d 00 64 bgt cr7,c000b598 <system_call_exception+0xc4>
c000b538: 3d 40 c0 5c lis r10,-16292
c000b53c: 55 29 10 3a rlwinm r9,r9,2,0,29
c000b540: 39 4a 41 e8 addi r10,r10,16872
c000b544: 80 01 00 24 lwz r0,36(r1)
c000b548: 7d 2a 48 2e lwzx r9,r10,r9
c000b54c: 7c 08 03 a6 mtlr r0
c000b550: 7d 29 03 a6 mtctr r9
c000b554: 83 e1 00 1c lwz r31,28(r1)
c000b558: 38 21 00 20 addi r1,r1,32
c000b55c: 4e 80 04 20 bctr
After the patch:
c000b4d4 <system_call_exception>:
c000b4d4: 81 6a 00 84 lwz r11,132(r10)
c000b4d8: 90 6a 00 88 stw r3,136(r10)
c000b4dc: 69 6b 00 02 xori r11,r11,2
c000b4e0: 55 6b ff fe rlwinm r11,r11,31,31,31
c000b4e4: 0f 0b 00 00 twnei r11,0
c000b4e8: 80 6a 00 a0 lwz r3,160(r10)
c000b4ec: 54 63 07 fe clrlwi r3,r3,31
c000b4f0: 0f 03 00 00 twnei r3,0
c000b4f4: 7d 6c 42 e6 mftb r11
c000b4f8: 81 82 00 08 lwz r12,8(r2)
c000b4fc: 80 02 00 28 lwz r0,40(r2)
c000b500: 91 62 00 24 stw r11,36(r2)
c000b504: 7c 00 60 50 subf r0,r0,r12
c000b508: 7d 60 5a 14 add r11,r0,r11
c000b50c: 91 62 00 08 stw r11,8(r2)
c000b510: 7c 10 13 a6 mtspr 80,r0
c000b514: 80 62 00 70 lwz r3,112(r2)
c000b518: 70 6b 86 91 andi. r11,r3,34449
c000b51c: 40 82 00 28 bne c000b544 <system_call_exception+0x70>
c000b520: 2b 89 01 b6 cmplwi cr7,r9,438
c000b524: 41 9d 00 84 bgt cr7,c000b5a8 <system_call_exception+0xd4>
c000b528: 80 6a 00 88 lwz r3,136(r10)
c000b52c: 3d 40 c0 5c lis r10,-16292
c000b530: 55 29 10 3a rlwinm r9,r9,2,0,29
c000b534: 39 4a 41 e4 addi r10,r10,16868
c000b538: 7d 2a 48 2e lwzx r9,r10,r9
c000b53c: 7d 29 03 a6 mtctr r9
c000b540: 4e 80 04 20 bctr
Signed-off-by: Christophe Leroy <[email protected]>
---
arch/powerpc/kernel/syscall.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
index 69d75fc4a5eb..630c423e089a 100644
--- a/arch/powerpc/kernel/syscall.c
+++ b/arch/powerpc/kernel/syscall.c
@@ -91,6 +91,8 @@ notrace long system_call_exception(long r3, long r4, long r5,
} else if (unlikely(r0 >= NR_syscalls)) {
return -ENOSYS;
+ } else {
+ r3 = regs->orig_gpr3;
}
/* May be faster to do array_index_nospec? */
--
2.25.0
Christophe Leroy's on April 6, 2020 3:44 am:
> When r3 is not modified, reload it from regs->orig_r3 to free
> volatile registers. This avoids a stack frame for the likely part
> of syscall_call_exception()
>
> Before : 353 cycles on null_syscall
> After : 347 cycles on null_syscall
>
> Before the patch:
>
> c000b4d4 <system_call_exception>:
> c000b4d4: 7c 08 02 a6 mflr r0
> c000b4d8: 94 21 ff e0 stwu r1,-32(r1)
> c000b4dc: 93 e1 00 1c stw r31,28(r1)
> c000b4e0: 90 01 00 24 stw r0,36(r1)
> c000b4e4: 90 6a 00 88 stw r3,136(r10)
> c000b4e8: 81 6a 00 84 lwz r11,132(r10)
> c000b4ec: 69 6b 00 02 xori r11,r11,2
> c000b4f0: 55 6b ff fe rlwinm r11,r11,31,31,31
> c000b4f4: 0f 0b 00 00 twnei r11,0
> c000b4f8: 81 6a 00 a0 lwz r11,160(r10)
> c000b4fc: 55 6b 07 fe clrlwi r11,r11,31
> c000b500: 0f 0b 00 00 twnei r11,0
> c000b504: 7c 0c 42 e6 mftb r0
> c000b508: 83 e2 00 08 lwz r31,8(r2)
> c000b50c: 81 82 00 28 lwz r12,40(r2)
> c000b510: 90 02 00 24 stw r0,36(r2)
> c000b514: 7d 8c f8 50 subf r12,r12,r31
> c000b518: 7c 0c 02 14 add r0,r12,r0
> c000b51c: 90 02 00 08 stw r0,8(r2)
> c000b520: 7c 10 13 a6 mtspr 80,r0
> c000b524: 81 62 00 70 lwz r11,112(r2)
> c000b528: 71 60 86 91 andi. r0,r11,34449
> c000b52c: 40 82 00 34 bne c000b560 <system_call_exception+0x8c>
> c000b530: 2b 89 01 b6 cmplwi cr7,r9,438
> c000b534: 41 9d 00 64 bgt cr7,c000b598 <system_call_exception+0xc4>
> c000b538: 3d 40 c0 5c lis r10,-16292
> c000b53c: 55 29 10 3a rlwinm r9,r9,2,0,29
> c000b540: 39 4a 41 e8 addi r10,r10,16872
> c000b544: 80 01 00 24 lwz r0,36(r1)
> c000b548: 7d 2a 48 2e lwzx r9,r10,r9
> c000b54c: 7c 08 03 a6 mtlr r0
> c000b550: 7d 29 03 a6 mtctr r9
> c000b554: 83 e1 00 1c lwz r31,28(r1)
> c000b558: 38 21 00 20 addi r1,r1,32
> c000b55c: 4e 80 04 20 bctr
>
> After the patch:
>
> c000b4d4 <system_call_exception>:
> c000b4d4: 81 6a 00 84 lwz r11,132(r10)
> c000b4d8: 90 6a 00 88 stw r3,136(r10)
> c000b4dc: 69 6b 00 02 xori r11,r11,2
> c000b4e0: 55 6b ff fe rlwinm r11,r11,31,31,31
> c000b4e4: 0f 0b 00 00 twnei r11,0
> c000b4e8: 80 6a 00 a0 lwz r3,160(r10)
> c000b4ec: 54 63 07 fe clrlwi r3,r3,31
> c000b4f0: 0f 03 00 00 twnei r3,0
> c000b4f4: 7d 6c 42 e6 mftb r11
> c000b4f8: 81 82 00 08 lwz r12,8(r2)
> c000b4fc: 80 02 00 28 lwz r0,40(r2)
> c000b500: 91 62 00 24 stw r11,36(r2)
> c000b504: 7c 00 60 50 subf r0,r0,r12
> c000b508: 7d 60 5a 14 add r11,r0,r11
> c000b50c: 91 62 00 08 stw r11,8(r2)
> c000b510: 7c 10 13 a6 mtspr 80,r0
> c000b514: 80 62 00 70 lwz r3,112(r2)
> c000b518: 70 6b 86 91 andi. r11,r3,34449
> c000b51c: 40 82 00 28 bne c000b544 <system_call_exception+0x70>
> c000b520: 2b 89 01 b6 cmplwi cr7,r9,438
> c000b524: 41 9d 00 84 bgt cr7,c000b5a8 <system_call_exception+0xd4>
> c000b528: 80 6a 00 88 lwz r3,136(r10)
> c000b52c: 3d 40 c0 5c lis r10,-16292
> c000b530: 55 29 10 3a rlwinm r9,r9,2,0,29
> c000b534: 39 4a 41 e4 addi r10,r10,16868
> c000b538: 7d 2a 48 2e lwzx r9,r10,r9
> c000b53c: 7d 29 03 a6 mtctr r9
> c000b540: 4e 80 04 20 bctr
>
> Signed-off-by: Christophe Leroy <[email protected]>
> ---
> arch/powerpc/kernel/syscall.c | 2 ++
> 1 file changed, 2 insertions(+)
>
> diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
> index 69d75fc4a5eb..630c423e089a 100644
> --- a/arch/powerpc/kernel/syscall.c
> +++ b/arch/powerpc/kernel/syscall.c
> @@ -91,6 +91,8 @@ notrace long system_call_exception(long r3, long r4, long r5,
>
> } else if (unlikely(r0 >= NR_syscalls)) {
> return -ENOSYS;
> + } else {
> + r3 = regs->orig_gpr3;
> }
So this just frees up enough volatiles to avoid spilling to the stack? I
wonder whether various other options here would cause a spill anyway.
Interesting optimisation, it would definitely need a comment. Would be
nice if we had a way to tell the compiler that a local can be reloaded
from a particular address.
Thanks,
Nick
Le 06/04/2020 à 03:29, Nicholas Piggin a écrit :
> Christophe Leroy's on April 6, 2020 3:44 am:
>> When r3 is not modified, reload it from regs->orig_r3 to free
>> volatile registers. This avoids a stack frame for the likely part
>> of syscall_call_exception()
>>
>> Before : 353 cycles on null_syscall
>> After : 347 cycles on null_syscall
>>
[...]
>>
>> Signed-off-by: Christophe Leroy <[email protected]>
>> ---
>> arch/powerpc/kernel/syscall.c | 2 ++
>> 1 file changed, 2 insertions(+)
>>
>> diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
>> index 69d75fc4a5eb..630c423e089a 100644
>> --- a/arch/powerpc/kernel/syscall.c
>> +++ b/arch/powerpc/kernel/syscall.c
>> @@ -91,6 +91,8 @@ notrace long system_call_exception(long r3, long r4, long r5,
>>
>> } else if (unlikely(r0 >= NR_syscalls)) {
>> return -ENOSYS;
>> + } else {
>> + r3 = regs->orig_gpr3;
>> }
>
> So this just gives enough volatiles to avoid spilling to stack? I wonder
> about other various options here if they would cause a spill anyway.
>
> Interesting optimisation, it would definitely need a comment. Would be
> nice if we had a way to tell the compiler that a local can be reloaded
> from a particular address.
Ok, comment added.
Christophe