The first loop will waring once if prev_map is init, we can add a
boolean variable to do that. So those two loops can be combined to
improve performance.
Signed-off-by: Yajun Deng <[email protected]>
---
mm/early_ioremap.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index 9bc12e526ed0..3076fb47c685 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -70,14 +70,15 @@ static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
void __init early_ioremap_setup(void)
{
+ bool init_prev_map = false;
int i;
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
- if (WARN_ON(prev_map[i]))
- break;
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+ if (!init_prev_map && WARN_ON(prev_map[i]))
+ init_prev_map = true;
- for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+ }
}
static int __init check_early_ioremap_leak(void)
--
2.25.1
Le 27/09/2022 à 09:52, Yajun Deng a écrit :
> The first loop will waring once if prev_map is init, we can add a
> boolean variable to do that. So those two loops can be combined to
> improve performance.
Do you have evidence of the improved performance ?
Looking at the generated code, I have the fealing that the performance
is reduced by the !init_prev_map that is checked at every lap.
Before the patch:
00000250 <early_ioremap_setup>:
250: 3d 20 00 00 lis r9,0
252: R_PPC_ADDR16_HA .init.data
254: 39 29 00 00 addi r9,r9,0
256: R_PPC_ADDR16_LO .init.data
258: 38 e0 00 10 li r7,16
25c: 39 09 00 04 addi r8,r9,4
260: 39 40 00 00 li r10,0
264: 7c e9 03 a6 mtctr r7
---- First loop : ----
268: 55 47 10 3a rlwinm r7,r10,2,0,29
26c: 7c e7 40 2e lwzx r7,r7,r8
270: 2c 07 00 00 cmpwi r7,0
274: 41 a2 00 08 beq 27c <early_ioremap_setup+0x2c>
278: 0f e0 00 00 twui r0,0
27c: 39 4a 00 01 addi r10,r10,1
280: 42 00 ff e8 bdnz 268 <early_ioremap_setup+0x18>
284: 39 00 00 10 li r8,16
288: 39 29 00 84 addi r9,r9,132
28c: 3d 40 ff b0 lis r10,-80
290: 7d 09 03 a6 mtctr r8
---- Second loop : ----
294: 95 49 00 04 stwu r10,4(r9)
298: 3d 4a 00 04 addis r10,r10,4
29c: 42 00 ff f8 bdnz 294 <early_ioremap_setup+0x44>
2a0: 4e 80 00 20 blr
After the patch:
00000250 <early_ioremap_setup>:
250: 3d 20 00 00 lis r9,0
252: R_PPC_ADDR16_HA .init.data
254: 39 29 00 00 addi r9,r9,0
256: R_PPC_ADDR16_LO .init.data
258: 39 00 00 10 li r8,16
25c: 38 c9 00 04 addi r6,r9,4
260: 39 40 00 00 li r10,0
264: 39 29 00 88 addi r9,r9,136
268: 38 e0 00 00 li r7,0
26c: 7d 09 03 a6 mtctr r8
--- Loop ---
270: 70 e8 00 01 andi. r8,r7,1
274: 40 82 00 18 bne 28c <early_ioremap_setup+0x3c>
278: 7d 0a 30 2e lwzx r8,r10,r6
27c: 2c 08 00 00 cmpwi r8,0
280: 41 a2 00 0c beq 28c <early_ioremap_setup+0x3c>
284: 0f e0 00 00 twui r0,0
288: 38 e0 00 01 li r7,1
28c: 55 48 80 1e rlwinm r8,r10,16,0,15
290: 3d 08 ff b0 addis r8,r8,-80
294: 7d 0a 49 2e stwx r8,r10,r9
298: 39 4a 00 04 addi r10,r10,4
29c: 42 00 ff d4 bdnz 270 <early_ioremap_setup+0x20>
2a0: 4e 80 00 20 blr
Maybe using WARN_ON_ONCE() could be the solution. But looking at the
code generated if using it, I still have the feeling that GCC has a
better code with the original code.
>
> Signed-off-by: Yajun Deng <[email protected]>
> ---
> mm/early_ioremap.c | 9 +++++----
> 1 file changed, 5 insertions(+), 4 deletions(-)
>
> diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
> index 9bc12e526ed0..3076fb47c685 100644
> --- a/mm/early_ioremap.c
> +++ b/mm/early_ioremap.c
> @@ -70,14 +70,15 @@ static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
>
> void __init early_ioremap_setup(void)
> {
> + bool init_prev_map = false;
> int i;
>
> - for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
> - if (WARN_ON(prev_map[i]))
> - break;
> + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
> + if (!init_prev_map && WARN_ON(prev_map[i]))
> + init_prev_map = true;
>
> - for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
> slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
> + }
> }
>
> static int __init check_early_ioremap_leak(void)
September 27, 2022 4:47 PM, "Christophe Leroy" <[email protected]> wrote:
> Le 27/09/2022 à 09:52, Yajun Deng a écrit :
>
>> The first loop will waring once if prev_map is init, we can add a
>> boolean variable to do that. So those two loops can be combined to
>> improve performance.
>
> Do you have evidence of the improved performance ?
>
> Looking at the generated code, I have the fealing that the performance
> is reduced by the !init_prev_map that is checked at every lap.
>
> Before the patch:
>
> 00000250 <early_ioremap_setup>:
> 250: 3d 20 00 00 lis r9,0
> 252: R_PPC_ADDR16_HA .init.data
> 254: 39 29 00 00 addi r9,r9,0
> 256: R_PPC_ADDR16_LO .init.data
> 258: 38 e0 00 10 li r7,16
> 25c: 39 09 00 04 addi r8,r9,4
> 260: 39 40 00 00 li r10,0
> 264: 7c e9 03 a6 mtctr r7
>
> ---- First loop : ----
> 268: 55 47 10 3a rlwinm r7,r10,2,0,29
> 26c: 7c e7 40 2e lwzx r7,r7,r8
> 270: 2c 07 00 00 cmpwi r7,0
> 274: 41 a2 00 08 beq 27c <early_ioremap_setup+0x2c>
> 278: 0f e0 00 00 twui r0,0
> 27c: 39 4a 00 01 addi r10,r10,1
> 280: 42 00 ff e8 bdnz 268 <early_ioremap_setup+0x18>
>
> 284: 39 00 00 10 li r8,16
> 288: 39 29 00 84 addi r9,r9,132
> 28c: 3d 40 ff b0 lis r10,-80
> 290: 7d 09 03 a6 mtctr r8
>
> ---- Second loop : ----
> 294: 95 49 00 04 stwu r10,4(r9)
> 298: 3d 4a 00 04 addis r10,r10,4
> 29c: 42 00 ff f8 bdnz 294 <early_ioremap_setup+0x44>
>
> 2a0: 4e 80 00 20 blr
>
> After the patch:
>
> 00000250 <early_ioremap_setup>:
> 250: 3d 20 00 00 lis r9,0
> 252: R_PPC_ADDR16_HA .init.data
> 254: 39 29 00 00 addi r9,r9,0
> 256: R_PPC_ADDR16_LO .init.data
> 258: 39 00 00 10 li r8,16
> 25c: 38 c9 00 04 addi r6,r9,4
> 260: 39 40 00 00 li r10,0
> 264: 39 29 00 88 addi r9,r9,136
> 268: 38 e0 00 00 li r7,0
> 26c: 7d 09 03 a6 mtctr r8
>
> --- Loop ---
> 270: 70 e8 00 01 andi. r8,r7,1
> 274: 40 82 00 18 bne 28c <early_ioremap_setup+0x3c>
> 278: 7d 0a 30 2e lwzx r8,r10,r6
> 27c: 2c 08 00 00 cmpwi r8,0
> 280: 41 a2 00 0c beq 28c <early_ioremap_setup+0x3c>
> 284: 0f e0 00 00 twui r0,0
> 288: 38 e0 00 01 li r7,1
> 28c: 55 48 80 1e rlwinm r8,r10,16,0,15
> 290: 3d 08 ff b0 addis r8,r8,-80
> 294: 7d 0a 49 2e stwx r8,r10,r9
> 298: 39 4a 00 04 addi r10,r10,4
> 29c: 42 00 ff d4 bdnz 270 <early_ioremap_setup+0x20>
>
> 2a0: 4e 80 00 20 blr
>
Yes, I do it.
test1.c:
<===================================================================>
#include <stdio.h>
#include <stdlib.h>
#define FIX_BTMAPS_SLOTS 8
static void *prev_map[FIX_BTMAPS_SLOTS];
static unsigned long slot_virt[FIX_BTMAPS_SLOTS];
int main(void)
{
int i;
for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
if (prev_map[i]) {
printf("warning!\n");
break;
}
for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
slot_virt[i] = FIX_BTMAPS_SLOTS * i;
return 0;
}
<===================================================================>
test2.c
<===================================================================>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#define FIX_BTMAPS_SLOTS 8
static void *prev_map[FIX_BTMAPS_SLOTS];
static unsigned long slot_virt[FIX_BTMAPS_SLOTS];
int main(void)
{
bool init_prev_map = false;
int i;
for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
if (!init_prev_map && prev_map[i])
init_prev_map = true;
slot_virt[i] = FIX_BTMAPS_SLOTS * i;
}
return 0;
}
<===================================================================>
$ gcc test1.c -o test1 && gcc test2.c -o test2
<===================================================================>
$ perf stat ./test1
Performance counter stats for './test1':
0.17 msec task-clock:u # 0.444 CPUs utilized
0 context-switches:u # 0.000 K/sec
0 cpu-migrations:u # 0.000 K/sec
43 page-faults:u # 0.246 M/sec
154,813 cycles:u # 0.885 GHz
106,234 instructions:u # 0.69 insn per cycle
21,510 branches:u # 122.990 M/sec
1,409 branch-misses:u # 6.55% of all branches
0.000393857 seconds time elapsed
0.000420000 seconds user
0.000000000 seconds sys
$ perf stat ./test2
Performance counter stats for './test2':
0.17 msec task-clock:u # 0.439 CPUs utilized
0 context-switches:u # 0.000 K/sec
0 cpu-migrations:u # 0.000 K/sec
43 page-faults:u # 0.249 M/sec
152,744 cycles:u # 0.884 GHz
105,282 instructions:u # 0.69 insn per cycle
21,342 branches:u # 123.545 M/sec
1,334 branch-misses:u # 6.25% of all branches
0.000393084 seconds time elapsed
0.000412000 seconds user
0.000000000 seconds sys
<===================================================================>
It seems almost the same. If we change FIX_BTMAPS_SLOTS from 8 to 80000,
It takes less time after the patch.
<===================================================================>
$ perf stat ./test1
Performance counter stats for './test1':
0.73 msec task-clock:u # 0.768 CPUs utilized
0 context-switches:u # 0.000 K/sec
0 cpu-migrations:u # 0.000 K/sec
355 page-faults:u # 0.484 M/sec
1,532,520 cycles:u # 2.087 GHz
1,786,378 instructions:u # 1.17 insn per cycle
261,798 branches:u # 356.594 M/sec
1,580 branch-misses:u # 0.60% of all branches
0.000956129 seconds time elapsed
0.000000000 seconds user
0.000981000 seconds sys
$ perf stat ./test2
Performance counter stats for './test2':
0.60 msec task-clock:u # 0.732 CPUs utilized
0 context-switches:u # 0.000 K/sec
0 cpu-migrations:u # 0.000 K/sec
355 page-faults:u # 0.589 M/sec
1,066,851 cycles:u # 1.769 GHz
1,865,418 instructions:u # 1.75 insn per cycle
261,630 branches:u # 433.808 M/sec
1,369 branch-misses:u # 0.52% of all branches
0.000824064 seconds time elapsed
0.000846000 seconds user
0.000000000 seconds sys
> Maybe using WARN_ON_ONCE() could be the solution. But looking at the
> code generated if using it, I still have the feeling that GCC has a
> better code with the original code.
>
>> Signed-off-by: Yajun Deng <[email protected]>
>> ---
>> mm/early_ioremap.c | 9 +++++----
>> 1 file changed, 5 insertions(+), 4 deletions(-)
>>
>> diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
>> index 9bc12e526ed0..3076fb47c685 100644
>> --- a/mm/early_ioremap.c
>> +++ b/mm/early_ioremap.c
>> @@ -70,14 +70,15 @@ static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
>>
>> void __init early_ioremap_setup(void)
>> {
>> + bool init_prev_map = false;
>> int i;
>>
>> - for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
>> - if (WARN_ON(prev_map[i]))
>> - break;
>> + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
>> + if (!init_prev_map && WARN_ON(prev_map[i]))
>> + init_prev_map = true;
>>
>> - for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
>> slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
>> + }
>> }
>>
>> static int __init check_early_ioremap_leak(void)