Although the parameter 'cmd' always points to a local array variable in
the caller, and only this function modifies it, the compiler does not know
this. Every time the 'cmd' array is updated, a memory write operation is
generated. This generates many useless instructions.

To guide the compiler towards proper optimization, 'cmd' is defined as a
local array inside the function and copied to the output parameter just
before the function returns.
The optimization effect can be viewed by running the "size arm-smmu-v3.o"
command.
Before:
text data bss dec hex
28246 1332 56 29634 73c2
After:
text data bss dec hex
28134 1332 56 29522 7352
For example:
cmd[0] = FIELD_PREP(CMDQ_0_OP, ent->opcode);
case CMDQ_OP_TLBI_EL2_VA:
cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num);
cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale);
cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl);
cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg);
cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_VA_MASK;
Before:
Each "cmd[0] |=" or "cmd[1] |=" operation generates a "str" instruction,
sum = 8.
ldrb w4, [x1, #8] //w4 = ent->tlbi.num
ubfiz x4, x4, #12, #5
mov w0, #0x0
orr x4, x4, x3
str x4, [x2]
autiasp
ldrb w3, [x1, #9] //w3 = ent->tlbi.scale
ubfiz x3, x3, #20, #5
orr x3, x3, x4
str x3, [x2]
ldrh w4, [x1, #10] //w4 = ent->tlbi.asid
orr x3, x3, x4, lsl #48
str x3, [x2]
ldrb w3, [x1, #14] //w3 = ent->tlbi.leaf
str x3, [x2, #8]
ldrb w4, [x1, #15] //w4 = ent->tlbi.ttl
ubfiz x4, x4, #8, #2
orr x4, x4, x3
str x4, [x2, #8]
ldrb w3, [x1, #16] //ent->tlbi.tg
ubfiz x3, x3, #10, #2
orr x3, x3, x4
str x3, [x2, #8]
ldr x1, [x1, #24] //ent->tlbi.addr
and x1, x1, #0xfffffffffffff000
orr x1, x1, x3
str x1, [x2, #8]
ret
After:
All "cmd[0] |=" and "cmd[1] |=" operations generate a "stp" instruction,
sum = 1.
3e8:
mov w0, #0x0
autiasp
stp x2, x1, [x3]
ret
bti j
3fc:
ldrb w5, [x1, #8] //w5 = ent->tlbi.num
mov x2, #0x22 //x2 = ent->opcode = CMDQ_0_OP
ldrb w6, [x1, #9] //w6 = ent->tlbi.scale
ubfiz x5, x5, #12, #5
ldrb w0, [x1, #16] //w0 = ent->tlbi.tg
orr x5, x5, x2
ldrb w7, [x1, #15] //w7 = ent->tlbi.ttl
ldr x4, [x1, #24] //x4 = ent->tlbi.addr
ubfiz x0, x0, #10, #2
ldrh w2, [x1, #10] //w2 = ent->tlbi.asid
ubfiz x6, x6, #20, #5
ldrb w8, [x1, #14] //w8 = ent->tlbi.leaf
and x4, x4, #0xfffffffffffff000
ubfiz x1, x7, #8, #2
orr x1, x0, x1
orr x2, x6, x2, lsl #48
orr x0, x4, x8
orr x2, x2, x5
orr x1, x1, x0
b 3e8
Signed-off-by: Zhen Lei <[email protected]>
---
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index f5848b351b19359..e55dfc14cac6005 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -234,10 +234,12 @@ static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent)
}
/* High-level queue accessors */
-static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
+static int arm_smmu_cmdq_build_cmd(u64 *out_cmd, struct arm_smmu_cmdq_ent *ent)
{
- memset(cmd, 0, 1 << CMDQ_ENT_SZ_SHIFT);
- cmd[0] |= FIELD_PREP(CMDQ_0_OP, ent->opcode);
+ int i;
+ u64 cmd[CMDQ_ENT_DWORDS] = {0};
+
+ cmd[0] = FIELD_PREP(CMDQ_0_OP, ent->opcode);
switch (ent->opcode) {
case CMDQ_OP_TLBI_EL2_ALL:
@@ -332,6 +334,9 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
return -ENOENT;
}
+ for (i = 0; i < CMDQ_ENT_DWORDS; i++)
+ out_cmd[i] = cmd[i];
+
return 0;
}
--
2.25.1
On 07/12/2021 09:41, Zhen Lei via iommu wrote:
> Although the parameter 'cmd' always points to a local array variable in
> the caller, and only this function modifies it, the compiler does not know
> this. Every time the 'cmd' array is updated, a memory write operation is
> generated. This generates many useless instructions.
I'd hardly call them useless. More like inefficient or sub-optimal.
>
> To guide the compiler towards proper optimization, 'cmd' is defined as a
> local array inside the function and copied to the output parameter just
> before the function returns.
>
> The optimization effect can be viewed by running the "size arm-smmu-v3.o"
> command.
>
> Before:
> text data bss dec hex
> 28246 1332 56 29634 73c2
>
> After:
> text data bss dec hex
> 28134 1332 56 29522 7352
>
> For example:
> cmd[0] = FIELD_PREP(CMDQ_0_OP, ent->opcode);
> case CMDQ_OP_TLBI_EL2_VA:
> cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num);
> cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale);
> cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
> cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
> cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl);
> cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg);
> cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_VA_MASK;
>
> Before:
> Each "cmd[0] |=" or "cmd[1] |=" operation generates a "str" instruction,
> sum = 8.
>
> ldrb w4, [x1, #8] //w4 = ent->tlbi.num
> ubfiz x4, x4, #12, #5
> mov w0, #0x0
> orr x4, x4, x3
> str x4, [x2]
> autiasp
> ldrb w3, [x1, #9] //w3 = ent->tlbi.scale
> ubfiz x3, x3, #20, #5
> orr x3, x3, x4
> str x3, [x2]
> ldrh w4, [x1, #10] //w4 = ent->tlbi.asid
> orr x3, x3, x4, lsl #48
> str x3, [x2]
> ldrb w3, [x1, #14] //w3 = ent->tlbi.leaf
> str x3, [x2, #8]
> ldrb w4, [x1, #15] //w4 = ent->tlbi.ttl
> ubfiz x4, x4, #8, #2
> orr x4, x4, x3
> str x4, [x2, #8]
> ldrb w3, [x1, #16] //ent->tlbi.tg
> ubfiz x3, x3, #10, #2
> orr x3, x3, x4
> str x3, [x2, #8]
> ldr x1, [x1, #24] //ent->tlbi.addr
> and x1, x1, #0xfffffffffffff000
> orr x1, x1, x3
> str x1, [x2, #8]
> ret
>
> After:
> All "cmd[0] |=" and "cmd[1] |=" operations generate a "stp" instruction,
> sum = 1.
>
> 3e8:
> mov w0, #0x0
> autiasp
> stp x2, x1, [x3]
> ret
> bti j
> 3fc:
> ldrb w5, [x1, #8] //w5 = ent->tlbi.num
> mov x2, #0x22 //x2 = ent->opcode = CMDQ_0_OP
> ldrb w6, [x1, #9] //w6 = ent->tlbi.scale
> ubfiz x5, x5, #12, #5
> ldrb w0, [x1, #16] //w0 = ent->tlbi.tg
> orr x5, x5, x2
> ldrb w7, [x1, #15] //w7 = ent->tlbi.ttl
> ldr x4, [x1, #24] //x4 = ent->tlbi.addr
> ubfiz x0, x0, #10, #2
> ldrh w2, [x1, #10] //w2 = ent->tlbi.asid
> ubfiz x6, x6, #20, #5
> ldrb w8, [x1, #14] //w8 = ent->tlbi.leaf
> and x4, x4, #0xfffffffffffff000
> ubfiz x1, x7, #8, #2
> orr x1, x0, x1
> orr x2, x6, x2, lsl #48
> orr x0, x4, x8
> orr x2, x2, x5
> orr x1, x1, x0
> b 3e8
>
> Signed-off-by: Zhen Lei <[email protected]>
> ---
> drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 11 ++++++++---
> 1 file changed, 8 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index f5848b351b19359..e55dfc14cac6005 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -234,10 +234,12 @@ static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent)
> }
>
> /* High-level queue accessors */
> -static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
> +static int arm_smmu_cmdq_build_cmd(u64 *out_cmd, struct arm_smmu_cmdq_ent *ent)
> {
> - memset(cmd, 0, 1 << CMDQ_ENT_SZ_SHIFT);
> - cmd[0] |= FIELD_PREP(CMDQ_0_OP, ent->opcode);
> + int i;
> + u64 cmd[CMDQ_ENT_DWORDS] = {0};
I thought that just {} was preferred. Or could have:
u64 cmd[CMDQ_ENT_DWORDS] = {FIELD_PREP(CMDQ_0_OP, ent->opcode), };
to be more concise
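As a rough, untested user-space sketch of that initializer form (stand-in
macro and array size rather than the kernel's FIELD_PREP/CMDQ_0_OP and
CMDQ_ENT_DWORDS), just to show that the remaining elements are zero-filled
without a separate {0}:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define ENT_DWORDS	2			/* stand-in for CMDQ_ENT_DWORDS */
#define OP_FIELD(v)	((uint64_t)(v) & 0xff)	/* stand-in for FIELD_PREP(CMDQ_0_OP, v) */

int main(void)
{
	/* First element built from the opcode, the rest implicitly zeroed. */
	uint64_t cmd[ENT_DWORDS] = { OP_FIELD(0x22) };

	assert(cmd[1] == 0);
	printf("cmd[0]=%#llx cmd[1]=%#llx\n",
	       (unsigned long long)cmd[0], (unsigned long long)cmd[1]);
	return 0;
}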
> +
> + cmd[0] = FIELD_PREP(CMDQ_0_OP, ent->opcode);
>
> switch (ent->opcode) {
> case CMDQ_OP_TLBI_EL2_ALL:
> @@ -332,6 +334,9 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
> return -ENOENT;
> }
>
> + for (i = 0; i < CMDQ_ENT_DWORDS; i++)
> + out_cmd[i] = cmd[i];
how about memcpy() instead?
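i.e. something like the below (untested; relies on 'cmd' still being the
local array in that scope, so sizeof(cmd) covers all CMDQ_ENT_DWORDS
dwords):

	memcpy(out_cmd, cmd, sizeof(cmd));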
> +
> return 0;
> }
Did you notice any performance change with this change?
Thanks,
John
On 2021/12/8 0:17, John Garry wrote:
>
>> +
>> return 0;
>> }
>
> Did you notice any performance change with this change?
Hi John:
Thanks for the tip. I wrote a test case today, and I found that the
performance did not go up but down. It's so weird. So I decided not to
change it, because it's also poorly readable. So I plan to make only
the following modifications:
@@ -237,7 +237,7 @@ static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent)
static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
{
memset(cmd, 0, 1 << CMDQ_ENT_SZ_SHIFT);
- cmd[0] |= FIELD_PREP(CMDQ_0_OP, ent->opcode);
+ cmd[0] = FIELD_PREP(CMDQ_0_OP, ent->opcode);
switch (ent->opcode) {
case CMDQ_OP_TLBI_EL2_ALL:
This prevents the compiler from generating the following two inefficient
instructions:
394: f9400002 ldr x2, [x0] //x2 = cmd[0]
398: aa020062 orr x2, x3, x2 //x3 = FIELD_PREP(CMDQ_0_OP, ent->opcode)
Maybe it's not worth changing because I've only seen a 0.x nanosecond reduction
in performance. But one thing is, it only comes with benefits, no side effects.
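If it helps to see it outside the driver, here is a minimal stand-alone
sketch of the two forms (not the driver code, and whether the extra
ldr/orr actually appears depends on compiler version and options; compile
with e.g. gcc -O2 -S and compare):

#include <stdint.h>
#include <string.h>

/* '|=' after memset: the compiler may still load cmd[0] back and OR into it. */
void build_or(uint64_t *cmd, uint64_t op)
{
	memset(cmd, 0, 2 * sizeof(*cmd));
	cmd[0] |= op;
}

/* '=' after memset: cmd[0] is simply stored, no read-modify-write is needed. */
void build_assign(uint64_t *cmd, uint64_t op)
{
	memset(cmd, 0, 2 * sizeof(*cmd));
	cmd[0] = op;
}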
>
> Thanks,
> John
>> Did you notice any performance change with this change?
>
> Hi John:
> Thanks for the tip. I wrote a test case today, and I found that the
> performance did not go up but down.
I very quickly tested on a DMA mapping benchmark very similar to the
kernel DMA benchmark module - I got mixed results. For fewer CPUs (<8),
a small improvement, like 0.7%. For more CPUs, a regression - that's
surprising, I had expected no change, as any improvement would get
dwarfed by the slower unmap rates for more CPUs. I can check this more
tomorrow.
> It's so weird. So I decided not to
> change it, because it's also poorly readable. So I plan to make only
> the following modifications:
> @@ -237,7 +237,7 @@ static int queue_remove_raw(struct arm_smmu_queue *q, u64 *ent)
> static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
> {
> memset(cmd, 0, 1 << CMDQ_ENT_SZ_SHIFT);
> - cmd[0] |= FIELD_PREP(CMDQ_0_OP, ent->opcode);
> + cmd[0] = FIELD_PREP(CMDQ_0_OP, ent->opcode);
>
> switch (ent->opcode) {
> case CMDQ_OP_TLBI_EL2_ALL:
>
> This prevents the compiler from generating the following two inefficient
> instructions:
> 394: f9400002 ldr x2, [x0] //x2 = cmd[0]
> 398: aa020062 orr x2, x3, x2 //x3 = FIELD_PREP(CMDQ_0_OP, ent->opcode)
>
> Maybe it's not worth changing because I've only seen a 0.x nanosecond reduction
> in performance. But one thing is, it only comes with benefits, no side effects.
>
I just think that, with the original code, cmd[] is on the stack and
cached, so if we have the write-back attribute (which I think we do) then
there would not necessarily be a write to external memory per write to
cmd[].
So, apart from this approach, I think that if we can just reduce the
instructions through other efficiencies in the function then that would
be good.
Thanks,
John
On 2021-12-08 18:17, John Garry wrote:
>>> Did you notice any performance change with this change?
>>
>> Hi John:
>> Thanks for the tip. I wrote a test case today, and I found that the
>> performance did not go up but down.
>
> I very quickly tested on a DMA mapping benchmark very similar to the
> kernel DMA benchmark module - I got mixed results. For fewer CPUs (<8),
> a small improvement, like 0.7%. For more CPUs, a regression - that's
> surprising, I had expected no change, as any improvement would get
> dwarfed by the slower unmap rates for more CPUs. I can check this more
> tomorrow.
>
>> It's so weird. So I decided not to
>> change it, because it's also poorly readable. So I plan to make only
>> the following modifications:
>> @@ -237,7 +237,7 @@ static int queue_remove_raw(struct arm_smmu_queue
>> *q, u64 *ent)
>> static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct
>> arm_smmu_cmdq_ent *ent)
>> {
>> memset(cmd, 0, 1 << CMDQ_ENT_SZ_SHIFT);
>> - cmd[0] |= FIELD_PREP(CMDQ_0_OP, ent->opcode);
>> + cmd[0] = FIELD_PREP(CMDQ_0_OP, ent->opcode);
>>
>> switch (ent->opcode) {
>> case CMDQ_OP_TLBI_EL2_ALL:
>>
>> This prevents the compiler from generating the following two inefficient
>> instructions:
>> 394: f9400002 ldr x2, [x0] //x2 = cmd[0]
>> 398: aa020062 orr x2, x3, x2 //x3 =
>> FIELD_PREP(CMDQ_0_OP, ent->opcode)
>>
>> Maybe it's not worth changing because I've only seen a 0.x nanosecond
>> reduction
>> in performance. But one thing is, it only comes with benefits, no side
>> effects.
>>
>
> I just think that, with the original code, cmd[] is on the stack and
> cached, so if we have the write-back attribute (which I think we do) then
> there would not necessarily be a write to external memory per write to
> cmd[].
>
> So, apart from this approach, I think that if we can just reduce the
> instructions through other efficiencies in the function then that would
> be good.
Not sure if it's still true, but FWIW last time the best result actually
came from doing the ridiculously counter-intuitive:
https://lore.kernel.org/linux-iommu/141de3c3278e280712d16d9ac9ab305c3b80a810.1534344167.git.robin.murphy@arm.com/
Robin.