2014-06-16 21:07:31

by Michal Nazarewicz

[permalink] [raw]
Subject: [PATCH] include: kernel.h: rewrite min3, max3 and clamp using min and max

It appears that gcc is better at optimising a double call to min
and max rather than open coded min3 and max3. This can be observed
here:

$ cat min-max.c
#define min(x, y) ({ \
typeof(x) _min1 = (x); \
typeof(y) _min2 = (y); \
(void) (&_min1 == &_min2); \
_min1 < _min2 ? _min1 : _min2; })
#define min3(x, y, z) ({ \
typeof(x) _min1 = (x); \
typeof(y) _min2 = (y); \
typeof(z) _min3 = (z); \
(void) (&_min1 == &_min2); \
(void) (&_min1 == &_min3); \
_min1 < _min2 ? (_min1 < _min3 ? _min1 : _min3) : \
(_min2 < _min3 ? _min2 : _min3); })

int fmin3(int x, int y, int z) { return min3(x, y, z); }
int fmin2(int x, int y, int z) { return min(min(x, y), z); }

$ gcc -O2 -o min-max.s -S min-max.c; cat min-max.s
.file "min-max.c"
.text
.p2align 4,,15
.globl fmin3
.type fmin3, @function
fmin3:
.LFB0:
.cfi_startproc
cmpl %esi, %edi
jl .L5
cmpl %esi, %edx
movl %esi, %eax
cmovle %edx, %eax
ret
.p2align 4,,10
.p2align 3
.L5:
cmpl %edi, %edx
movl %edi, %eax
cmovle %edx, %eax
ret
.cfi_endproc
.LFE0:
.size fmin3, .-fmin3
.p2align 4,,15
.globl fmin2
.type fmin2, @function
fmin2:
.LFB1:
.cfi_startproc
cmpl %edi, %esi
movl %edx, %eax
cmovle %esi, %edi
cmpl %edx, %edi
cmovle %edi, %eax
ret
.cfi_endproc
.LFE1:
.size fmin2, .-fmin2
.ident "GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
.section .note.GNU-stack,"",@progbits

fmin3 function, which uses open-coded min3 macro, is compiled into
total of ten instructions including a conditional branch, whereas fmin2
function, which uses two calls to min2 macro, is compiled into six
instructions with no branches.

Similarly, open-coded clamp produces the same code as clamp using min
and max macros, but the latter is much shorter:

$ cat clamp.c
#define clamp(val, min, max) ({ \
typeof(val) __val = (val); \
typeof(min) __min = (min); \
typeof(max) __max = (max); \
(void) (&__val == &__min); \
(void) (&__val == &__max); \
__val = __val < __min ? __min: __val; \
__val > __max ? __max: __val; })
#define min(x, y) ({ \
typeof(x) _min1 = (x); \
typeof(y) _min2 = (y); \
(void) (&_min1 == &_min2); \
_min1 < _min2 ? _min1 : _min2; })
#define max(x, y) ({ \
typeof(x) _max1 = (x); \
typeof(y) _max2 = (y); \
(void) (&_max1 == &_max2); \
_max1 > _max2 ? _max1 : _max2; })

int fclamp(int v, int min, int max) { return clamp(v, min, max); }
int fclampmm(int v, int min, int max) { return min(max(v, min), max); }

$ gcc -O2 -o clamp.s -S clamp.c; cat clamp.s
.file "clamp.c"
.text
.p2align 4,,15
.globl fclamp
.type fclamp, @function
fclamp:
.LFB0:
.cfi_startproc
cmpl %edi, %esi
movl %edx, %eax
cmovge %esi, %edi
cmpl %edx, %edi
cmovle %edi, %eax
ret
.cfi_endproc
.LFE0:
.size fclamp, .-fclamp
.p2align 4,,15
.globl fclampmm
.type fclampmm, @function
fclampmm:
.LFB1:
.cfi_startproc
cmpl %edi, %esi
cmovge %esi, %edi
cmpl %edi, %edx
movl %edi, %eax
cmovle %edx, %eax
ret
.cfi_endproc
.LFE1:
.size fclampmm, .-fclampmm
.ident "GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
.section .note.GNU-stack,"",@progbits

Furthermore, after “make allmodconfig && make bzImage modules” this is the
comparison of image and modules sizes:

# Without this patch applied
$ ls -l arch/x86/boot/bzImage **/*.ko |awk '{size += $5} END {print size}'
350715800

# With this patch applied
$ ls -l arch/x86/boot/bzImage **/*.ko |awk '{size += $5} END {print size}'
349856528

The above builds were done on:

$ uname -a; gcc --version
Linux mpn-glaptop 3.13.0-29-generic #53~precise1-Ubuntu SMP Wed Jun 4 22:06:25 UTC 2014 x86_64 x86_64 x86_64 GNU/Linux
gcc (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3
Copyright (C) 2011 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

Signed-off-by: Michal Nazarewicz <[email protected]>
---
include/linux/kernel.h | 32 +++++---------------------------
1 file changed, 5 insertions(+), 27 deletions(-)

Interestingly commit [f27c85c56b: “add {min,max}3 macros”] claims
thatthe open-coded min3/max3 “will save some cycles as well as some
bytes on the stack”, but as far as I can see that statement is false.
But maybe there's something Hagen knows that I missed?

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 4c52907..44649e0 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -719,23 +719,8 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
(void) (&_max1 == &_max2); \
_max1 > _max2 ? _max1 : _max2; })

-#define min3(x, y, z) ({ \
- typeof(x) _min1 = (x); \
- typeof(y) _min2 = (y); \
- typeof(z) _min3 = (z); \
- (void) (&_min1 == &_min2); \
- (void) (&_min1 == &_min3); \
- _min1 < _min2 ? (_min1 < _min3 ? _min1 : _min3) : \
- (_min2 < _min3 ? _min2 : _min3); })
-
-#define max3(x, y, z) ({ \
- typeof(x) _max1 = (x); \
- typeof(y) _max2 = (y); \
- typeof(z) _max3 = (z); \
- (void) (&_max1 == &_max2); \
- (void) (&_max1 == &_max3); \
- _max1 > _max2 ? (_max1 > _max3 ? _max1 : _max3) : \
- (_max2 > _max3 ? _max2 : _max3); })
+#define min3(x, y, z) min(min(x, y), z)
+#define max3(x, y, z) max(max(x, y), z)

/**
* min_not_zero - return the minimum that is _not_ zero, unless both are zero
@@ -750,20 +735,13 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
/**
* clamp - return a value clamped to a given range with strict typechecking
* @val: current value
- * @min: minimum allowable value
- * @max: maximum allowable value
+ * @lo: lowest allowable value
+ * @hi: highest allowable value
*
* This macro does strict typechecking of min/max to make sure they are of the
* same type as val. See the unnecessary pointer comparisons.
*/
-#define clamp(val, min, max) ({ \
- typeof(val) __val = (val); \
- typeof(min) __min = (min); \
- typeof(max) __max = (max); \
- (void) (&__val == &__min); \
- (void) (&__val == &__max); \
- __val = __val < __min ? __min: __val; \
- __val > __max ? __max: __val; })
+#define clamp(val, lo, hi) min(max(val, lo), hi)

/*
* ..and if you can't take the strict
--
2.0.0.526.g5318336


2014-06-16 23:18:07

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] include: kernel.h: rewrite min3, max3 and clamp using min and max

On Mon, 16 Jun 2014 23:07:22 +0200 Michal Nazarewicz <[email protected]> wrote:

> It appears that gcc is better at optimising a double call to min
> and max rather than open coded min3 and max3. This can be observed
> here:
>
> ...
>
> Furthermore, after ___make allmodconfig && make bzImage modules___ this is the
> comparison of image and modules sizes:
>
> # Without this patch applied
> $ ls -l arch/x86/boot/bzImage **/*.ko |awk '{size += $5} END {print size}'
> 350715800
>
> # With this patch applied
> $ ls -l arch/x86/boot/bzImage **/*.ko |awk '{size += $5} END {print size}'
> 349856528

We saved nearly a megabyte by optimising min3(), max3() and clamp()?

I'm counting a grand total of 182 callsites for those macros. So the
saving is 4700 bytes per invokation? I don't believe it...

2014-06-16 23:25:19

by David Rientjes

[permalink] [raw]
Subject: Re: [PATCH] include: kernel.h: rewrite min3, max3 and clamp using min and max

On Mon, 16 Jun 2014, Andrew Morton wrote:

> > It appears that gcc is better at optimising a double call to min
> > and max rather than open coded min3 and max3. This can be observed
> > here:
> >
> > ...
> >
> > Furthermore, after ___make allmodconfig && make bzImage modules___ this is the
> > comparison of image and modules sizes:
> >
> > # Without this patch applied
> > $ ls -l arch/x86/boot/bzImage **/*.ko |awk '{size += $5} END {print size}'
> > 350715800
> >
> > # With this patch applied
> > $ ls -l arch/x86/boot/bzImage **/*.ko |awk '{size += $5} END {print size}'
> > 349856528
>
> We saved nearly a megabyte by optimising min3(), max3() and clamp()?
>
> I'm counting a grand total of 182 callsites for those macros. So the
> saving is 4700 bytes per invokation? I don't believe it...
>

I was checking just the instances of min3() in mm/ and gcc ends up
inlining transfer_objects() in mm/slab.c as a result of this change and
increases its text size:

text data bss dec hex filename
28369 21559 4 49932 c30c slab.o.before
28399 21559 4 49962 c32a slab.o.after

It also seems to use one additional temp variable of type typeof(x) on the
stack, so I do think the old version was superior.

2014-06-16 23:35:28

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] include: kernel.h: rewrite min3, max3 and clamp using min and max

On Mon, 16 Jun 2014 16:25:15 -0700 (PDT) David Rientjes <[email protected]> wrote:

> On Mon, 16 Jun 2014, Andrew Morton wrote:
>
> > > It appears that gcc is better at optimising a double call to min
> > > and max rather than open coded min3 and max3. This can be observed
> > > here:
> > >
> > > ...
> > >
> > > Furthermore, after ___make allmodconfig && make bzImage modules___ this is the
> > > comparison of image and modules sizes:
> > >
> > > # Without this patch applied
> > > $ ls -l arch/x86/boot/bzImage **/*.ko |awk '{size += $5} END {print size}'
> > > 350715800
> > >
> > > # With this patch applied
> > > $ ls -l arch/x86/boot/bzImage **/*.ko |awk '{size += $5} END {print size}'
> > > 349856528
> >
> > We saved nearly a megabyte by optimising min3(), max3() and clamp()?
> >
> > I'm counting a grand total of 182 callsites for those macros. So the
> > saving is 4700 bytes per invokation? I don't believe it...
> >
>
> I was checking just the instances of min3() in mm/ and gcc ends up
> inlining transfer_objects() in mm/slab.c as a result of this change and
> increases its text size:
>
> text data bss dec hex filename
> 28369 21559 4 49932 c30c slab.o.before
> 28399 21559 4 49962 c32a slab.o.after

Maybe that's a good thing in disguise: gcc said "hey this thing is now
small enough to inline it".

> It also seems to use one additional temp variable of type typeof(x) on the
> stack, so I do think the old version was superior.

2014-06-16 23:54:37

by David Rientjes

[permalink] [raw]
Subject: Re: [PATCH] include: kernel.h: rewrite min3, max3 and clamp using min and max

On Mon, 16 Jun 2014, Andrew Morton wrote:

> On Mon, 16 Jun 2014 16:25:15 -0700 (PDT) David Rientjes <[email protected]> wrote:
>
> > On Mon, 16 Jun 2014, Andrew Morton wrote:
> >
> > > > It appears that gcc is better at optimising a double call to min
> > > > and max rather than open coded min3 and max3. This can be observed
> > > > here:
> > > >
> > > > ...
> > > >
> > > > Furthermore, after ___make allmodconfig && make bzImage modules___ this is the
> > > > comparison of image and modules sizes:
> > > >
> > > > # Without this patch applied
> > > > $ ls -l arch/x86/boot/bzImage **/*.ko |awk '{size += $5} END {print size}'
> > > > 350715800
> > > >
> > > > # With this patch applied
> > > > $ ls -l arch/x86/boot/bzImage **/*.ko |awk '{size += $5} END {print size}'
> > > > 349856528
> > >
> > > We saved nearly a megabyte by optimising min3(), max3() and clamp()?
> > >
> > > I'm counting a grand total of 182 callsites for those macros. So the
> > > saving is 4700 bytes per invokation? I don't believe it...
> > >
> >
> > I was checking just the instances of min3() in mm/ and gcc ends up
> > inlining transfer_objects() in mm/slab.c as a result of this change and
> > increases its text size:
> >
> > text data bss dec hex filename
> > 28369 21559 4 49932 c30c slab.o.before
> > 28399 21559 4 49962 c32a slab.o.after
>
> Maybe that's a good thing in disguise: gcc said "hey this thing is now
> small enough to inline it".
>

On linux-next, allyesconfig has a 0.0001% savings as a result of the
patch, but I'd be worried about the extra temp variable it allocates on
the stack that is evident in the mm/slab.c disassembly unless all cases
can be audited to show that we're not potentially deep.

text data bss dec hex filename
108573045 23488016 51580928 183641989 af22785 vmlinux.before
108572908 23488016 51580928 183641852 af226fc vmlinux.after

2014-06-17 00:21:31

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH] include: kernel.h: rewrite min3, max3 and clamp using min and max

On Mon, 16 Jun 2014 16:54:32 -0700 (PDT)
David Rientjes <[email protected]> wrote:

>
>
> On linux-next, allyesconfig has a 0.0001% savings as a result of the
> patch, but I'd be worried about the extra temp variable it allocates on
> the stack that is evident in the mm/slab.c disassembly unless all cases
> can be audited to show that we're not potentially deep.

A 0.0001% change means it's not worth changing, and we may be able to
mark this up as a fluke in Michal's results.

I'll give it a try on my 4.6.3 compiler.

-- Steve

>
> text data bss dec hex filename
> 108573045 23488016 51580928 183641989 af22785 vmlinux.before
> 108572908 23488016 51580928 183641852 af226fc vmlinux.after

2014-06-17 04:01:31

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH] include: kernel.h: rewrite min3, max3 and clamp using min and max

On Mon, 16 Jun 2014 20:21:20 -0400
Steven Rostedt <[email protected]> wrote:

> On Mon, 16 Jun 2014 16:54:32 -0700 (PDT)
> David Rientjes <[email protected]> wrote:
>
> >
> >
> > On linux-next, allyesconfig has a 0.0001% savings as a result of the
> > patch, but I'd be worried about the extra temp variable it allocates on
> > the stack that is evident in the mm/slab.c disassembly unless all cases
> > can be audited to show that we're not potentially deep.
>
> A 0.0001% change means it's not worth changing, and we may be able to
> mark this up as a fluke in Michal's results.
>
> I'll give it a try on my 4.6.3 compiler.
>
> -- Steve
>
> >
> > text data bss dec hex filename
> > 108573045 23488016 51580928 183641989 af22785 vmlinux.before
> > 108572908 23488016 51580928 183641852 af226fc vmlinux.after
>

Here's my results:

text data bss dec hex filename
108662851 23470256 51580928 183714035 af340f3 /tmp/vmlinux-orig
108662714 23470224 51580928 183713866 af3404a /tmp/vmlinux-patched

The patched version saved a total of 169 bytes.

Doing a diff on the vmlinux objdumps of do_fault_around, I get this:

mov -0x68(%rbp),%rdi
mov -0x70(%rbp),%r8d
- jae ffffffff81302bf5 <do_fault_around+0xbd>
- cmp %r15,%rax
- mov %r15,%r14
- cmovbe %rax,%r14
- jmp ffffffff81302c03 <do_fault_around+0xcb>
- cmp %r14,%rax
- cmovbe %rax,%r14
- incq 0x8d384b5(%rip) # ffffffff8a03b0b8 <high_memory+0x14e8>
- mov 0x8d384be(%rip),%rsi # ffffffff8a03b0c8 <high_memory+0x14f8>
- mov 0x8d384bf(%rip),%rdx # ffffffff8a03b0d0 <high_memory+0x1500>
- mov 0x8d384a8(%rip),%rax # ffffffff8a03b0c0 <high_memory+0x14f0>
+ cmp %rax,%r15
+ cmova %rax,%r15
+ mov 0x8d384c0(%rip),%rax # ffffffff8a03b0b8 <high_memory+0x14e8>
+ cmp %r14,%r15
+ cmovbe %r15,%r14
sub %r12,%rsi


So for gcc 4.6.3 it does seem to produce nicer assembly. I haven't
tried it with other versions though (or with clang).

But I'll still give it a:

Acked-by: Steven Rostedt <[email protected]>

-- Steve

2014-06-17 11:37:16

by Hagen Paul Pfeifer

[permalink] [raw]
Subject: Re: [PATCH] include: kernel.h: rewrite min3, max3 and clamp using min and max

Wow, back in the days where I submitted the patch I saw improvement in
the generated code (focus on number of instructions). Don't know what
happends with gcc in the meantime. Anyway, patch and analysis looks
good. So from a clean-patch perspective you get an:

Signed-off-by: Hagen Paul Pfeifer <[email protected]>

2014-06-17 13:49:12

by Michal Nazarewicz

[permalink] [raw]
Subject: Re: [PATCH] include: kernel.h: rewrite min3, max3 and clamp using min and max

On Mon, Jun 16 2014, Andrew Morton <[email protected]> wrote:
> On Mon, 16 Jun 2014 23:07:22 +0200 Michal Nazarewicz <[email protected]> wrote:
>
>> It appears that gcc is better at optimising a double call to min
>> and max rather than open coded min3 and max3. This can be observed
>> here:
>>
>> ...
>>
>> Furthermore, after ___make allmodconfig && make bzImage modules___ this is the
>> comparison of image and modules sizes:
>>
>> # Without this patch applied
>> $ ls -l arch/x86/boot/bzImage **/*.ko |awk '{size += $5} END {print size}'
>> 350715800
>>
>> # With this patch applied
>> $ ls -l arch/x86/boot/bzImage **/*.ko |awk '{size += $5} END {print size}'
>> 349856528
>
> We saved nearly a megabyte by optimising min3(), max3() and clamp()?
>
> I'm counting a grand total of 182 callsites for those macros. So the
> saving is 4700 bytes per invokation? I don't believe it...

You're absolutely right. I must have messed something up here. This
portion of the commit message should be removed.

So I've redone this just on just the kernel image with allmodconfig:

Linux mpn-glaptop 3.13.0-29-generic #53~precise1-Ubuntu SMP Wed Jun 4 22:06:25 UTC 2014 x86_64 x86_64 x86_64 GNU/Linux
gcc (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3
Copyright (C) 2011 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

-rwx------ 1 mpn eng 51224656 Jun 17 14:15 vmlinux.before
-rwx------ 1 mpn eng 51224608 Jun 17 13:57 vmlinux.after

48 bytes reduction. The do_fault_around was a few instruction shorter
and as far as I can tell saved 12 bytes on the stack, i.e.:

$ grep -e rsp -e pop -e push do_fault_around.*
do_fault_around.before.s:push %rbp
do_fault_around.before.s:mov %rsp,%rbp
do_fault_around.before.s:push %r13
do_fault_around.before.s:push %r12
do_fault_around.before.s:push %rbx
do_fault_around.before.s:sub $0x38,%rsp
do_fault_around.before.s:add $0x38,%rsp
do_fault_around.before.s:pop %rbx
do_fault_around.before.s:pop %r12
do_fault_around.before.s:pop %r13
do_fault_around.before.s:pop %rbp

do_fault_around.after.s:push %rbp
do_fault_around.after.s:mov %rsp,%rbp
do_fault_around.after.s:push %r12
do_fault_around.after.s:push %rbx
do_fault_around.after.s:sub $0x30,%rsp
do_fault_around.after.s:add $0x30,%rsp
do_fault_around.after.s:pop %rbx
do_fault_around.after.s:pop %r12
do_fault_around.after.s:pop %rbp

or here side-by-side:

Before After
push %rbp push %rbp
mov %rsp,%rbp mov %rsp,%rbp
push %r13
push %r12 push %r12
push %rbx push %rbx
sub $0x38,%rsp sub $0x30,%rsp
add $0x38,%rsp add $0x30,%rsp
pop %rbx pop %rbx
pop %r12 pop %r12
pop %r13
pop %rbp pop %rbp

There are also fewer branches:

$ grep ^j do_fault_around.*
do_fault_around.before.s:jae ffffffff812079b7
do_fault_around.before.s:jmp ffffffff812079c5
do_fault_around.before.s:jmp ffffffff81207a14
do_fault_around.before.s:ja ffffffff812079f9
do_fault_around.before.s:jb ffffffff81207a10
do_fault_around.before.s:jmp ffffffff81207a63
do_fault_around.before.s:jne ffffffff812079df

do_fault_around.after.s:jmp ffffffff812079fd
do_fault_around.after.s:ja ffffffff812079e2
do_fault_around.after.s:jb ffffffff812079f9
do_fault_around.after.s:jmp ffffffff81207a4c
do_fault_around.after.s:jne ffffffff812079c8

And here's with allyesconfig on a different machine:

$ uname -a; gcc --version; ls -l vmlinux.*
Linux erwin 3.14.7-mn #54 SMP Sun Jun 15 11:25:08 CEST 2014 x86_64 AMD Phenom(tm) II X3 710 Processor AuthenticAMD GNU/Linux
gcc (GCC) 4.8.3
Copyright (C) 2013 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

-rwx------ 1 mina86 mina86 230616126 Jun 17 15:39 vmlinux.before*
-rwx------ 1 mina86 mina86 230614861 Jun 17 14:36 vmlinux.after*

1265 bytes reduction.

--
Best regards, _ _
.o. | Liege of Serenely Enlightened Majesty of o' \,=./ `o
..o | Computer Science, Michał “mina86” Nazarewicz (o o)
ooo +--<[email protected]>--<xmpp:[email protected]>--ooO--(_)--Ooo--