It seems gcc started to de-inline large functions.
We got bitten:
# sort -t ' ' +2 <System.map
...
c01876f0 t __cleanup_transaction
c0150f90 t __clear_page_buffers
c01d9fa0 T __clear_user
c011b020 T __cond_resched
c01d9ea0 T __const_udelay
c0107150 t __constant_c_and_count_memset
c0107fd0 t __constant_c_and_count_memset
[~seven screenfuls (!) snipped]
c010c7f0 t __constant_c_and_count_memset
c010e210 t __constant_c_and_count_memset
c03609e0 t __constant_c_and_count_memset
c03634f0 t __constant_c_and_count_memset
c0107ec0 t __constant_memcpy
c010fc10 t __constant_memcpy
[more snipped]
c0111410 t __constant_memcpy
c0111e10 t __constant_memcpy
c0114b50 t __constant_memcpy
c0353e50 t __constant_memcpy
c03573a0 t __constant_memcpy
c035bfc0 t __constant_memcpy
c03633e0 t __constant_memcpy
c01da040 T __copy_from_user
c011dd30 t __copy_fs_struct
c01da000 T __copy_to_user
This happens in 2.4.19 too.
Let's take exit.c compilation as an example:
gcc -Wp,-MD,kernel/.exit.o.d -D__KERNEL__ -Iinclude -Wall
-Wstrict-prototypes -Wno-trigraphs -O2 -fomit-frame-pointer
-fno-strict-aliasing -fno-common -pipe -mpreferred-stack-boundary=2
-march=i486 -Iarch/i386/mach-generic -nostdinc -iwithprefix include
-DKBUILD_BASENAME=exit -c -o kernel/exit.o kernel/exit.c
[2.5] objdump -D exit.o, abridged
=================================
exit.o: file format elf32-i386
Disassembly of section .text:
...........
000004c0 <reparent_to_init>:
...........
5ab: 68 84 03 00 00 push $0x384
5b0: 50 push %eax
5b1: e8 0a 12 00 00 call 17c0 <__constant_memcpy>
Eek... __constant_memcpy??! Here is what string.h carries for it:
/*
* This looks horribly ugly, but the compiler can optimize it totally,
* as the count is constant.
*/
static inline void * __constant_memcpy(void * to, const void * from, size_t n)
and when it _does_ get compiled out of line, it _does_ look horribly ugly:
000017c0 <__constant_memcpy>:
17c0: 57 push %edi
17c1: 56 push %esi
17c2: 51 push %ecx
17c3: 8b 4c 24 18 mov 0x18(%esp,1),%ecx
17c7: 83 f9 14 cmp $0x14,%ecx
17ca: 8b 54 24 10 mov 0x10(%esp,1),%edx
17ce: 8b 74 24 14 mov 0x14(%esp,1),%esi
17d2: 77 7c ja 1850 <__constant_memcpy+0x90>
17d4: ff 24 8d 28 01 00 00 jmp *0x128(,%ecx,4)
17db: 8a 06 mov (%esi),%al
17dd: 88 02 mov %al,(%edx)
17df: 90 nop
17e0: 89 d0 mov %edx,%eax
17e2: 5a pop %edx
17e3: 5e pop %esi
17e4: 5f pop %edi
17e5: c3 ret
17e6: 66 8b 06 mov (%esi),%ax
17e9: 66 89 02 mov %ax,(%edx)
17ec: eb f2 jmp 17e0 <__constant_memcpy+0x20>
17ee: 66 8b 06 mov (%esi),%ax
17f1: 66 89 02 mov %ax,(%edx)
17f4: 8a 46 02 mov 0x2(%esi),%al
17f7: 88 42 02 mov %al,0x2(%edx)
17fa: eb e4 jmp 17e0 <__constant_memcpy+0x20>
17fc: 8b 06 mov (%esi),%eax
17fe: 89 02 mov %eax,(%edx)
1800: eb de jmp 17e0 <__constant_memcpy+0x20>
1802: 8b 06 mov (%esi),%eax
1804: 89 02 mov %eax,(%edx)
1806: 66 8b 46 04 mov 0x4(%esi),%ax
180a: 66 89 42 04 mov %ax,0x4(%edx)
180e: eb d0 jmp 17e0 <__constant_memcpy+0x20>
1810: 8b 06 mov (%esi),%eax
1812: 89 02 mov %eax,(%edx)
1814: 8b 46 04 mov 0x4(%esi),%eax
1817: 89 42 04 mov %eax,0x4(%edx)
181a: eb c4 jmp 17e0 <__constant_memcpy+0x20>
181c: 8b 06 mov (%esi),%eax
181e: 89 02 mov %eax,(%edx)
1820: 8b 46 04 mov 0x4(%esi),%eax
1823: 89 42 04 mov %eax,0x4(%edx)
1826: 8b 46 08 mov 0x8(%esi),%eax
1829: 89 42 08 mov %eax,0x8(%edx)
182c: eb b2 jmp 17e0 <__constant_memcpy+0x20>
182e: 8b 06 mov (%esi),%eax
1830: 89 02 mov %eax,(%edx)
1832: 8b 46 04 mov 0x4(%esi),%eax
1835: 89 42 04 mov %eax,0x4(%edx)
1838: 8b 46 08 mov 0x8(%esi),%eax
183b: 89 42 08 mov %eax,0x8(%edx)
183e: 8b 46 0c mov 0xc(%esi),%eax
1841: 89 42 0c mov %eax,0xc(%edx)
1844: eb 9a jmp 17e0 <__constant_memcpy+0x20>
1846: 8d 76 00 lea 0x0(%esi),%esi
1849: 8d bc 27 00 00 00 00 lea 0x0(%edi,1),%edi
1850: 89 c8 mov %ecx,%eax
1852: 83 e0 03 and $0x3,%eax
1855: 83 f8 01 cmp $0x1,%eax
1858: 74 46 je 18a0 <__constant_memcpy+0xe0>
185a: 83 f8 01 cmp $0x1,%eax
185d: 72 31 jb 1890 <__constant_memcpy+0xd0>
185f: 83 f8 02 cmp $0x2,%eax
1862: 74 0f je 1873 <__constant_memcpy+0xb3>
1864: c1 e9 02 shr $0x2,%ecx
1867: 89 d7 mov %edx,%edi
1869: f3 a5 repz movsl %ds:(%esi),%es:(%edi)
186b: 66 a5 movsw %ds:(%esi),%es:(%edi)
186d: a4 movsb %ds:(%esi),%es:(%edi)
186e: e9 6d ff ff ff jmp 17e0 <__constant_memcpy+0x20>
1873: c1 e9 02 shr $0x2,%ecx
1876: 89 d7 mov %edx,%edi
1878: f3 a5 repz movsl %ds:(%esi),%es:(%edi)
187a: 66 a5 movsw %ds:(%esi),%es:(%edi)
187c: e9 5f ff ff ff jmp 17e0 <__constant_memcpy+0x20>
1881: eb 0d jmp 1890 <__constant_memcpy+0xd0>
1883: 90 nop
1884: 90 nop
1885: 90 nop
1886: 90 nop
1887: 90 nop
1888: 90 nop
BTW, this string of nops is deeply unfunny...
it looks like the usual inter-function 16-byte alignment,
but it's _inside_ a function.
GCC aligned a branch target?? :(((
1889: 90 nop
188a: 90 nop
188b: 90 nop
188c: 90 nop
188d: 90 nop
188e: 90 nop
188f: 90 nop
1890: c1 e9 02 shr $0x2,%ecx
1893: 89 d7 mov %edx,%edi
1895: f3 a5 repz movsl %ds:(%esi),%es:(%edi)
1897: e9 44 ff ff ff jmp 17e0 <__constant_memcpy+0x20>
189c: 8d 74 26 00 lea 0x0(%esi,1),%esi
18a0: c1 e9 02 shr $0x2,%ecx
18a3: 89 d7 mov %edx,%edi
18a5: f3 a5 repz movsl %ds:(%esi),%es:(%edi)
18a7: a4 movsb %ds:(%esi),%es:(%edi)
18a8: e9 33 ff ff ff jmp 17e0 <__constant_memcpy+0x20>
18ad: 8b 06 mov (%esi),%eax
18af: 89 02 mov %eax,(%edx)
18b1: 8b 46 04 mov 0x4(%esi),%eax
18b4: 89 42 04 mov %eax,0x4(%edx)
18b7: 8b 46 08 mov 0x8(%esi),%eax
18ba: 89 42 08 mov %eax,0x8(%edx)
18bd: 8b 46 0c mov 0xc(%esi),%eax
18c0: 89 42 0c mov %eax,0xc(%edx)
18c3: 8b 46 10 mov 0x10(%esi),%eax
18c6: 89 42 10 mov %eax,0x10(%edx)
18c9: e9 12 ff ff ff jmp 17e0 <__constant_memcpy+0x20>
18ce: 89 f6 mov %esi,%esi
--
vda
On 3 November 2002 14:17, Denis Vlasenko wrote:
> It seems gcc started to de-inline large functions.
> We got bitten:
>
> # sort -t ' ' +2 <System.map
> ...
> c01876f0 t __cleanup_transaction
> c0150f90 t __clear_page_buffers
> c01d9fa0 T __clear_user
> c011b020 T __cond_resched
> c01d9ea0 T __const_udelay
> c0107150 t __constant_c_and_count_memset
> c0107fd0 t __constant_c_and_count_memset
>
> [~seven screenfuls (!) snipped]
>
> c010c7f0 t __constant_c_and_count_memset
> c010e210 t __constant_c_and_count_memset
> c03609e0 t __constant_c_and_count_memset
> c03634f0 t __constant_c_and_count_memset
> c0107ec0 t __constant_memcpy
> c010fc10 t __constant_memcpy
Here is the cure: force_inline will guarantee inlining.
It is to be used _only_ with functions which are meant to be almost
entirely optimized away, but which are large enough that gcc might
decide they are _too_ large to inline.
More to come if ok'ed for inclusion.
--
vda
diff -urN linux-2.5.45.orig/include/asm-i386/string.h linux-2.5.45fix/include/asm-i386/string.h
--- linux-2.5.45.orig/include/asm-i386/string.h Wed Oct 30 22:43:46 2002
+++ linux-2.5.45fix/include/asm-i386/string.h Sun Nov 3 15:58:08 2002
@@ -3,6 +3,7 @@
#ifdef __KERNEL__
#include <linux/config.h>
+#include <linux/compiler.h>
/*
* On a 486 or Pentium, we are better off not using the
* byte string operations. But on a 386 or a PPro the
@@ -218,7 +219,7 @@
* This looks horribly ugly, but the compiler can optimize it totally,
* as the count is constant.
*/
-static inline void * __constant_memcpy(void * to, const void * from, size_t n)
+static force_inline void * __constant_memcpy(void * to, const void * from, size_t n)
{
switch (n) {
case 0:
@@ -453,7 +454,7 @@
* This looks horribly ugly, but the compiler can optimize it totally,
* as we by now know that both pattern and count is constant..
*/
-static inline void * __constant_c_and_count_memset(void * s, unsigned long pattern, size_t count)
+static force_inline void * __constant_c_and_count_memset(void * s, unsigned long pattern, size_t count)
{
switch (count) {
case 0:
diff -urN linux-2.5.45.orig/include/linux/compiler.h linux-2.5.45fix/include/linux/compiler.h
--- linux-2.5.45.orig/include/linux/compiler.h Wed Oct 30 22:43:05 2002
+++ linux-2.5.45fix/include/linux/compiler.h Sun Nov 3 15:19:20 2002
@@ -20,3 +20,11 @@
__asm__ ("" : "=g"(__ptr) : "0"(ptr)); \
(typeof(ptr)) (__ptr + (off)); })
#endif /* __LINUX_COMPILER_H */
+
+/* GCC 3 (and probably earlier, I'm not sure) can be told to always inline
+ a function. */
+#if __GNUC__ < 3
+#define force_inline inline
+#else
+#define force_inline inline __attribute__ ((always_inline))
+#endif
> Here is the cure: force_inline will guarantee inlining.
>
> To use _only_ with functions which meant to be almost
> optimized away to nothing but are large and gcc might decide
> they are _too_ large for inlining.
So aside from the ugliness of the code, which one actually runs faster?
M.
On Sun, Nov 03, 2002 at 04:14:26PM -0200, Denis Vlasenko wrote:
> Here is the cure: force_inline will guarantee inlining.
>
> To use _only_ with functions which meant to be almost
> optimized away to nothing but are large and gcc might decide
> they are _too_ large for inlining.
Well, you can as well bump -finline-limit, like -finline-limit=2000.
The default is too low for kernel code (and glibc too).
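For the kernel that amounts to one extra flag in CFLAGS; a sketch against
the 2.5 top-level Makefile (exact placement is up to you):

    CFLAGS += -finline-limit=2000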
Jakub
On Sun, 2002-11-03 at 15:37, Jakub Jelinek wrote:
> On Sun, Nov 03, 2002 at 04:14:26PM -0200, Denis Vlasenko wrote:
> > Here is the cure: force_inline will guarantee inlining.
> >
> > To use _only_ with functions which meant to be almost
> > optimized away to nothing but are large and gcc might decide
> > they are _too_ large for inlining.
>
> Well, you can as well bump -finline-limit, like -finline-limit=2000.
> The default is too low for kernel code (and glibc too).
I would venture the reverse interpretation for modern processors: the
kernel inlines far, far too much.
On Sun, 2002-11-03 at 18:17, Denis Vlasenko wrote:
Jump target 17e0 is aligned (with nops):
> 17dd: 88 02 mov %al,(%edx)
> 17df: 90 nop
> 17e0: 89 d0 mov %edx,%eax
> 17e2: 5a pop %edx
> 17ec: eb f2 jmp 17e0 <__constant_memcpy+0x20>
> 17fa: eb e4 jmp 17e0 <__constant_memcpy+0x20>
> 1800: eb de jmp 17e0 <__constant_memcpy+0x20>
> 187c: e9 5f ff ff ff jmp 17e0 <__constant_memcpy+0x20>
> 1881: eb 0d jmp 1890 <__constant_memcpy+0xd0>
> 1883: 90 nop
...
> 188f: 90 nop
> 1890: c1 e9 02 shr $0x2,%ecx
> 1893: 89 d7 mov %edx,%edi
And jump target 1890 is aligned as well.
I think the penalty of a few NOPs is smaller than the cost of a jump to
an unaligned address. This is especially true on the P4 architecture.
- Jussi Laako
On 3 November 2002 14:17, Jussi Laako wrote:
> On Sun, 2002-11-03 at 18:17, Denis Vlasenko wrote:
>
> Jump target 17e0 is aligned (with nops):
> > 17dd: 88 02 mov %al,(%edx)
> > 17df: 90 nop
> > 17e0: 89 d0 mov %edx,%eax
> > 17e2: 5a pop %edx
> >
> > 17ec: eb f2 jmp 17e0 <__constant_memcpy+0x20>
> > 17fa: eb e4 jmp 17e0 <__constant_memcpy+0x20>
> > 1800: eb de jmp 17e0 <__constant_memcpy+0x20>
> > 187c: e9 5f ff ff ff jmp 17e0 <__constant_memcpy+0x20>
> > 1881: eb 0d jmp 1890 <__constant_memcpy+0xd0>
> > 1883: 90 nop
>
> ...
>
> > 188f: 90 nop
> > 1890: c1 e9 02 shr $0x2,%ecx
> > 1893: 89 d7 mov %edx,%edi
>
> And also jump target 1890 is aligned.
>
>
> I think the penalty of few NOPs is smaller than result of jump to
> unaligned address. This is especially true with P4 architecture.
Alignment does not eliminate the jump. It only moves the jump target to a
16-byte boundary. This _probably_ makes execution slightly faster, but on
average it costs you 7.5 bytes of padding per aligned target (anywhere from
0 to 15). That price is too high when you take into account L1 instruction
cache wastage and current bus/core clock ratios.
--
vda
On 3 November 2002 13:23, Martin J. Bligh wrote:
> > Here is the cure: force_inline will guarantee inlining.
> >
> > To use _only_ with functions which meant to be almost
> > optimized away to nothing but are large and gcc might decide
> > they are _too_ large for inlining.
>
> So aside from the ugliness of code, which one actually runs faster?
This can *never* run faster non-inlined, because it is called with
n being a compile-time constant:
static inline void * __constant_memcpy(void * to, const void * from, size_t n)
{
        switch (n) {
        case 0:
                return to;
        case 1:
                *(unsigned char *)to = *(const unsigned char *)from;
                return to;
        case 2:
                *(unsigned short *)to = *(const unsigned short *)from;
                return to;
        case 3:
                *(unsigned short *)to = *(const unsigned short *)from;
                *(2+(unsigned char *)to) = *(2+(const unsigned char *)from);
                return to;
        case 4:
                *(unsigned long *)to = *(const unsigned long *)from;
                return to;
        case 6: /* for Ethernet addresses */
                *(unsigned long *)to = *(const unsigned long *)from;
                *(2+(unsigned short *)to) = *(2+(const unsigned short *)from);
                return to;
        case 8:
                *(unsigned long *)to = *(const unsigned long *)from;
                *(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
                return to;
        case 12:
                *(unsigned long *)to = *(const unsigned long *)from;
                *(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
                *(2+(unsigned long *)to) = *(2+(const unsigned long *)from);
                return to;
        case 16:
                *(unsigned long *)to = *(const unsigned long *)from;
                *(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
                *(2+(unsigned long *)to) = *(2+(const unsigned long *)from);
                *(3+(unsigned long *)to) = *(3+(const unsigned long *)from);
                return to;
        case 20:
                *(unsigned long *)to = *(const unsigned long *)from;
                *(1+(unsigned long *)to) = *(1+(const unsigned long *)from);
                *(2+(unsigned long *)to) = *(2+(const unsigned long *)from);
                *(3+(unsigned long *)to) = *(3+(const unsigned long *)from);
                *(4+(unsigned long *)to) = *(4+(const unsigned long *)from);
                return to;
        }
#define COMMON(x) \
        __asm__ __volatile__( \
                "rep ; movsl" \
                x \
                : "=&c" (d0), "=&D" (d1), "=&S" (d2) \
                : "0" (n/4),"1" ((long) to),"2" ((long) from) \
                : "memory");
        {
                int d0, d1, d2;
                switch (n % 4) {
                case 0: COMMON(""); return to;
                case 1: COMMON("\n\tmovsb"); return to;
                case 2: COMMON("\n\tmovsw"); return to;
                default: COMMON("\n\tmovsw\n\tmovsb"); return to;
                }
        }
#undef COMMON
}
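To illustrate (a hypothetical caller, not from the tree): with n == 6 known at
compile time, the whole switch above collapses to one 32-bit and one 16-bit
copy and nothing else is emitted:

    /* hypothetical example: __constant_memcpy() inlined with n == 6
     * should reduce to two moves, everything else optimized away */
    static inline void copy_etheraddr(unsigned char *to, const unsigned char *from)
    {
            __constant_memcpy(to, from, 6);
    }

When gcc refuses to inline, every such caller pays for a call into the full
monster above instead.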
--
vda
On 3 November 2002 14:21, Alan Cox wrote:
> On Sun, 2002-11-03 at 15:37, Jakub Jelinek wrote:
> > On Sun, Nov 03, 2002 at 04:14:26PM -0200, Denis Vlasenko wrote:
> > > Here is the cure: force_inline will guarantee inlining.
> > >
> > > To use _only_ with functions which meant to be almost
> > > optimized away to nothing but are large and gcc might decide
> > > they are _too_ large for inlining.
> >
> > Well, you can as well bump -finline-limit, like
> > -finline-limit=2000. The default is too low for kernel code (and
> > glibc too).
>
> I would venture the reverse interpretation for modern processors, the
> kernel inlines far far too much
I agree that there are far too many large inlines. But.
__constant_c_and_count_memset *has to* be inlined.
There is a large switch statement in it which is meant to be optimized out,
and it only gets optimized out if count is a compile-time constant.
--
vda
On Mon, 2002-11-04 at 00:20, Denis Vlasenko wrote:
> __constant_c_and_count_memset *has to* be inlined.
> There is large switch statement which meant to be optimized out.
> It does optimize out *only if* count is compile-time constant.
gcc already allows you to check for constants, which means you can generate
(is a constant ? inline: noninline)(to, from, len)
and I would hope gcc does the right thing. Similarly, your switch will
be optimised out, so the default case of the constant version can be an
invocation of the uninlined code, and it should optimise back to the
straight expected call.
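That is in fact what the i386 headers already do for memcpy() itself, if I
remember the layout right (sketch from memory, check include/asm-i386/string.h
for the real thing):

    #define memcpy(t, f, n) \
            (__builtin_constant_p(n) ? \
             __constant_memcpy((t), (f), (n)) : \
             __memcpy((t), (f), (n)))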
Alan
On Mon, 2002-11-04 at 02:17, Denis Vlasenko wrote:
> Alignment does not eliminate jump. It only moves jump target to 16 byte
> boundary.
Exactly. And the P4 cache is _very_ bad at anything not 16-byte aligned. The
speed penalty is big. This seems to be a problem only with Intel CPUs; there
are no such large effects on AMD ones.
> This _probably_ makes execution slightly faster but on average
> it costs you 7,5 bytes. This price is too high when you take into account
> L1 instruction cache wastage and current bus/core clock ratios.
7.5 bytes is not much compared to the possibility of a trashed cache or a
pipeline flush.
Do you have execution-time numbers for a jump to a 16-byte aligned address vs
an unaligned one?
- Jussi Laako
On Sun, 2002-11-03 at 11:21, Alan Cox wrote:
> I would venture the reverse interpretation for modern processors,
> the kernel inlines far far too much
I agree 100% we mark too much as inline - but at the same time, some
larger functions are inline simply because they were originally part of
another function and pulled out for cleanliness. They are only used
once... in other words, in some cases I hope the kernel developers know
what they are doing when they mark stuff inline.
So I can see why bumping this limit very high is a good thing. At the
same time, there is a lot of stuff incorrectly inlined and we should go
through and un-inline all of that, too.
Robert Love
On 3 November 2002 19:28, Jussi Laako wrote:
> On Mon, 2002-11-04 at 02:17, Denis Vlasenko wrote:
> > Alignment does not eliminate jump. It only moves jump target to 16
> > byte boundary.
>
> Exactly. And P4 cache is _very_ bad at anything not 16-byte aligned.
> The speed penalty is big. This seems to be problem only with Intel
> CPU's, no such large effects on AMD ones.
So we are going to waste that space in _all_ kernels just because the Intel P4
is stupid? I compile kernels for a _486_! At home I have a Duron.
I don't want to pay the "P4 tax" ;)
Also, once it gets cached at first access, subsequent accesses won't
hurt that much, right?
> > This _probably_ makes execution slightly faster but on average
> > it costs you 7,5 bytes. This price is too high when you take into
> > account L1 instruction cache wastage and current bus/core clock
> > ratios.
>
> 7.5 bytes is not much compared to possibility of trashed cache or
> pipeline flush.
You definitely need to play with objdump -d just to see those 7.5 bytes
wasted everywhere.
The pipeline flush argument is moot: you pay the jump penalty regardless of alignment.
> Do you have execution time numbers of jump to 16-byte aligned address
> vs unaligned address?
I will do.
Let me say this clearly:
if you have a loop that runs a million iterations, maybe you should align it.
It is very dumb to align everything in the hope that it will
magically make your kernel run 2x faster.
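(If someone really wants to control the padding globally, gcc has switches for
it; something like this on the compile line should kill it, though I haven't
timed the result:

    gcc ... -O2 -march=i486 -falign-jumps=1 -falign-loops=1 -falign-functions=1 ...

-falign-xxx=1 means "no extra alignment" in gcc 3.x, if I read the docs
correctly.)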
--
vda
On 3 November 2002 18:28, Alan Cox wrote:
> On Mon, 2002-11-04 at 00:20, Denis Vlasenko wrote:
> > __constant_c_and_count_memset *has to* be inlined.
> > There is large switch statement which meant to be optimized out.
> > It does optimize out *only if* count is compile-time constant.
>
> gcc already allows you to check for constants that means you can
> generate
>
> (is a constant ? inline: noninline)(to, from, len)
^^^^^^
>
> and I would hope gcc does the right thing. Similarly your switch will
> be optimised out so the default case for the constant one can be the
> invocation of uninlined code and it should optimise back to the
> straight expected call.
Yes. That is how it is done right now in 2.4 and 2.5. The problem is
that gcc 3.2 did *not* inline __constant_c_and_count_memset()
as we all expected.
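(Easy to check on any build tree; something along the lines of

    grep -c ' __constant_c_and_count_memset$' System.map

should print 0 once the thing is really inlined everywhere.)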
What's the verdict on my patch?
--
vda
diff -urN linux-2.5.45.orig/include/asm-i386/string.h linux-2.5.45fix/include/asm-i386/string.h
--- linux-2.5.45.orig/include/asm-i386/string.h Wed Oct 30 22:43:46 2002
+++ linux-2.5.45fix/include/asm-i386/string.h Sun Nov 3 15:58:08 2002
@@ -3,6 +3,7 @@
#ifdef __KERNEL__
#include <linux/config.h>
+#include <linux/compiler.h>
/*
* On a 486 or Pentium, we are better off not using the
* byte string operations. But on a 386 or a PPro the
@@ -218,7 +219,7 @@
* This looks horribly ugly, but the compiler can optimize it totally,
* as the count is constant.
*/
-static inline void * __constant_memcpy(void * to, const void * from, size_t n)
+static force_inline void * __constant_memcpy(void * to, const void * from, size_t n)
{
switch (n) {
case 0:
@@ -453,7 +454,7 @@
* This looks horribly ugly, but the compiler can optimize it totally,
* as we by now know that both pattern and count is constant..
*/
-static inline void * __constant_c_and_count_memset(void * s, unsigned long pattern, size_t count)
+static force_inline void * __constant_c_and_count_memset(void * s, unsigned long pattern, size_t count)
{
switch (count) {
case 0:
diff -urN linux-2.5.45.orig/include/linux/compiler.h linux-2.5.45fix/include/linux/compiler.h
--- linux-2.5.45.orig/include/linux/compiler.h Wed Oct 30 22:43:05 2002
+++ linux-2.5.45fix/include/linux/compiler.h Sun Nov 3 15:19:20 2002
@@ -20,3 +20,11 @@
__asm__ ("" : "=g"(__ptr) : "0"(ptr)); \
(typeof(ptr)) (__ptr + (off)); })
#endif /* __LINUX_COMPILER_H */
+
+/* GCC 3 (and probably earlier, I'm not sure) can be told to always inline
+ a function. */
+#if __GNUC__ < 3
+#define force_inline inline
+#else
+#define force_inline inline __attribute__ ((always_inline))
+#endif
On 3 November 2002 23:21, Robert Love wrote:
> On Sun, 2002-11-03 at 11:21, Alan Cox wrote:
> > I would venture the reverse interpretation for modern processors,
> > the kernel inlines far far too much
>
> I agree 100% we mark too much as inline - but at the same time, some
> larger functions are inline simply because they were originally part
> of another function and pulled out for cleanliness. They are only
> used once... in other words, in some cases I hope the kernel
> developers know what they are doing when they mark stuff inline.
That's ok, but when a function gets large, the benefits of inlining are lost
in the noise.
And one day someone will add another call site, and sometimes people forget
to remove the 'inline'. Then we lose big time.
> So I can see why bumping this limit very high is a good thing.
Frankly, I'd say we should not inline anything large,
regardless of the number of call sites, for the reasons outlined above.
The inline limit is designed exactly for this purpose, I think.
--
vda
On Mon, Nov 04, 2002 at 02:04:42PM -0200, Denis Vlasenko wrote:
> Frankly, I'd say we should not inline anything large
> regardless of number of call sites, for reasons outlined above.
>
> The limit is designed exactly for this purpose I think.
But the limit doesn't work that way. First of all, the limit
ATM is O(tree nodes), where it is not known how many tree nodes
will become one instruction, or how many instructions one tree node will
expand into. That all depends on the exact code being inlined and how well
it can be optimized. A nice example is the kernel's constant stringop inlines.
And there is also the problem of inlining from inlines: it may very well
happen that gcc inlines a big function which is almost at the limit
boundary, but then does not inline any of the tiny functions which should be
inlined into it.
As the kernel uses inline explicitly and not -O3, I think the best approach
would be to use a bigger inline limit and remove the inline keyword from the
big functions which should not be inlined.
Of course, improving gcc's tree inliner is very desirable too.
Jakub
On Mon, 2002-11-04 at 01:21, Robert Love wrote:
> I agree 100% we mark too much as inline - but at the same time, some
> larger functions are inline simply because they were originally part of
> another function and pulled out for cleanliness. They are only used
> once... in other words, in some cases I hope the kernel developers know
> what they are doing when they mark stuff inline.
gcc actually appears to know about that case for static functions.
Alan Cox wrote:
> gcc actually appears to know about that case for static functions.
Hmm. gcc 3.1 inlines functions even when they are not marked "inline",
if -O3 or higher is used. It doesn't seem to special-case plain
"static", or even single-use "static".
Is gcc 3.2 better at this?
-falways-inline-static-singletons would be kind of nice to have.
- Werner
--
_________________________________________________________________________
/ Werner Almesberger, Buenos Aires, Argentina [email protected] /
/_http://www.almesberger.net/____________________________________________/
(sorry to reply to a not-final version of this mail, but I didn't keep the
last one. This applies anyway...)
On 2002.11.03 Denis Vlasenko wrote:
> On 3 November 2002 14:17, Denis Vlasenko wrote:
> > It seems gcc started to de-inline large functions.
[...]
> diff -urN linux-2.5.45.orig/include/linux/compiler.h linux-2.5.45fix/include/linux/compiler.h
> --- linux-2.5.45.orig/include/linux/compiler.h Wed Oct 30 22:43:05 2002
> +++ linux-2.5.45fix/include/linux/compiler.h Sun Nov 3 15:19:20 2002
> @@ -20,3 +20,11 @@
> __asm__ ("" : "=g"(__ptr) : "0"(ptr)); \
> (typeof(ptr)) (__ptr + (off)); })
> #endif /* __LINUX_COMPILER_H */
> +
> +/* GCC 3 (and probably earlier, I'm not sure) can be told to always inline
> + a function. */
> +#if __GNUC__ < 3
> +#define force_inline inline
> +#else
> +#define force_inline inline __attribute__ ((always_inline))
> +#endif
This should go before the #endif /* __LINUX_COMPILER_H */, shouldn't it?
--
J.A. Magallon <[email protected]> \ Software is like sex:
werewolf.able.es \ It's better when it's free
Mandrake Linux release 9.1 (Cooker) for i586
Linux 2.4.20-rc1-jam2 (gcc 3.2 (Mandrake Linux 9.1 3.2-3mdk))
On 2002.11.04 Denis Vlasenko wrote:
[...]
> __constant_c_and_count_memset *has to* be inlined.
> There is large switch statement which meant to be optimized out.
> It does optimize out *only if* count is compile-time constant.
I also got tons of __copy_to_user and __copy_from_user copies in 2.4.
Do they not show up in 2.5?
This is what I applied to my 2.4 tree:
diff -urN linux-2.5.45.orig/include/linux/compiler.h linux-2.5.45fix/include/linux/compiler.h
--- linux-2.5.45.orig/include/linux/compiler.h Wed Oct 30 22:43:05 2002
+++ linux-2.5.45fix/include/linux/compiler.h Sun Nov 3 15:19:20 2002
@@ -13,4 +13,12 @@
#define likely(x) __builtin_expect((x),1)
#define unlikely(x) __builtin_expect((x),0)
+/* GCC 3 (and probably earlier, I'm not sure) can be told to always inline
+ a function. */
+#if __GNUC__ < 3
+#define force_inline inline
+#else
+#define force_inline inline __attribute__ ((always_inline))
+#endif
+
#endif /* __LINUX_COMPILER_H */
diff -urN linux-2.5.45.orig/include/asm-i386/string.h linux-2.5.45fix/include/asm-i386/string.h
--- linux-2.5.45.orig/include/asm-i386/string.h Wed Oct 30 22:43:46 2002
+++ linux-2.5.45fix/include/asm-i386/string.h Sun Nov 3 15:58:08 2002
@@ -3,6 +3,7 @@
#ifdef __KERNEL__
#include <linux/config.h>
+#include <linux/compiler.h>
/*
* On a 486 or Pentium, we are better off not using the
* byte string operations. But on a 386 or a PPro the
@@ -218,7 +219,7 @@
* This looks horribly ugly, but the compiler can optimize it totally,
* as the count is constant.
*/
-static inline void * __constant_memcpy(void * to, const void * from, size_t n)
+static force_inline void * __constant_memcpy(void * to, const void * from, size_t n)
{
switch (n) {
case 0:
@@ -453,7 +454,7 @@
* This looks horribly ugly, but the compiler can optimize it totally,
* as we by now know that both pattern and count is constant..
*/
-static inline void * __constant_c_and_count_memset(void * s, unsigned long pattern, size_t count)
+static force_inline void * __constant_c_and_count_memset(void * s, unsigned long pattern, size_t count)
{
switch (count) {
case 0:
--- linux/include/asm-i386/uaccess.h.orig 2002-11-13 01:27:42.000000000 +0100
+++ linux/include/asm-i386/uaccess.h 2002-11-13 01:30:04.000000000 +0100
@@ -543,7 +543,7 @@
unsigned long __generic_copy_to_user(void *, const void *, unsigned long);
unsigned long __generic_copy_from_user(void *, const void *, unsigned long);
-static inline unsigned long
+static force_inline unsigned long
__constant_copy_to_user(void *to, const void *from, unsigned long n)
{
prefetch(from);
@@ -552,7 +552,7 @@
return n;
}
-static inline unsigned long
+static force_inline unsigned long
__constant_copy_from_user(void *to, const void *from, unsigned long n)
{
if (access_ok(VERIFY_READ, from, n))
@@ -562,14 +562,14 @@
return n;
}
-static inline unsigned long
+static force_inline unsigned long
__constant_copy_to_user_nocheck(void *to, const void *from, unsigned long n)
{
__constant_copy_user(to,from,n);
return n;
}
-static inline unsigned long
+static force_inline unsigned long
__constant_copy_from_user_nocheck(void *to, const void *from, unsigned long n)
{
__constant_copy_user_zeroing(to,from,n);
--
J.A. Magallon <[email protected]> \ Software is like sex:
werewolf.able.es \ It's better when it's free
Mandrake Linux release 9.1 (Cooker) for i586
Linux 2.4.20-rc1-jam2 (gcc 3.2 (Mandrake Linux 9.1 3.2-3mdk))
On 12 November 2002 23:28, J.A. Magallón wrote:
> On 2002.11.04 Denis Vlasenko wrote:
> [...]
>
> > __constant_c_and_count_memset *has to* be inlined.
> > There is large switch statement which meant to be optimized out.
> > It does optimize out *only if* count is compile-time constant.
>
> I also got tons of __copy_to_user and __copy_from_user in 2.4.
> Do not show in 2.5 ?
I haven't looked.
Maybe GCC can be told to emit warnings when it does this...
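(If I read the gcc manual right, -Winline is meant for exactly this: it warns
whenever a function declared inline could not be inlined. E.g. appended to the
compile line from my first mail:

    gcc ... -O2 -fomit-frame-pointer ... -Winline -c -o kernel/exit.o kernel/exit.c

it should have flagged __constant_memcpy right away.)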
> This i what I applied to my 2.4 tree:
> [snip]
Although the patch seems to achieve the desired effect, it isn't
included in 2.5 yet (last time I checked). I'd like to know the
verdict for 2.5 first.
--
vda
On 12 November 2002 22:10, J.A. Magallón wrote:
> (sorry to answer to not-final-version mail, but didn't keep the last
> one. This also applies, anyway...)
>
> On 2002.11.03 Denis Vlasenko wrote:
> > On 3 November 2002 14:17, Denis Vlasenko wrote:
> > > It seems gcc started to de-inline large functions.
>
> [...]
>
> > diff -urN linux-2.5.45.orig/include/linux/compiler.h
> > linux-2.5.45fix/include/linux/compiler.h ---
> > linux-2.5.45.orig/include/linux/compiler.h Wed Oct 30 22:43:05 2002
> > +++ linux-2.5.45fix/include/linux/compiler.h Sun Nov 3 15:19:20
> > 2002 @@ -20,3 +20,11 @@
> > __asm__ ("" : "=g"(__ptr) : "0"(ptr)); \
> > (typeof(ptr)) (__ptr + (off)); })
> > #endif /* __LINUX_COMPILER_H */
> > +
> > +/* GCC 3 (and probably earlier, I'm not sure) can be told to
> > always inline + a function. */
> > +#if __GNUC__ < 3
> > +#define force_inline inline
> > +#else
> > +#define force_inline inline __attribute__ ((always_inline))
> > +#endif
>
> This should go before the #endif /* __LINUX_COMPILER_H */, isn't it ?
Oh... You are right...
--
vda
On 13 November 2002 09:54, Denis Vlasenko wrote:
> On 12 November 2002 23:28, J.A. Magallón wrote:
> > On 2002.11.04 Denis Vlasenko wrote:
> > [...]
> >
> > > __constant_c_and_count_memset *has to* be inlined.
> > > There is large switch statement which meant to be optimized out.
> > > It does optimize out *only if* count is compile-time constant.
> >
> > I also got tons of __copy_to_user and __copy_from_user in 2.4.
> > Do not show in 2.5 ?
>
> I haven't looked.
I looked there; there are no _constant_XXX inlines in 2.5's uaccess.h.
BTW, what a joy to see sane asm formatting there!
No sight of '\t' ;)
BTW #2: to get sane asm formatting in the source _and_ pretty asm
generated by GCC, one would add a "\n" right after 'asm(':
__asm__ __volatile__("\n" <---------------
"1: mov a,b\n"
.....
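For example (a made-up fragment, only to show the layout, not from the tree):

    static inline unsigned long swap_bytes(unsigned long x)
    {
            /* hypothetical example: the leading "\n" makes the inline asm
             * start on a fresh line in gcc's generated .s output */
            __asm__ __volatile__("\n"
                    "1:     bswap %0\n"
                    : "=r" (x)
                    : "0" (x));
            return x;
    }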
--
vda