2015-05-11 17:58:09

by Denys Vlasenko

[permalink] [raw]
Subject: [PATCH] force inlining of spinlock ops

With both gcc 4.7.2 and 4.9.2, sometimes gcc mysteriously doesn't inline
very small functions we expect to be inlined. In particular,
with this config: http://busybox.net/~vda/kernel_config
there are more than a thousand copies of tiny spinlock-related functions:

$ nm --size-sort vmlinux | grep -iF ' t ' | uniq -c | grep -v '^ *1 ' | sort -rn | grep ' spin'
473 000000000000000b t spin_unlock_irqrestore
292 000000000000000b t spin_unlock
215 000000000000000b t spin_lock
134 000000000000000b t spin_unlock_irq
130 000000000000000b t spin_unlock_bh
120 000000000000000b t spin_lock_irq
106 000000000000000b t spin_lock_bh

Disassembly:

ffffffff81004720 <spin_lock>:
ffffffff81004720: 55 push %rbp
ffffffff81004721: 48 89 e5 mov %rsp,%rbp
ffffffff81004724: e8 f8 4e e2 02 callq <_raw_spin_lock>
ffffffff81004729: 5d pop %rbp
ffffffff8100472a: c3 retq

This patch fixes this via s/inline/__always_inline/ in spinlock.h.
This decreases vmlinux by about 40k:

text data bss dec hex filename
82375570 22255544 20627456 125258570 7774b4a vmlinux.before
82335059 22255416 20627456 125217931 776ac8b vmlinux

Signed-off-by: Denys Vlasenko <[email protected]>
Cc: Thomas Graf <[email protected]>
Cc: David S. Miller <[email protected]>
Cc: Bart Van Assche <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: David Rientjes <[email protected]>
Cc: David S. Miller <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Paul E. McKenney <[email protected]>
CC: [email protected]
---
include/linux/spinlock.h | 30 +++++++++++++++---------------
1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 3e18379..073925d 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -296,7 +296,7 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
* Map the spin_lock functions to the raw variants for PREEMPT_RT=n
*/

-static inline raw_spinlock_t *spinlock_check(spinlock_t *lock)
+static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock)
{
return &lock->rlock;
}
@@ -307,17 +307,17 @@ do { \
raw_spin_lock_init(&(_lock)->rlock); \
} while (0)

-static inline void spin_lock(spinlock_t *lock)
+static __always_inline void spin_lock(spinlock_t *lock)
{
raw_spin_lock(&lock->rlock);
}

-static inline void spin_lock_bh(spinlock_t *lock)
+static __always_inline void spin_lock_bh(spinlock_t *lock)
{
raw_spin_lock_bh(&lock->rlock);
}

-static inline int spin_trylock(spinlock_t *lock)
+static __always_inline int spin_trylock(spinlock_t *lock)
{
return raw_spin_trylock(&lock->rlock);
}
@@ -337,7 +337,7 @@ do { \
raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock); \
} while (0)

-static inline void spin_lock_irq(spinlock_t *lock)
+static __always_inline void spin_lock_irq(spinlock_t *lock)
{
raw_spin_lock_irq(&lock->rlock);
}
@@ -352,32 +352,32 @@ do { \
raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \
} while (0)

-static inline void spin_unlock(spinlock_t *lock)
+static __always_inline void spin_unlock(spinlock_t *lock)
{
raw_spin_unlock(&lock->rlock);
}

-static inline void spin_unlock_bh(spinlock_t *lock)
+static __always_inline void spin_unlock_bh(spinlock_t *lock)
{
raw_spin_unlock_bh(&lock->rlock);
}

-static inline void spin_unlock_irq(spinlock_t *lock)
+static __always_inline void spin_unlock_irq(spinlock_t *lock)
{
raw_spin_unlock_irq(&lock->rlock);
}

-static inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
+static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
{
raw_spin_unlock_irqrestore(&lock->rlock, flags);
}

-static inline int spin_trylock_bh(spinlock_t *lock)
+static __always_inline int spin_trylock_bh(spinlock_t *lock)
{
return raw_spin_trylock_bh(&lock->rlock);
}

-static inline int spin_trylock_irq(spinlock_t *lock)
+static __always_inline int spin_trylock_irq(spinlock_t *lock)
{
return raw_spin_trylock_irq(&lock->rlock);
}
@@ -387,22 +387,22 @@ static inline int spin_trylock_irq(spinlock_t *lock)
raw_spin_trylock_irqsave(spinlock_check(lock), flags); \
})

-static inline void spin_unlock_wait(spinlock_t *lock)
+static __always_inline void spin_unlock_wait(spinlock_t *lock)
{
raw_spin_unlock_wait(&lock->rlock);
}

-static inline int spin_is_locked(spinlock_t *lock)
+static __always_inline int spin_is_locked(spinlock_t *lock)
{
return raw_spin_is_locked(&lock->rlock);
}

-static inline int spin_is_contended(spinlock_t *lock)
+static __always_inline int spin_is_contended(spinlock_t *lock)
{
return raw_spin_is_contended(&lock->rlock);
}

-static inline int spin_can_lock(spinlock_t *lock)
+static __always_inline int spin_can_lock(spinlock_t *lock)
{
return raw_spin_can_lock(&lock->rlock);
}
--
1.8.1.4


2015-05-11 18:54:18

by Josh Triplett

[permalink] [raw]
Subject: Re: [PATCH] force inlining of spinlock ops

On Mon, May 11, 2015 at 07:57:22PM +0200, Denys Vlasenko wrote:
> With both gcc 4.7.2 and 4.9.2, sometimes gcc mysteriously doesn't inline
> very small functions we expect to be inlined. In particular,
> with this config: http://busybox.net/~vda/kernel_config
> there are more than a thousand copies of tiny spinlock-related functions:
>
> $ nm --size-sort vmlinux | grep -iF ' t ' | uniq -c | grep -v '^ *1 ' | sort -rn | grep ' spin'
> 473 000000000000000b t spin_unlock_irqrestore
> 292 000000000000000b t spin_unlock
> 215 000000000000000b t spin_lock
> 134 000000000000000b t spin_unlock_irq
> 130 000000000000000b t spin_unlock_bh
> 120 000000000000000b t spin_lock_irq
> 106 000000000000000b t spin_lock_bh
>
> Disassembly:
>
> ffffffff81004720 <spin_lock>:
> ffffffff81004720: 55 push %rbp
> ffffffff81004721: 48 89 e5 mov %rsp,%rbp
> ffffffff81004724: e8 f8 4e e2 02 callq <_raw_spin_lock>
> ffffffff81004729: 5d pop %rbp
> ffffffff8100472a: c3 retq

Frame pointers make this even more awful, since without them this could
just become a single jmp. (Assuming _raw_spin_lock shouldn't be
inlined too.)

> This patch fixes this via s/inline/__always_inline/ in spinlock.h.
> This decreases vmlinux by about 30k:
>
> text data bss dec hex filename
> 82375570 22255544 20627456 125258570 7774b4a vmlinux.before
> 82335059 22255416 20627456 125217931 776ac8b vmlinux

Nice improvement. Given that this actually makes the kernel *smaller*,
presumably in addition to faster, this forced inlining seems completely
reasonable.

> Signed-off-by: Denys Vlasenko <[email protected]>
> Cc: Thomas Graf <[email protected]>
> Cc: David S. Miller <[email protected]>
> Cc: Bart Van Assche <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: David Rientjes <[email protected]>
> Cc: David S. Miller <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: Linus Torvalds <[email protected]>
> Cc: Oleg Nesterov <[email protected]>
> Cc: Paul E. McKenney <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> Cc: Paul E. McKenney <[email protected]>
> CC: [email protected]

Reviewed-by: Josh Triplett <[email protected]>

2015-05-11 22:19:18

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] force inlining of spinlock ops

On Mon, 11 May 2015 19:57:22 +0200 Denys Vlasenko <[email protected]> wrote:

> With both gcc 4.7.2 and 4.9.2, sometimes gcc mysteriously doesn't inline
> very small functions we expect to be inlined. In particular,
> with this config: http://busybox.net/~vda/kernel_config
> there are more than a thousand copies of tiny spinlock-related functions:
>
> $ nm --size-sort vmlinux | grep -iF ' t ' | uniq -c | grep -v '^ *1 ' | sort -rn | grep ' spin'
> 473 000000000000000b t spin_unlock_irqrestore
> 292 000000000000000b t spin_unlock
> 215 000000000000000b t spin_lock
> 134 000000000000000b t spin_unlock_irq
> 130 000000000000000b t spin_unlock_bh
> 120 000000000000000b t spin_lock_irq
> 106 000000000000000b t spin_lock_bh
>
> Disassembly:
>
> ffffffff81004720 <spin_lock>:
> ffffffff81004720: 55 push %rbp
> ffffffff81004721: 48 89 e5 mov %rsp,%rbp
> ffffffff81004724: e8 f8 4e e2 02 callq <_raw_spin_lock>
> ffffffff81004729: 5d pop %rbp
> ffffffff8100472a: c3 retq
>
> This patch fixes this via s/inline/__always_inline/ in spinlock.h.
> This decreases vmlinux by about 30k:
>
> text data bss dec hex filename
> 82375570 22255544 20627456 125258570 7774b4a vmlinux.before
> 82335059 22255416 20627456 125217931 776ac8b vmlinux

See also https://lkml.org/lkml/2015/4/23/598 ("enforce function
inlining for hot functions").

Presumably Hagen didn't see the issue with spinlock functions. I
wonder why not.

I suppose we should get both these consolidated into a coherent whole.

It's a bit irritating to have to do this: presumably gcc will get fixed
and the huge sprinkling of __always_inline will become less and less
relevant over time and people will have trouble distinguishing "real
__always_inline which was put here for a purpose" from "dopey
__always_inline to work around a short-term gcc glitch".

__always_inline is one of those things where a usage site should always
be commented, because it's near impossible to work out why someone
chose to use it. Quick, tell me what's happening in include/linux/slab.h.




Perhaps we should do

/*
* Comment goes here. It is very specific about gcc versions.
*/
#define inline_for_broken_gcc __always_inline

and then use inline_for_broken_gcc everywhere. That way, the reason
for the marker is self-explanatory and we can later hunt all these
things down and remove them.

Also, the inline_for_broken_gcc definition can be made dependent on
particular gcc versions, which will allow us to easily keep an eye on
the behaviour of later gcc versions.

2015-05-12 07:44:52

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] force inlining of spinlock ops


* Denys Vlasenko <[email protected]> wrote:

> With both gcc 4.7.2 and 4.9.2, sometimes gcc mysteriously doesn't inline
> very small functions we expect to be inlined. In particular,
> with this config: http://busybox.net/~vda/kernel_config
> there are more than a thousand copies of tiny spinlock-related functions:

That's an x86-64 allyesconfig AFAICS, right?

It's not mysterious, but an effect of -Os plus allowing GCC to do
inlining heuristics:

CONFIG_CC_OPTIMIZE_FOR_SIZE=y
CONFIG_OPTIMIZE_INLINING=y

Does the problem go away if you unset one of these config options?

Furthermore, what is the size win on x86 defconfig with these options
set? allyesconfig has all sorts of crazy stuff enabled while defconfig
on x86 tries to track typical distro configs.

Thanks,

Ingo

2015-05-12 08:16:47

by Hagen Paul Pfeifer

[permalink] [raw]
Subject: Re: [PATCH] force inlining of spinlock ops

* Andrew Morton | 2015-05-11 15:19:13 [-0700]:

>Presumably Hagen didn't see the issue with spinlock functions. I
>wonder why not.

I think it is a compiler version thing. Not sure why I didn't see it.

>I suppose we should get both these consolidated into a coherent whole.

+1 (let's wait for a moment and delay patch inclusion)

>It's a bit irritating to have to do this: presumably gcc will get fixed
>and the huge sprinkling of __always_inline will become less and less
>relevant over time and people will have trouble distinguishing "real
>__always_inline which was put here for a purpose" from "dopey
>__always_inline to work around a short-term gcc glitch".
>
>__always_inline is one of those things where a usage site should always
>be commented, because it's near impossible to work out why someone
>chose to use it. Quick, tell me what's happening in include/linux/slab.h.
>
>
>Perhaps we should do
>
>/*
> * Comment goes here. It is very specific about gcc versions.
> */
>#define inline_for_broken_gcc __always_inline

yeah, but name it in a compiler-independent way. Sometimes we may see similar
misbehavior with clang too. But see my other comments

#define inline_for_broken_cc __always_inline

>and then use inline_for_broken_gcc everywhere. That way, the reason
>for the marker is self-explanatory and we can later hunt all these
>things down and remvoe them.
>
>Also, the inline_for_broken_gcc definition can be made dependent on
>particular gcc versions, which will allow us to easily keep an eye on
>the behaviour of later gcc versions.

Mhh, I am not a big fan of this. I think we maneuver ourselves into an
unmaintainable area with this approach. We would have to test and check this
for all compiler versions, new versions, all kinds of compiler flags, etc.

Another idea: we are talking about roughly 50 functions where inlining is mission
critical (and correct) but gcc sometimes fails to do it. Why not
enforce __always_inline there? E.g. annotate these rare functions with
enforce_inline to highlight that these functions are always inlined, no matter
what optimization level and what compiler flags:

#define enforce_inline __always_inline

Developers are encouraged to use inline - because then the compiler can decide
based on its algorithms/heuristics if a function should be inlined or not. For
some really hot & short functions the developer can use enforce_inline
- but this should be an exception.

Hagen

2015-05-12 09:45:38

by Denys Vlasenko

[permalink] [raw]
Subject: Re: [PATCH] force inlining of spinlock ops

On 05/12/2015 12:19 AM, Andrew Morton wrote:
> On Mon, 11 May 2015 19:57:22 +0200 Denys Vlasenko <[email protected]> wrote:
>
>> With both gcc 4.7.2 and 4.9.2, sometimes gcc mysteriously doesn't inline
>> very small functions we expect to be inlined. In particular,
>> with this config: http://busybox.net/~vda/kernel_config
>> there are more than a thousand copies of tiny spinlock-related functions:
>>
>> $ nm --size-sort vmlinux | grep -iF ' t ' | uniq -c | grep -v '^ *1 ' | sort -rn | grep ' spin'
>> 473 000000000000000b t spin_unlock_irqrestore
>> 292 000000000000000b t spin_unlock
>> 215 000000000000000b t spin_lock
>> 134 000000000000000b t spin_unlock_irq
>> 130 000000000000000b t spin_unlock_bh
>> 120 000000000000000b t spin_lock_irq
>> 106 000000000000000b t spin_lock_bh
>>
>> Disassembly:
>>
>> ffffffff81004720 <spin_lock>:
>> ffffffff81004720: 55 push %rbp
>> ffffffff81004721: 48 89 e5 mov %rsp,%rbp
>> ffffffff81004724: e8 f8 4e e2 02 callq <_raw_spin_lock>
>> ffffffff81004729: 5d pop %rbp
>> ffffffff8100472a: c3 retq
>>
>> This patch fixes this via s/inline/__always_inline/ in spinlock.h.
>> This decreases vmlinux by about 30k:
>>
>> text data bss dec hex filename
>> 82375570 22255544 20627456 125258570 7774b4a vmlinux.before
>> 82335059 22255416 20627456 125217931 776ac8b vmlinux
>
> See also https://lkml.org/lkml/2015/4/23/598 ("enforce function
> inlining for hot functions").
>
> Presumably Hagen didn't see the issue with spinlock functions. I
> wonder why not.
>
> I suppose we should get both these consolidated into a coherent whole.
>
> It's a bit irritating to have to do this: presumably gcc will get fixed
> and the huge sprinkling of __always_inline will become less and less
> relevant over time and people will have trouble distinguishing "real
> __always_inline which was put here for a purpose" from "dopey
> __always_inline to work around a short-term gcc glitch".

In my patches, I put __always_inline *only* on functions
where my measurements show a large size decrease from doing so.
*Not* on functions where "I think it may be a good idea".

So far, all such functions were so trivial that inlining decision there
is a no-brainer.

> and then use inline_for_broken_gcc everywhere. That way, the reason
> for the marker is self-explanatory and we can later hunt all these
> things down and remvoe them.
>
> Also, the inline_for_broken_gcc definition can be made dependent on
> particular gcc versions, which will allow us to easily keep an eye on
> the behaviour of later gcc versions.

I've seen it on gcc-4.7.2 and gcc-4.9.2, so this behavior is not
limited to a narrow range of gcc versions. I'd say by now about half
of running kernels can easily be affected.

--
vda

2015-05-12 09:48:48

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] force inlining of spinlock ops


* Denys Vlasenko <[email protected]> wrote:

> > Also, the inline_for_broken_gcc definition can be made dependent
> > on particular gcc versions, which will allow us to easily keep an
> > eye on the behaviour of later gcc versions.
>
> I've seen it on gcc-4.7.2 and gcc-4.9.2, so this behavior is not
> limited to a narrow range of gcc versions. I'd say by now about half
> of running kernels can easily be affected.

Please do the measurements on x86 defconfig (with OPTIMIZE_FOR_SIZE
and OPTIMIZE_INLINING enabled if necessary), to make sure we are truly
getting a decrease in kernel size on common distro configs as well.

Thanks,

Ingo

2015-05-12 11:03:07

by Denys Vlasenko

[permalink] [raw]
Subject: Re: [PATCH] force inlining of spinlock ops

On 05/12/2015 09:44 AM, Ingo Molnar wrote:
>
> * Denys Vlasenko <[email protected]> wrote:
>
>> With both gcc 4.7.2 and 4.9.2, sometimes gcc mysteriously doesn't inline
>> very small functions we expect to be inlined. In particular,
>> with this config: http://busybox.net/~vda/kernel_config
>> there are more than a thousand copies of tiny spinlock-related functions:
>
> That's an x86-64 allyesconfig AFAICS, right?

Close, but I disabled options which are clearly "heavy debugging" stuff.
IOW: many developers run their work machines with lock debugging etc,
but few would constantly use something which slows kernel down by a factor of 3!

So, CONFIG_KASAN is off. CONFIG_STAGING is also off. And a few others I forgot.

I'm using this config to see which inlines should be deinlined.
For that, I need to cover all callsites of each inline.
Thus, I need ~allyesconfig.

The discovery that there also exists the opposite problem (wrongly
*un*inlined functions) was accidental.


> It's not mysterious, but an effect of -Os plus allowing GCC to do
> inlining heuristics:
>
> CONFIG_CC_OPTIMIZE_FOR_SIZE=y
> CONFIG_OPTIMIZE_INLINING=y
>
> Does the problem go away if you unset of these config options?

With CONFIG_CC_OPTIMIZE_FOR_SIZE off,
problem greatly diminishes, but is not eliminated.
Testing allyesconfig would take too long, so I just took defconfig.

On defconfig kernel, the following functions below 16 bytes
of machine code are auto-deinlined:

#Calls_ Size(hex)_______ Name____________________
7 000000000000000b t hweight_long
5 000000000000000f t init_once
4 000000000000000d t cpumask_set_cpu
4 000000000000000b t udp_lib_close
4 0000000000000006 t udp_lib_hash
3 000000000000000a t nofill
3 0000000000000006 t sg_set_page.part.7
2 000000000000000f t udplite_sk_init
2 000000000000000f t ct_seq_next
2 000000000000000e t encode_cookie
2 000000000000000d t ktime_get_real
2 000000000000000b t spin_lock
2 000000000000000b t device_create_release
2 000000000000000b t cpu_smt_flags
2 000000000000000b t cpu_core_flags
2 0000000000000009 t default_write_file
2 0000000000000008 t __initcall_pl_driver_init6
2 0000000000000008 t __initcall_nf_defrag_init6
2 0000000000000008 t __initcall_hid_init6
2 0000000000000008 t __initcall_ch_driver_init6
2 0000000000000008 t default_read_file
2 0000000000000006 t wiphy_to_rdev.part.4
2 0000000000000006 t s_stop
2 0000000000000006 t sg_set_page.part.3
2 0000000000000006 t generic_print_tuple
2 0000000000000006 t exp_seq_stop
2 0000000000000006 t ct_seq_stop
2 0000000000000006 t ct_cpu_seq_stop

In particular, one of the functions from my patches,
spin_lock(), has been auto-deinlined:

ffffffff8108adb0 <spin_lock>:
ffffffff8108adb0: 55 push %rbp
ffffffff8108adb1: 48 89 e5 mov %rsp,%rbp
ffffffff8108adb4: e8 37 db 81 00 callq ffffffff818a88f0 <_raw_spin_lock>
ffffffff8108adb9: 5d pop %rbp
ffffffff8108adba: c3 retq


> Furtermore, what is the size win on x86 defconfig with these options
> set?

CONFIG_OPTIMIZE_INLINING=y is in defconfig.

Size difference for CC_OPTIMIZE_FOR_SIZE:

text data bss dec hex filename
12335864 1746152 1081344 15163360 e75fe0 vmlinux.CC_OPTIMIZE_FOR_SIZE=y
10373764 1684200 1077248 13135212 c86d6c vmlinux.CC_OPTIMIZE_FOR_SIZE=n

Decrease by about 19%.

--
vda

2015-05-12 11:44:01

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] force inlining of spinlock ops


* Denys Vlasenko <[email protected]> wrote:

> On 05/12/2015 09:44 AM, Ingo Molnar wrote:
> >
> > * Denys Vlasenko <[email protected]> wrote:
> >
> >> With both gcc 4.7.2 and 4.9.2, sometimes gcc mysteriously doesn't inline
> >> very small functions we expect to be inlined. In particular,
> >> with this config: http://busybox.net/~vda/kernel_config
> >> there are more than a thousand copies of tiny spinlock-related functions:
> >
> > That's an x86-64 allyesconfig AFAICS, right?
>
> Close, but I disabled options which are clearly "heavy debugging" stuff.
> IOW: many developers run their work machines with lock debugging etc,
> but few would constantly use something which slows kernel down by a factor of 3!
>
> So, CONFIG_KASAN is off. CONFIG_STAGING is also off. And a few others I forgot.
>
> I'm using this config to see which inlines should be deinlined.
> For that, I need to cover all callsites of each inline.
> Thus, I need ~allyesconfig.
>
> The discovery that there also exists the opposite problem (wrongly
> *un*inlined functions) was accidental.
>
>
> > It's not mysterious, but an effect of -Os plus allowing GCC to do
> > inlining heuristics:
> >
> > CONFIG_CC_OPTIMIZE_FOR_SIZE=y
> > CONFIG_OPTIMIZE_INLINING=y
> >
> > Does the problem go away if you unset of these config options?
>
> With CONFIG_CC_OPTIMIZE_FOR_SIZE off,
> problem greatly diminishes, but is not eliminated.
> Testing allyesconfig would take too long, so I just took defconfig.
>
> On defconfig kernel, the following functions below 16 bytes
> of machine code are auto-deinlined:
>
> #Calls_ Size(hex)_______ Name____________________
> 7 000000000000000b t hweight_long
> 5 000000000000000f t init_once
> 4 000000000000000d t cpumask_set_cpu
> 4 000000000000000b t udp_lib_close
> 4 0000000000000006 t udp_lib_hash
> 3 000000000000000a t nofill
> 3 0000000000000006 t sg_set_page.part.7
> 2 000000000000000f t udplite_sk_init
> 2 000000000000000f t ct_seq_next
> 2 000000000000000e t encode_cookie
> 2 000000000000000d t ktime_get_real
> 2 000000000000000b t spin_lock
> 2 000000000000000b t device_create_release
> 2 000000000000000b t cpu_smt_flags
> 2 000000000000000b t cpu_core_flags
> 2 0000000000000009 t default_write_file
> 2 0000000000000008 t __initcall_pl_driver_init6
> 2 0000000000000008 t __initcall_nf_defrag_init6
> 2 0000000000000008 t __initcall_hid_init6
> 2 0000000000000008 t __initcall_ch_driver_init6
> 2 0000000000000008 t default_read_file
> 2 0000000000000006 t wiphy_to_rdev.part.4
> 2 0000000000000006 t s_stop
> 2 0000000000000006 t sg_set_page.part.3
> 2 0000000000000006 t generic_print_tuple
> 2 0000000000000006 t exp_seq_stop
> 2 0000000000000006 t ct_seq_stop
> 2 0000000000000006 t ct_cpu_seq_stop
>
> In particular, one of the functions from my patches,
> spin_lock(), has been auto-deinlined:
>
> ffffffff8108adb0 <spin_lock>:
> ffffffff8108adb0: 55 push %rbp
> ffffffff8108adb1: 48 89 e5 mov %rsp,%rbp
> ffffffff8108adb4: e8 37 db 81 00 callq ffffffff818a88f0 <_raw_spin_lock>
> ffffffff8108adb9: 5d pop %rbp
> ffffffff8108adba: c3 retq
>
>
> > Furtermore, what is the size win on x86 defconfig with these options
> > set?
>
> CONFIG_OPTIMIZE_INLINING=y is in defconfig.
>
> Size difference for CC_OPTIMIZE_FOR_SIZE:
>
> text data bss dec hex filename
> 12335864 1746152 1081344 15163360 e75fe0 vmlinux.CC_OPTIMIZE_FOR_SIZE=y
> 10373764 1684200 1077248 13135212 c86d6c vmlinux.CC_OPTIMIZE_FOR_SIZE=n
>
> Decrease by about 19%.

I suspect the 'filename' field wants to be flipped?

In any case, the interesting measurement would not be -Os comparisons
(which causes GCC to be too crazy), but to see the size effect of your
_patch_ that always-inlines spinlock ops, on plain defconfig and on
defconfig-Os.

Thanks,

Ingo

2015-05-12 13:14:13

by Denys Vlasenko

[permalink] [raw]
Subject: Re: [PATCH] force inlining of spinlock ops

On 05/12/2015 01:43 PM, Ingo Molnar wrote:
>>> Furtermore, what is the size win on x86 defconfig with these options
>>> set?
>>
>> CONFIG_OPTIMIZE_INLINING=y is in defconfig.
>>
>> Size difference for CC_OPTIMIZE_FOR_SIZE:
>>
>> text data bss dec hex filename
>> 12335864 1746152 1081344 15163360 e75fe0 vmlinux.CC_OPTIMIZE_FOR_SIZE=y
>> 10373764 1684200 1077248 13135212 c86d6c vmlinux.CC_OPTIMIZE_FOR_SIZE=n
>>
>> Decrease by about 19%.
>
> I suspect the 'filename' field wants to be flipped?

Yes.

> In any case, the interesting measurement would not be -Os comparisons
> (which causes GCC to be too crazy), but to see the size effect of your
> _patch_ that always-inlines spinlock ops, on plain defconfig and on
> defconfig-Os.

Here it is:

text data bss dec hex filename
12335864 1746152 1081344 15163360 e75fe0 vmlinuxO2.before
12335930 1746152 1081344 15163426 e76022 vmlinux

text data bss dec hex filename
10373764 1684200 1077248 13135212 c86d6c vmlinuxOs.before
10363621 1684200 1077248 13125069 c845cd vmlinux

2015-05-13 10:17:51

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] force inlining of spinlock ops


* Denys Vlasenko <[email protected]> wrote:

> On 05/12/2015 01:43 PM, Ingo Molnar wrote:
> >>> Furtermore, what is the size win on x86 defconfig with these options
> >>> set?
> >>
> >> CONFIG_OPTIMIZE_INLINING=y is in defconfig.
> >>
> >> Size difference for CC_OPTIMIZE_FOR_SIZE:
> >>
> >> text data bss dec hex filename
> >> 12335864 1746152 1081344 15163360 e75fe0 vmlinux.CC_OPTIMIZE_FOR_SIZE=y
> >> 10373764 1684200 1077248 13135212 c86d6c vmlinux.CC_OPTIMIZE_FOR_SIZE=n
> >>
> >> Decrease by about 19%.
> >
> > I suspect the 'filename' field wants to be flipped?
>
> Yes.
>
> > In any case, the interesting measurement would not be -Os comparisons
> > (which causes GCC to be too crazy), but to see the size effect of your
> > _patch_ that always-inlines spinlock ops, on plain defconfig and on
> > defconfig-Os.
>
> Here it is:
>
> text data bss dec hex filename
> 12335864 1746152 1081344 15163360 e75fe0 vmlinuxO2.before
> 12335930 1746152 1081344 15163426 e76022 vmlinux

Hm, that's a (small) size increase on O2.

That might be a net positive though: because now we've eliminated
quite a few function calls. Do we know which individual functions
bloat and which debloat?

> text data bss dec hex filename
> 10373764 1684200 1077248 13135212 c86d6c vmlinuxOs.before
> 10363621 1684200 1077248 13125069 c845cd vmlinux

A decrease - which gets exploded on allyesconfig.

So as long as the -O2 case does not get hurt we can do -Os fixes.

I think this needs a bit more work to ensure that the O2 case is a net
win.

Thanks,

Ingo

2015-05-13 10:29:02

by Denys Vlasenko

[permalink] [raw]
Subject: Re: [PATCH] force inlining of spinlock ops

On 05/13/2015 12:17 PM, Ingo Molnar wrote:
>>> In any case, the interesting measurement would not be -Os comparisons
>>> (which causes GCC to be too crazy), but to see the size effect of your
>>> _patch_ that always-inlines spinlock ops, on plain defconfig and on
>>> defconfig-Os.
>>
>> Here it is:
>>
>> text data bss dec hex filename
>> 12335864 1746152 1081344 15163360 e75fe0 vmlinuxO2.before
>> 12335930 1746152 1081344 15163426 e76022 vmlinux
>
> Hm, that's a (small) size increase on O2.
>
> That might be a net positive though: because now we've eliminated
> quite a few function calls. Do we know which individual functions
> bloat and which debloat?

>> text data bss dec hex filename
>> 10373764 1684200 1077248 13135212 c86d6c vmlinuxOs.before
>> 10363621 1684200 1077248 13125069 c845cd vmlinux
>
> A decrease - which gets exploded on allyesconfig.
>
> So as long as the -O2 case does not get hurt we can do -Os fixes.
>
> I think this needs a bit more work to ensure that the O2 case is a net
> win.

I think O2 difference is just noise: with -O2 gcc is far less prone
to bogus deinlining, my patch should have negligible effect.
And effect is indeed negligible: +70 bytes on 12 megabytes.

2015-05-13 10:43:13

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] force inlining of spinlock ops


* Denys Vlasenko <[email protected]> wrote:

> On 05/13/2015 12:17 PM, Ingo Molnar wrote:
> >>> In any case, the interesting measurement would not be -Os comparisons
> >>> (which causes GCC to be too crazy), but to see the size effect of your
> >>> _patch_ that always-inlines spinlock ops, on plain defconfig and on
> >>> defconfig-Os.
> >>
> >> Here it is:
> >>
> >> text data bss dec hex filename
> >> 12335864 1746152 1081344 15163360 e75fe0 vmlinuxO2.before
> >> 12335930 1746152 1081344 15163426 e76022 vmlinux
> >
> > Hm, that's a (small) size increase on O2.
> >
> > That might be a net positive though: because now we've eliminated
> > quite a few function calls. Do we know which individual functions
> > bloat and which debloat?
>
> >> text data bss dec hex filename
> >> 10373764 1684200 1077248 13135212 c86d6c vmlinuxOs.before
> >> 10363621 1684200 1077248 13125069 c845cd vmlinux
> >
> > A decrease - which gets exploded on allyesconfig.
> >
> > So as long as the -O2 case does not get hurt we can do -Os fixes.
> >
> > I think this needs a bit more work to ensure that the O2 case is a
> > net win.
>
> I think O2 difference is just noise: with -O2 gcc is far less prone
> to bogus deinlining, my patch should have negligible effect. And
> effect is indeed negligible: +70 bytes on 12 megabytes.

So the patch force-inlines about a dozen locking APIs:

- Some of those decrease the defconfig kernel size.
Which ones and by how much?

- Some of those increase the defconfig kernel size.
Which ones and by how much?

We only know that the net effect is +70 bytes. Does that come out of:

- large fluctuations such as -1000-1000+1000+1070, which happens to
net out into a small net number?

- or does it come from much smaller fluctuations?

So to make an informed decision we need to know those details. When I
deinline or reinline functions I usually do it on a per function
basis, to avoid such ambiguity.

In the end what we want to have is only those deinlining/reinlining
changes that decrease the defconfig kernel size, or at worst only
increase it marginally.

Thanks,

Ingo

2015-05-13 14:10:10

by Denys Vlasenko

[permalink] [raw]
Subject: Re: [PATCH] force inlining of spinlock ops

On 05/13/2015 12:43 PM, Ingo Molnar wrote:
>
> * Denys Vlasenko <[email protected]> wrote:
>
>> On 05/13/2015 12:17 PM, Ingo Molnar wrote:
>>>>> In any case, the interesting measurement would not be -Os comparisons
>>>>> (which causes GCC to be too crazy), but to see the size effect of your
>>>>> _patch_ that always-inlines spinlock ops, on plain defconfig and on
>>>>> defconfig-Os.
>>>>
>>>> Here it is:
>>>>
>>>> text data bss dec hex filename
>>>> 12335864 1746152 1081344 15163360 e75fe0 vmlinuxO2.before
>>>> 12335930 1746152 1081344 15163426 e76022 vmlinux
>>>
>>> Hm, that's a (small) size increase on O2.
>>>
>>> That might be a net positive though: because now we've eliminated
>>> quite a few function calls. Do we know which individual functions
>>> bloat and which debloat?
>>
>>>> text data bss dec hex filename
>>>> 10373764 1684200 1077248 13135212 c86d6c vmlinuxOs.before
>>>> 10363621 1684200 1077248 13125069 c845cd vmlinux
>>>
>>> A decrease - which gets exploded on allyesconfig.
>>>
>>> So as long as the -O2 case does not get hurt we can do -Os fixes.
>>>
>>> I think this needs a bit more work to ensure that the O2 case is a
>>> net win.
>>
>> I think O2 difference is just noise: with -O2 gcc is far less prone
>> to bogus deinlining, my patch should have negligible effect. And
>> effect is indeed negligible: +70 bytes on 12 megabytes.
>
> So the patch force-inlines about a dozen locking APIs:
>
> - Some of those decrease the defconfig kernel size.
> Which ones and by how much?
>
> - Some of those increase the defconfig kernel size.
> Which ones and by how much?
>
> We only know that the net effect is +70 bytes. Does that come out of:
>
> - large fluctuations such as -1000-1000+1000+1070, which happens to
> net out into a small net number?
>
> - or does it come from much smaller fluctuations?
>
> So to make an informed decision we need to know those details.

Fair enough. Let's investigate.

I produced a list of functions with their sizes from each vmlinux,
and diffed them:

$ nm --size-sort vmlinux | sed 's/\.[0-9]*.*/.NNN/' >vmlinux.nm
$ nm --size-sort vmlinuxO2.before | sed 's/\.[0-9]*.*/.NNN/' >vmlinuxO2.before.nm
$ diff -u vmlinuxO2.before.nm vmlinux.nm | grep -v '^[ @]' >vmlinux.nm.dif

I see the following:

- spin_[un]lock_foo's are gone as expected.
- some other functions got spuriously deinlined
(such as __raw_spin_unlock).
- yet other functions which were spuriously deinlined before,
now aren't deinlined (such as nf_conntrack_put).

--- vmlinuxO2.before.nm 2015-05-13 15:46:37.147058665 +0200
+++ vmlinux.nm 2015-05-13 15:46:26.079086233 +0200
+0000000000000009 t ipc_unlock_object
+0000000000000009 t __raw_spin_unlock
+0000000000000009 t __raw_spin_unlock
-0000000000000009 t spin_unlock
-000000000000000b t spin_lock
-000000000000000b t spin_lock
-000000000000000b t spin_unlock_irqrestore
+000000000000000d t task_unlock
+0000000000000011 t arch_spin_is_locked
+0000000000000013 t double_unlock_hb
-000000000000001c t nf_conntrack_put
-000000000000001d t ctnetlink_done_list
+000000000000001e t unix_state_double_unlock
+0000000000000025 t check_and_drop
-0000000000000025 t hugetlbfs_inc_free_inodes.NNN
-0000000000000025 t sem_lock.NNN
-0000000000000027 t check_and_drop

- many other functions now have slightly different sizes.
For example:
ext4_mb_pa_callback grew from 0x28 to 0x2a bytes (+2 bytes)
hid_alloc_report_buf shrank from 0x2a to 0x28 bytes (-2 bytes)

-0000000000000028 t ext4_mb_pa_callback
+0000000000000028 T hid_alloc_report_buf
+0000000000000028 t nf_conntrack_double_unlock
+0000000000000029 t do_shm_rmid
+000000000000002a t ext4_mb_pa_callback
-000000000000002a T hid_alloc_report_buf

Let's take a look at ext4_mb_pa_callback.
The difference stems from gcc choosing a different scratch register
(patched vmlinux on the left, vmlinuxO2.before on the right):

<ext4_mb_pa_callback>: <ext4_mb_pa_callback>:
8b 47 14 mov 0x14(%rdi),%eax 8b 47 14 mov 0x14(%rdi),%eax
48 8d 77 e0 lea -0x20(%rdi),%rsi 48 8d 77 e0 lea -0x20(%rdi),%rsi
85 c0 test %eax,%eax 85 c0 test %eax,%eax
75 1b jne <ext4_mb_pa_callback+0x26> 75 19 jne <ext4_mb_pa_callback+0x24>
44 8b 57 18 mov 0x18(%rdi),%r10d 8b 7f 18 mov 0x18(%rdi),%edi
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
45 85 d2 test %r10d,%r10d 85 ff test %edi,%edi
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
74 14 je <ext4_mb_pa_callback+0x28> 74 14 je <ext4_mb_pa_callback+0x26>
55 push %rbp 55 push %rbp
48 8b 3d e4 b8 eb 00 mov 0xebb8e4(%rip),%rdi 48 8b 3d 96 c0 eb 00 mov 0xebc096(%rip),%rdi
48 89 e5 mov %rsp,%rbp 48 89 e5 mov %rsp,%rbp
e8 8c 3f f4 ff callq <kmem_cache_free> e8 fe 46 f4 ff callq <kmem_cache_free>
5d pop %rbp 5d pop %rbp
c3 retq c3 retq
0f 0b ud2 0f 0b ud2
0f 0b ud2 0f 0b ud2

Working with r10 requires a REX prefix in the two underlined instructions,
which caused the function to grow by 2 bytes.

This is not in any way related to my patch.

I think this confirms my hypothesis that the difference of +70 bytes is noise.

The remainder of the diff is below:

-000000000000002b t do_shm_rmid
+000000000000002b T jbd2_journal_ack_err
-000000000000002c t ctnetlink_done
+000000000000002d t ctnetlink_done_list
-000000000000002d T jbd2_journal_ack_err
-000000000000002e t amd_iommu_stats_add
-000000000000002e t hid_parser_reserved
+0000000000000030 t amd_iommu_stats_add
+0000000000000030 t hid_parser_reserved
-0000000000000034 T acpi_ec_unblock_transactions
+0000000000000034 T usb_hub_to_struct_hub
-0000000000000036 t acpi_ec_stopped
+0000000000000036 T acpi_ec_unblock_transactions
-0000000000000037 T completion_done
-000000000000003c T usb_hub_to_struct_hub
+000000000000003e T completion_done
-000000000000003e t ehci_clear_tt_buffer.NNN
+000000000000003f t acpi_ec_gpe_handler
+000000000000003f t ehci_clear_tt_buffer.NNN
-0000000000000041 t acpi_ec_gpe_handler
+0000000000000042 T usb_wakeup_notification
-0000000000000044 T acpi_boot_ec_enable
-0000000000000044 T usb_wakeup_notification
+0000000000000045 T acpi_boot_ec_enable
-0000000000000046 t hugetlbfs_destroy_inode
+0000000000000047 t ctnetlink_done
+0000000000000049 T efivar_entry_iter
+000000000000004a t turn_on_io_watchdog
-000000000000004b T efivar_entry_iter
-000000000000004c t init_fat_fs
-000000000000004c t turn_on_io_watchdog
+000000000000004e t init_fat_fs
+0000000000000054 T shm_destroy_orphaned
-0000000000000055 t sem_wait_array
+0000000000000056 t dm_dirty_log_init
-0000000000000056 T shm_destroy_orphaned
-0000000000000057 t dm_dirty_log_init
+0000000000000058 t fat_evict_inode
+0000000000000058 t mirror_available
-0000000000000059 t mirror_available
-000000000000005a t fat_evict_inode
+000000000000005b t sem_wait_array
-000000000000005c T blk_pre_runtime_suspend
+000000000000005c t free_dev_data
-000000000000005d t free_dev_data
+000000000000005f t hugetlbfs_destroy_inode
+0000000000000064 T blk_pre_runtime_suspend
+0000000000000071 t metadata_show
+0000000000000071 t update_changeattr
-0000000000000072 t update_changeattr
-0000000000000073 t hugetlbfs_alloc_inode
-0000000000000076 t fat_calc_dir_size
+0000000000000079 t fat_calc_dir_size
-0000000000000079 t metadata_show
+000000000000007c t rtc_dev_open
+000000000000007d t hugetlbfs_alloc_inode
-000000000000007e t ptrace_unfreeze_traced.NNN
-000000000000007f t rtc_dev_open
+0000000000000080 t ptrace_unfreeze_traced.NNN
-0000000000000083 T vfs_whiteout
+0000000000000086 T vfs_whiteout
-0000000000000087 t read_disk_sb.NNN
+0000000000000089 T assert_forcewakes_inactive
+000000000000008a t read_disk_sb.NNN
-000000000000008b T assert_forcewakes_inactive
-000000000000008e t cache_ioctl.NNN
+0000000000000093 T usbhid_close
-0000000000000094 t nf_nat_init
+0000000000000095 t nf_nat_init
+0000000000000096 t cache_ioctl.NNN
+0000000000000096 t __lock_sock
-0000000000000096 T usbhid_close
-0000000000000097 t __lock_sock
+0000000000000098 T jbd2_trans_will_send_data_barrier
-000000000000009a T jbd2_trans_will_send_data_barrier
+000000000000009b T drm_crtc_vblank_reset
-000000000000009d T drm_crtc_vblank_reset
+00000000000000a6 T md_setup_cluster
+00000000000000aa t hugetlbfs_statfs
+00000000000000ab t uhci_hcd_init
-00000000000000ac t hugetlbfs_statfs
-00000000000000ac t uhci_hcd_init
+00000000000000b1 t gro_cell_poll
-00000000000000b2 t gro_cell_poll
+00000000000000b2 t nf_conntrack_double_lock
-00000000000000b4 T md_setup_cluster
-00000000000000b4 t pps_init
+00000000000000b5 t pps_init
-00000000000000bb T svc_authenticate
+00000000000000bd T svc_authenticate
-00000000000000be t ehci_poll_ASS
+00000000000000c1 t ehci_poll_PSS
+00000000000000c1 t loop_queue_write_work
+00000000000000c5 T usb_disable_lpm
+00000000000000c6 t ehci_poll_ASS
-00000000000000c7 t nf_conntrack_double_lock
-00000000000000c7 T usb_disable_lpm
-00000000000000c9 t ehci_poll_PSS
+00000000000000cb t xfrm_del_sa
+00000000000000cd t snd_timer_user_ccallback
-00000000000000d0 t xfrm_del_sa
+00000000000000d2 T d_prune_aliases
-00000000000000d4 T usb_sg_cancel
-00000000000000d5 t pci_pm_freeze_noirq
-00000000000000d5 t snd_timer_user_ccallback
+00000000000000d5 T usb_sg_cancel
+00000000000000d7 t pci_pm_freeze_noirq
-00000000000000d9 T d_prune_aliases
-00000000000000db t loop_queue_write_work
-00000000000000e9 t dentry_lru_isolate
+00000000000000ed t dentry_lru_isolate
+00000000000000f6 t max_sync_store
+00000000000000fa t slab_sysfs_init
+00000000000000fc T fat_attach
-00000000000000fc t slab_sysfs_init
-00000000000000fe T fat_attach
-00000000000000fe t max_sync_store
-00000000000000ff t slab_out_of_memory
+0000000000000100 t slab_out_of_memory
+0000000000000100 t tg3_nway_reset
-0000000000000102 t ctnetlink_dump_exp_ct
+0000000000000103 t calgary_fixup_tce_spaces
+0000000000000103 t ext4_inode_csum_set
-0000000000000103 t tg3_nway_reset
-0000000000000104 t calgary_fixup_tce_spaces
-0000000000000105 t __unmap_single.NNN
-0000000000000106 t acpi_ec_stop
-0000000000000107 t pktsched_init
+0000000000000107 t __unmap_single.NNN
+0000000000000108 t pktsched_init
-000000000000010b t ext4_inode_csum_set
+000000000000010f T drm_vblank_on
+0000000000000113 t ctnetlink_dump_exp_ct
-0000000000000115 t mddev_put
+0000000000000117 t nl80211_set_mac_acl
-0000000000000119 T drm_vblank_on
+000000000000011a t mddev_put
+000000000000011b T wait_for_completion_interruptible_timeout
-000000000000011c T wait_for_completion_interruptible_timeout
+0000000000000123 t acpi_ec_stop
+0000000000000125 t ext4_mb_simple_scan_group
-0000000000000125 t nfs_do_filldir
+0000000000000125 T remove_proc_subtree
-0000000000000127 t nl80211_set_mac_acl
+000000000000012b t azx_irq_pending_work
+000000000000012d t nfs_do_filldir
-000000000000012e T remove_proc_subtree
+0000000000000131 t store_uframe_periodic_max
-0000000000000133 t store_uframe_periodic_max
-0000000000000137 t ext4_mb_simple_scan_group
-0000000000000139 T jbd2_journal_put_journal_head
-000000000000013b t azx_irq_pending_work
+000000000000013b T jbd2_journal_put_journal_head
+000000000000013d t xfrm_add_policy
+0000000000000142 t serial8250_backup_timeout
-0000000000000145 t xfrm_add_policy
+000000000000014a T hid_dump_device
-000000000000014c t alloc_buddy_huge_page
+0000000000000151 t finish_urb
+0000000000000151 t submit_flushes
-0000000000000152 T hid_dump_device
-0000000000000152 t serial8250_backup_timeout
+0000000000000154 t alloc_buddy_huge_page
-0000000000000159 t submit_flushes
+000000000000015a T vfs_mknod
+000000000000015c T remove_proc_entry
-000000000000015e T remove_proc_entry
-0000000000000161 t finish_urb
+0000000000000162 t nv_remove
-0000000000000162 T vfs_mknod
+0000000000000165 T nf_ct_delete
-0000000000000166 t nv_nic_irq_rx
+0000000000000167 t nv_nic_irq_rx
-000000000000016a t nv_remove
-0000000000000174 T iommu_tbl_pool_init
-0000000000000175 t queue_process
+000000000000017c T iommu_tbl_pool_init
+0000000000000180 t queue_process
-0000000000000183 t ctnetlink_del_conntrack
-0000000000000185 T nf_ct_delete
-000000000000019f t ctnetlink_new_conntrack
-00000000000001aa t azx_attach_pcm_stream
-00000000000001aa t dump_header.NNN
+00000000000001ab t azx_attach_pcm_stream
+00000000000001ab t dump_header.NNN
+00000000000001ae t acpi_ec_add
-00000000000001b0 t acpi_ec_add
+00000000000001b2 t ctnetlink_del_conntrack
-00000000000001b8 t nv_close
+00000000000001c9 t ctnetlink_new_conntrack
-00000000000001d1 t ctnetlink_create_expect.NNN
+00000000000001dc t nv_close
+00000000000001e1 t ctnetlink_create_expect.NNN
-00000000000001e3 t ctnetlink_get_conntrack
-00000000000001f0 t ctnetlink_dump_table
+00000000000001f0 t hid_debug_rdesc_show
-00000000000001f8 t hid_debug_rdesc_show
-00000000000001fa T ata_task_ioctl
-00000000000001fa T dm_kcopyd_copy
+00000000000001fb t ctnetlink_dump_table
+00000000000001fc T dm_kcopyd_copy
+0000000000000202 T ata_task_ioctl
-0000000000000207 T usb_reset_device
-000000000000020c T inet_bind
+000000000000020f T usb_reset_device
+0000000000000212 t ctnetlink_get_conntrack
-000000000000021a t nl80211_send_mlme_event.NNN
+000000000000021c T inet_bind
+000000000000021c t nl80211_send_mlme_event.NNN
+0000000000000226 t nv_nic_irq_other
-000000000000022e T destroy_workqueue
-000000000000022e t nv_nic_irq_other
+0000000000000230 T destroy_workqueue
-0000000000000233 T netpoll_send_skb_on_dev
+0000000000000242 t snd_timer_user_read
-0000000000000245 t snd_timer_user_read
-000000000000024a t univ8250_setup_irq
+000000000000024b T netpoll_send_skb_on_dev
-000000000000024c t ctnetlink_dump_list
+000000000000024f t ctnetlink_dump_list
+0000000000000252 t univ8250_setup_irq
+0000000000000253 t unix_dgram_connect
-000000000000026f t ieee80211_tx_h_select_key
-0000000000000275 t remove_and_add_spares
+0000000000000277 t ieee80211_tx_h_select_key
+0000000000000277 t remove_and_add_spares
+0000000000000278 t slot_store
-0000000000000280 t slot_store
-0000000000000286 t d_walk
-0000000000000288 T sys_shmctl
-0000000000000288 T SyS_shmctl
-0000000000000291 t unix_dgram_connect
+00000000000002a3 t prepend_path
+00000000000002a4 t d_walk
+00000000000002a4 T sys_shmctl
+00000000000002a4 T SyS_shmctl
+00000000000002b6 t ext4_mb_find_by_goal
+00000000000002b6 T nf_conntrack_hash_check_insert
-00000000000002b7 t prepend_path
+00000000000002bf t nl80211_set_reg
-00000000000002c6 t ext4_mb_find_by_goal
+00000000000002c7 T __nf_conntrack_confirm
-00000000000002cf t nl80211_set_reg
-00000000000002f6 T nf_conntrack_hash_check_insert
-00000000000002ff T __nf_conntrack_confirm
-0000000000000308 T detect_calgary
+0000000000000309 T detect_calgary
-0000000000000326 t hiddev_read
-0000000000000328 t ext4_mb_generate_buddy
+0000000000000328 t hiddev_read
+0000000000000330 t ext4_mb_generate_buddy
-0000000000000345 t alps_process_touchpad_packet_v3_v5
-0000000000000346 t bsg_map_hdr.NNN
+0000000000000347 t bsg_map_hdr.NNN
+000000000000034c t nfs_end_delegation_return
+000000000000034d t alps_process_touchpad_packet_v3_v5
-000000000000035d t nfs_end_delegation_return
-0000000000000372 T oom_kill_process
+0000000000000373 T oom_kill_process
+0000000000000379 t acpi_ec_transaction
-0000000000000394 t acpi_ec_transaction
+00000000000003a3 t chv_read16
-00000000000003ab t chv_read16
-00000000000003d9 t hiddev_ioctl_usage.NNN
+00000000000003e9 t do_timerfd_settime
+00000000000003e9 t hiddev_ioctl_usage.NNN
-00000000000003eb t do_timerfd_settime
-0000000000000405 T ohci_hub_status_data
-0000000000000406 t chv_write16
-0000000000000406 t super_90_load
+0000000000000407 t super_90_load
-000000000000040b t loop_clr_fd
+000000000000040e t chv_write16
+0000000000000415 T ohci_hub_status_data
+000000000000041b t loop_clr_fd
+0000000000000425 t worker_thread
-0000000000000428 t worker_thread
+0000000000000444 T azx_init_chip
-000000000000044c T azx_init_chip
-0000000000000453 T ext4_discard_preallocations
-0000000000000475 T do_shmat
+000000000000047a T do_shmat
+000000000000047b T ext4_discard_preallocations
-0000000000000488 t ext4_direct_IO
+0000000000000498 t ext4_direct_IO
+00000000000004f9 t ext4_mb_normalize_request
-0000000000000509 t ext4_mb_normalize_request
-000000000000050b t __dev_queue_xmit
+0000000000000511 t __dev_queue_xmit
-00000000000005d6 T serial8250_do_startup
+00000000000005e6 T serial8250_do_startup
-000000000000061c t nv_update_linkspeed
+000000000000061b t nv_napi_poll
-0000000000000625 t nv_napi_poll
+000000000000062c t nv_update_linkspeed
-00000000000007e8 t nv_start_xmit_optimized
+00000000000007ea t nv_start_xmit_optimized
-0000000000000893 t futex_requeue
+0000000000000891 t futex_requeue
+0000000000000898 t nl80211_parse_sched_scan.NNN
-00000000000008a0 t nl80211_parse_sched_scan.NNN
-000000000000094c T sock_setsockopt
+0000000000000950 T sock_setsockopt
-00000000000009ee T do_futex
+00000000000009fe T do_futex
-0000000000000a66 t nv_self_test
+0000000000000aaa t nv_self_test
-0000000000000af3 T sata_pmp_error_handler
+0000000000000afb T sata_pmp_error_handler
-0000000000000b5d t ohci_urb_enqueue
+0000000000000b65 t ohci_urb_enqueue
-0000000000000b72 t display_crc_ctl_write
+0000000000000b79 t display_crc_ctl_write
-0000000000000ca7 t SYSC_semtimedop
+0000000000000c9b t SYSC_semtimedop
+0000000000000e31 T md_do_sync
-0000000000000e3f T md_do_sync
+0000000000000fd3 t ehci_urb_enqueue
-0000000000000ffb t ehci_urb_enqueue
-0000000000001027 t packet_sendmsg
+0000000000001029 t packet_sendmsg
-0000000000001e17 t do_blockdev_direct_IO
+0000000000001e07 t do_blockdev_direct_IO
-000000000000261e t nl80211_send_wiphy
+000000000000262e t nl80211_send_wiphy

2015-05-15 07:20:53

by Heiko Carstens

[permalink] [raw]
Subject: Re: [PATCH] force inlining of spinlock ops

On Wed, May 13, 2015 at 04:09:18PM +0200, Denys Vlasenko wrote:
> On 05/13/2015 12:43 PM, Ingo Molnar wrote:
> > We only know that the net effect is +70 bytes. Does that come out of:
> >
> > - large fluctuations such as -1000-1000+1000+1070, which happens to
> > net out into a small net number?
> >
> > - or does it come from much smaller fluctuations?
> >
> > So to make an informed decision we need to know those details.
>
> Fair enough. Let's investigate.
>
> I produced a list of functions with their sizes from each vmlinux,
> and diffed them:
>
> $ nm --size-sort vmlinux | sed 's/\.[0-9]*.*/.NNN/' >vmlinux.nm
> $ nm --size-sort vmlinuxO2.before | sed 's/\.[0-9]*.*/.NNN/' >vmlinuxO2.before.nm
> $ diff -u vmlinuxO2.before.nm vmlinux.nm | grep -v '^[ @]' >vmlinux.nm.dif

FWIW, scripts/bloat-o-meter is a nice tool to examine the size differences
of two vmlinux images.