2015-04-20 21:27:30

by Hagen Paul Pfeifer

[permalink] [raw]
Subject: [PATCH RFC] x86: enforce inlining for atomics

During some code analysis I realized that atomic_add, atomic_sub and
friends are not necessarily inlined AND that each function is defined
multiple times:

atomic_inc: 544 duplicates
atomic_dec: 215 duplicates
atomic_dec_and_test: 107 duplicates
atomic64_inc: 38 duplicates
[...]

Each definition is exactly equal, e.g.:

ffffffff813171b8 <atomic_add>:
55 push %rbp
48 89 e5 mov %rsp,%rbp
f0 01 3e lock add %edi,(%rsi)
5d pop %rbp
c3 retq

In turn each definition has one or more callsites (sure):

ffffffff81317c78: e8 3b f5 ff ff callq ffffffff813171b8 <atomic_add>
[...]
ffffffff8131a062: e8 51 d1 ff ff callq ffffffff813171b8 <atomic_add>
[...]
ffffffff8131a190: e8 23 d0 ff ff callq ffffffff813171b8 <atomic_add>
[...]

The other way around would be to remove the static linkage - but I prefer
an enforced inlining here.

Before:
text data bss dec hex filename
81467393 19874720 20168704 121510817 73e1ba1 vmlinux.orig

After:
text data bss dec hex filename
81461323 19874720 20168704 121504747 73e03eb vmlinux.inlined

Yes, the inlining here makes the kernel even smaller! ;)

Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Signed-off-by: Hagen Paul Pfeifer <[email protected]>
---
arch/x86/include/asm/atomic.h | 16 ++++++++--------
arch/x86/include/asm/atomic64_64.h | 8 ++++----
2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 5e5cd12..75a9ee8 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -46,7 +46,7 @@ static inline void atomic_set(atomic_t *v, int i)
*
* Atomically adds @i to @v.
*/
-static inline void atomic_add(int i, atomic_t *v)
+static __always_inline void atomic_add(int i, atomic_t *v)
{
asm volatile(LOCK_PREFIX "addl %1,%0"
: "+m" (v->counter)
@@ -60,7 +60,7 @@ static inline void atomic_add(int i, atomic_t *v)
*
* Atomically subtracts @i from @v.
*/
-static inline void atomic_sub(int i, atomic_t *v)
+static __always_inline void atomic_sub(int i, atomic_t *v)
{
asm volatile(LOCK_PREFIX "subl %1,%0"
: "+m" (v->counter)
@@ -76,7 +76,7 @@ static inline void atomic_sub(int i, atomic_t *v)
* true if the result is zero, or false for all
* other cases.
*/
-static inline int atomic_sub_and_test(int i, atomic_t *v)
+static __always_inline int atomic_sub_and_test(int i, atomic_t *v)
{
GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", "e");
}
@@ -87,7 +87,7 @@ static inline int atomic_sub_and_test(int i, atomic_t *v)
*
* Atomically increments @v by 1.
*/
-static inline void atomic_inc(atomic_t *v)
+static __always_inline void atomic_inc(atomic_t *v)
{
asm volatile(LOCK_PREFIX "incl %0"
: "+m" (v->counter));
@@ -99,7 +99,7 @@ static inline void atomic_inc(atomic_t *v)
*
* Atomically decrements @v by 1.
*/
-static inline void atomic_dec(atomic_t *v)
+static __always_inline void atomic_dec(atomic_t *v)
{
asm volatile(LOCK_PREFIX "decl %0"
: "+m" (v->counter));
@@ -113,7 +113,7 @@ static inline void atomic_dec(atomic_t *v)
* returns true if the result is 0, or false for all other
* cases.
*/
-static inline int atomic_dec_and_test(atomic_t *v)
+static __always_inline int atomic_dec_and_test(atomic_t *v)
{
GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e");
}
@@ -152,7 +152,7 @@ static inline int atomic_add_negative(int i, atomic_t *v)
*
* Atomically adds @i to @v and returns @i + @v
*/
-static inline int atomic_add_return(int i, atomic_t *v)
+static __always_inline int atomic_add_return(int i, atomic_t *v)
{
return i + xadd(&v->counter, i);
}
@@ -191,7 +191,7 @@ static inline int atomic_xchg(atomic_t *v, int new)
* Atomically adds @a to @v, so long as @v was not already @u.
* Returns the old value of @v.
*/
-static inline int __atomic_add_unless(atomic_t *v, int a, int u)
+static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
{
int c, old;
c = atomic_read(v);
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index f8d273e..b965f9e 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -40,7 +40,7 @@ static inline void atomic64_set(atomic64_t *v, long i)
*
* Atomically adds @i to @v.
*/
-static inline void atomic64_add(long i, atomic64_t *v)
+static __always_inline void atomic64_add(long i, atomic64_t *v)
{
asm volatile(LOCK_PREFIX "addq %1,%0"
: "=m" (v->counter)
@@ -81,7 +81,7 @@ static inline int atomic64_sub_and_test(long i, atomic64_t *v)
*
* Atomically increments @v by 1.
*/
-static inline void atomic64_inc(atomic64_t *v)
+static __always_inline void atomic64_inc(atomic64_t *v)
{
asm volatile(LOCK_PREFIX "incq %0"
: "=m" (v->counter)
@@ -94,7 +94,7 @@ static inline void atomic64_inc(atomic64_t *v)
*
* Atomically decrements @v by 1.
*/
-static inline void atomic64_dec(atomic64_t *v)
+static __always_inline void atomic64_dec(atomic64_t *v)
{
asm volatile(LOCK_PREFIX "decq %0"
: "=m" (v->counter)
@@ -148,7 +148,7 @@ static inline int atomic64_add_negative(long i, atomic64_t *v)
*
* Atomically adds @i to @v and returns @i + @v
*/
-static inline long atomic64_add_return(long i, atomic64_t *v)
+static __always_inline long atomic64_add_return(long i, atomic64_t *v)
{
return i + xadd(&v->counter, i);
}
--
2.1.4


2015-04-20 21:56:54

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH RFC] x86: enforce inlining for atomics

On Mon, Apr 20, 2015 at 11:27:11PM +0200, Hagen Paul Pfeifer wrote:
> During some code analysis I realized that atomic_add, atomic_sub and
> friends are not necessarily inlined AND that each function is defined
> multiple times:
>
> atomic_inc: 544 duplicates
> atomic_dec: 215 duplicates
> atomic_dec_and_test: 107 duplicates
> atomic64_inc: 38 duplicates
> [...]
>
> Each definition is exact equally, e.g.:
>
> ffffffff813171b8 <atomic_add>:
> 55 push %rbp
> 48 89 e5 mov %rsp,%rbp
> f0 01 3e lock add %edi,(%rsi)
> 5d pop %rbp
> c3 retq
>
> In turn each definition has one or more callsites (sure):
>
> ffffffff81317c78: e8 3b f5 ff ff callq ffffffff813171b8 <atomic_add>
> [...]
> ffffffff8131a062: e8 51 d1 ff ff callq ffffffff813171b8 <atomic_add>
> [...]
> ffffffff8131a190: e8 23 d0 ff ff callq ffffffff813171b8 <atomic_add>
> [...]

Hmm, that must be config-specific as doing

objdump -D vmlinux | grep -i "atomic_add"

here gives me only "drm_atomic_add_affected_connectors" matches.

It probably gets inlined here always...

Other than that, this patch should actually even show some speedup as
we're getting rid of the stack preparation and function call overhead.
Have you done any benchmarks with it?

Thanks.

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

2015-04-20 22:08:50

by Hagen Paul Pfeifer

[permalink] [raw]
Subject: Re: [PATCH RFC] x86: enforce inlining for atomics

On 20 April 2015 at 23:56, Borislav Petkov <[email protected]> wrote:

> Hmm, that must be config-specific as doing
>
> objdump -D vmlinux | grep -i "atomic_add"
>
> here gives me only "drm_atomic_add_affected_connectors" matches.
>
> It probably gets inlined here always...

Probably, the config is allyesconfig minus trace/kernel address
sanitizer and gcov related options.

> Other than that, this patch should actually even show some speedup as
> we're getting rid of the stack preparation and function call overhead.
> Have you done any benchmarks with it?

No, but I am sure that a benchmark should show at least theoretically
better perf numbers. But the numbers should be within the measurement
uncertainty range. The avoided cache line miss should be the greatest
perf effect.

Hagen

2015-04-20 23:46:35

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH RFC] x86: enforce inlining for atomics

On Mon, Apr 20, 2015 at 11:27:11PM +0200, Hagen Paul Pfeifer wrote:
> During some code analysis I realized that atomic_add, atomic_sub and
> friends are not necessarily inlined AND that each function is defined
> multiple times:
>
> atomic_inc: 544 duplicates
> atomic_dec: 215 duplicates
> atomic_dec_and_test: 107 duplicates
> atomic64_inc: 38 duplicates
> [...]
>
> Each definition is exact equally, e.g.:
>
> ffffffff813171b8 <atomic_add>:
> 55 push %rbp
> 48 89 e5 mov %rsp,%rbp
> f0 01 3e lock add %edi,(%rsi)
> 5d pop %rbp
> c3 retq
>

Urgh, that's a GCC fail, what version and compile flags?

2015-04-21 07:42:18

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH RFC] x86: enforce inlining for atomics


* Hagen Paul Pfeifer <[email protected]> wrote:

> On 20 April 2015 at 23:56, Borislav Petkov <[email protected]> wrote:
>
> > Hmm, that must be config-specific as doing
> >
> > objdump -D vmlinux | grep -i "atomic_add"
> >
> > here gives me only "drm_atomic_add_affected_connectors" matches.
> >
> > It probably gets inlined here always...
>
> Probably, the config is allyesconfig minus trace/kernel adress
> sanitizer and gcov related options.

So the thing is that allyesconfig turns on -Os:

CONFIG_CC_OPTIMIZE_FOR_SIZE=y

which is known to make bad decisions in other areas as well ...

If -Os does such bad inlining decisions (and the inlining examples you
cited are horrible!) then I guess a lot of the other 'inline'
functions are handled by it badly as well.

I'm not sure we should start fighting the compiler: if a compiler does
not take 'inline' seriously then the solution is to use another
compiler, or at least to use different compiler flags.

If inlining decisions are bad even with saner compiler options then we
can use __always_inline, and we actually do that for locking
primitives and some other low level primitives: which are typically
larger than atomics, so even reasonable compilers might uninline them.

But I'm not against your patch either. Linus, what's your preference?

Thanks,

Ingo

2015-04-21 10:56:57

by Hagen Paul Pfeifer

[permalink] [raw]
Subject: Re: [PATCH RFC] x86: enforce inlining for atomics

On 21 April 2015 at 09:42, Ingo Molnar <[email protected]> wrote:

Hey Ingo, Peter, Boris,

the environment is Debian with gcc version 4.9.2. I tried to reproduce
the results under Ubuntu 14.04 but their version (4.8.2!) seems fine:
at max one duplicate for each atomic_* function.

> So the thing is that allyesconfig turns on -Os:
>
> CONFIG_CC_OPTIMIZE_FOR_SIZE=y
>
> which is known to make bad decisions in other areas as well ...

I can recompile with "CONFIG_CC_OPTIMIZE_FOR_SIZE=n" and check the
results again!?

> If -Os does such bad inlining decisions (and the inlining examples you
> cited are horrible!) then I guess a lot of the other 'inline'
> functions are handled by it badly as well.

Assumption is correct - I see duplicates all over the place.

> I'm not sure we should start fighting the compiler: if a compiler does
> not take 'inline' seriously then the solution is to use another
> compiler, or at least to use different compiler flags.
>
> If inlining decisions are bad even with saner compiler options then we
> can use __always_inline, and we actually do that for locking
> primitives and some other low level primitives: which are typically
> larger than atomics, so even reasonable compilers might uninline them.

Probably we should check this on a wider gcc front! If the behavior is an
exception, or is this common behavior?

Hagen

2015-04-21 22:57:30

by Hagen Paul Pfeifer

[permalink] [raw]
Subject: Re: [PATCH RFC] x86: enforce inlining for atomics

* Ingo Molnar | 2015-04-21 09:42:12 [+0200]:

Hey Ingo,

>So the thing is that allyesconfig turns on -Os:
>
> CONFIG_CC_OPTIMIZE_FOR_SIZE=y

CONFIG_CC_OPTIMIZE_FOR_SIZE seems to have no effect. The only option which
makes a difference is CONFIG_OPTIMIZE_INLINING! But this is not a big surprise:
*disabling* CONFIG_OPTIMIZE_INLINING substitutes _all_ inlines with
__attribute__((always_inline)).

"If unsure, say N." -> results in configurations with always_inline.


So I tested again, one time with unset CONFIG_OPTIMIZE_INLINING the result
seems fine:

show_temp: 59 duplicates
char2uni: 52 duplicates
uni2char: 52 duplicates
sd_probe: 49 duplicates
sd_driver_init: 48 duplicates
sd_driver_exit: 48 duplicates
usb_serial_module_exit: 47 duplicates
[...]


We see ordinary "template" reuse of common driver code without renaming the
copied static's. But compiled with CONFIG_OPTIMIZE_INLINING=y the inlining is
not respected by gcc:

atomic_inc: 544 duplicates
rcu_read_unlock: 453 duplicates
rcu_read_lock: 383 duplicates
get_dma_ops: 271 duplicates
arch_local_irq_restore: 258 duplicates
atomic_dec: 215 duplicates
kzalloc: 185 duplicates
test_and_set_bit: 156 duplicates
cpumask_check: 148 duplicates
cpumask_next: 146 duplicates
list_del: 131 duplicates
kref_get: 126 duplicates
test_and_clear_bit: 122 duplicates
brelse: 122 duplicates
schedule_work: 122 duplicates
netif_tx_stop_queue: 115 duplicates
atomic_dec_and_test: 107 duplicates
dma_mapping_error: 105 duplicates
list_del_init: 101 duplicates
netif_stop_queue: 100 duplicates
arch_local_save_flags: 98 duplicates
tasklet_schedule: 76 duplicates
clk_prepare_enable: 71 duplicates
init_completion: 69 duplicates
pskb_may_pull: 67 duplicates
[...]

Again, the used gcc version is "gcc (Debian 4.9.2-10) 4.9.2". So it is not
outdated nor a legacy one. The inline heuristic seems really broken for some
parts. Is it possible that gcc is bedeviled because of inline assembler
parts which confuse the internal scoring system?

I suggest the following: I prepare a patch series for the most obvious
candidates and substituting inline with __always_inline (probably ~50
functions). Each subsystem maintainer can check and ACK the patch. This has the
benefit that for all other locations gcc is still responsible for inlining
decision. Enforcing inlining via __always_inline for all inline marked function
is probably too hard!? In 2015 gcc is still not able to inline single line
statements - that's strange.

Linus, ack?

Hagen

2015-04-22 00:57:06

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH RFC] x86: enforce inlining for atomics

On Tue, Apr 21, 2015 at 3:57 PM, Hagen Paul Pfeifer <[email protected]> wrote:
>
> Is it possible that gcc is bedeviled because of inline assembler
> parts which brings confuse the internal scoring system?

yes, I have this memory of having seen that before - the size
heuristics for gcc getting confused by inlining. I'm wondering if
maybe it uses the size of the string to approximate the size, or just
uses some random variable.

It might be a good idea to mark things that are basically just
wrappers around a single (or a couple of) asm instruction to be
always_inline.

Linus

2015-04-22 05:24:46

by Markus Trippelsdorf

[permalink] [raw]
Subject: Re: [PATCH RFC] x86: enforce inlining for atomics

On 2015.04.22 at 00:57 +0200, Hagen Paul Pfeifer wrote:
> We see ordinary "template" reuse of common driver code without renaming the
> copied static's. But compiled with CONFIG_OPTIMIZE_INLINING=y the inlining is
> not respected by gcc:
>
> atomic_inc: 544 duplicates
> rcu_read_unlock: 453 duplicates
> rcu_read_lock: 383 duplicates
> get_dma_ops: 271 duplicates
> arch_local_irq_restore: 258 duplicates
> atomic_dec: 215 duplicates
> kzalloc: 185 duplicates
> test_and_set_bit: 156 duplicates
> cpumask_check: 148 duplicates
> cpumask_next: 146 duplicates
> list_del: 131 duplicates
> kref_get: 126 duplicates
> test_and_clear_bit: 122 duplicates
> brelse: 122 duplicates
> schedule_work: 122 duplicates
> netif_tx_stop_queue: 115 duplicates
> atomic_dec_and_test: 107 duplicates
> dma_mapping_error: 105 duplicates
> list_del_init: 101 duplicates
> netif_stop_queue: 100 duplicates
> arch_local_save_flags: 98 duplicates
> tasklet_schedule: 76 duplicates
> clk_prepare_enable: 71 duplicates
> init_completion: 69 duplicates
> pskb_may_pull: 67 duplicates
> [...]
>
> Again, the used gcc version is "gcc (Debian 4.9.2-10) 4.9.2". So it is not
> outdated nor a legacy one. The inline heuristic seems really broken for some
> parts. Is it possible that gcc is bedeviled because of inline assembler
> parts which brings confuse the internal scoring system?

I cannot reproduce this issue with my config with 4.8, 4.9 or 5. Could
you please come up with a small testcase and open a gcc bug (with full
gcc command line)?

--
Markus

2015-04-22 05:59:03

by Hagen Paul Pfeifer

[permalink] [raw]
Subject: Re: [PATCH RFC] x86: enforce inlining for atomics

* Markus Trippelsdorf | 2015-04-22 07:24:41 [+0200]:

Hey Markus

>I cannot reproduce this issue with my config with 4.8, 4.9 or 5. Could
>you please come up with a small testcase and open a gcc bug (with full
>gcc command line)?

the attached config file should do the trick. Can you try it?

Hagen


Attachments:
(No filename) (301.00 B)
config (185.87 kB)
Download all attachments

2015-04-22 09:20:46

by Markus Trippelsdorf

[permalink] [raw]
Subject: Re: [PATCH RFC] x86: enforce inlining for atomics

On 2015.04.22 at 07:58 +0200, Hagen Paul Pfeifer wrote:
> * Markus Trippelsdorf | 2015-04-22 07:24:41 [+0200]:
> >I cannot reproduce this issue with my config with 4.8, 4.9 or 5. Could
> >you please come up with a small testcase and open a gcc bug (with full
> >gcc command line)?
>
> the attached config file should do the trick. Can you try it?

It is definitely a -O2 vs. -Os issue. With your config:

-Os:
% nm vmlinux | grep " atomic_" | wc -l
736

-O2 is fine:
% nm vmlinux | grep " atomic_"
ffffffff852d8300 r atomic_counters_ops
ffffffff83d66bf0 t atomic_counters_read
ffffffff8119cd30 T atomic_dec_and_mutex_lock
ffffffff81172e10 T atomic_notifier_call_chain
ffffffff81172b90 T atomic_notifier_chain_register
ffffffff81172d70 T atomic_notifier_chain_unregister
ffffffff844b0ef0 t atomic_read_file
ffffffff85a0c260 r atomic_ro_fops
ffffffff87616000 d atomic_rw
ffffffff852d8400 r atomic_stats_ops
ffffffff83d66c50 t atomic_stats_read

The easiest fix would be to force always-inline in
include/linux/compiler-gcc.h when CONFIG_CC_OPTIMIZE_FOR_SIZE is set.

And BTW only x86 and tile define ARCH_SUPPORTS_OPTIMIZED_INLINING, so
the rest of the architectures are not affected anyway.

--
Markus

2015-04-22 09:29:00

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH RFC] x86: enforce inlining for atomics


* Markus Trippelsdorf <[email protected]> wrote:

> On 2015.04.22 at 07:58 +0200, Hagen Paul Pfeifer wrote:
> > * Markus Trippelsdorf | 2015-04-22 07:24:41 [+0200]:
> > >I cannot reproduce this issue with my config with 4.8, 4.9 or 5. Could
> > >you please come up with a small testcase and open a gcc bug (with full
> > >gcc command line)?
> >
> > the attached config file should do the trick. Can you try it?
>
> It is definitely a -O2 vs. -Os issue. With your config:
>
> -Os:
> % nm vmlinux | grep " atomic_" | wc -l
> 736
>
> -O2 is fine:
> % nm vmlinux | grep " atomic_"
> ffffffff852d8300 r atomic_counters_ops
> ffffffff83d66bf0 t atomic_counters_read
> ffffffff8119cd30 T atomic_dec_and_mutex_lock
> ffffffff81172e10 T atomic_notifier_call_chain
> ffffffff81172b90 T atomic_notifier_chain_register
> ffffffff81172d70 T atomic_notifier_chain_unregister
> ffffffff844b0ef0 t atomic_read_file
> ffffffff85a0c260 r atomic_ro_fops
> ffffffff87616000 d atomic_rw
> ffffffff852d8400 r atomic_stats_ops
> ffffffff83d66c50 t atomic_stats_read
>
> The easiest fix would be to force always-inline in
> include/linux/compiler-gcc.h when CONFIG_CC_OPTIMIZE_FOR_SIZE is set.

That pretty much defeats the purpose of any inlining size calculations
on -Os - and would make it impossible for better or bugfixed compilers
to improve the situation.

So I think the original patch makes sense (and I already applied it),
we want known-simple and performance critical methods (such as atomic
ops) always inlined.

Thanks,

Ingo

2015-04-22 09:31:32

by Hagen Paul Pfeifer

[permalink] [raw]
Subject: Re: [PATCH RFC] x86: enforce inlining for atomics

On 22 April 2015 at 11:28, Ingo Molnar <[email protected]> wrote:

> So I think the original patch makes sense (and I already applied it),
> we want known-simple and performance critical methods (such as atomic
> ops) always inlined.

Right. As I wrote I will post a follow up patch for the remaining
cases where inline is a must. To guarantee that performance critical
functions are always inlined - no matter what configuration is used.

Thank you!

Hagen

Subject: [tip:x86/asm] x86/asm: Always inline atomics

Commit-ID: 3462bd2adeadc49d9e126bca3b5536a3437a902d
Gitweb: http://git.kernel.org/tip/3462bd2adeadc49d9e126bca3b5536a3437a902d
Author: Hagen Paul Pfeifer <[email protected]>
AuthorDate: Mon, 20 Apr 2015 23:27:11 +0200
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 22 Apr 2015 08:14:41 +0200

x86/asm: Always inline atomics

During some code analysis I realized that atomic_add(), atomic_sub()
and friends are not necessarily inlined AND that each function
is defined multiple times:

atomic_inc: 544 duplicates
atomic_dec: 215 duplicates
atomic_dec_and_test: 107 duplicates
atomic64_inc: 38 duplicates
[...]

Each definition is exact equally, e.g.:

ffffffff813171b8 <atomic_add>:
55 push %rbp
48 89 e5 mov %rsp,%rbp
f0 01 3e lock add %edi,(%rsi)
5d pop %rbp
c3 retq

In turn each definition has one or more callsites (sure):

ffffffff81317c78: e8 3b f5 ff ff callq ffffffff813171b8 <atomic_add> [...]
ffffffff8131a062: e8 51 d1 ff ff callq ffffffff813171b8 <atomic_add> [...]
ffffffff8131a190: e8 23 d0 ff ff callq ffffffff813171b8 <atomic_add> [...]

The other way around would be to remove the static linkage - but
I prefer an enforced inlining here.

Before:
text data bss dec hex filename
81467393 19874720 20168704 121510817 73e1ba1 vmlinux.orig

After:
text data bss dec hex filename
81461323 19874720 20168704 121504747 73e03eb vmlinux.inlined

Yes, the inlining here makes the kernel even smaller! ;)

Linus further observed:

"I have this memory of having seen that before - the size
heuristics for gcc getting confused by inlining.
[...]

It might be a good idea to mark things that are basically just
wrappers around a single (or a couple of) asm instruction to be
always_inline."

Signed-off-by: Hagen Paul Pfeifer <[email protected]>
Acked-by: Linus Torvalds <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/include/asm/atomic.h | 16 ++++++++--------
arch/x86/include/asm/atomic64_64.h | 8 ++++----
2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 5e5cd12..75a9ee8 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -46,7 +46,7 @@ static inline void atomic_set(atomic_t *v, int i)
*
* Atomically adds @i to @v.
*/
-static inline void atomic_add(int i, atomic_t *v)
+static __always_inline void atomic_add(int i, atomic_t *v)
{
asm volatile(LOCK_PREFIX "addl %1,%0"
: "+m" (v->counter)
@@ -60,7 +60,7 @@ static inline void atomic_add(int i, atomic_t *v)
*
* Atomically subtracts @i from @v.
*/
-static inline void atomic_sub(int i, atomic_t *v)
+static __always_inline void atomic_sub(int i, atomic_t *v)
{
asm volatile(LOCK_PREFIX "subl %1,%0"
: "+m" (v->counter)
@@ -76,7 +76,7 @@ static inline void atomic_sub(int i, atomic_t *v)
* true if the result is zero, or false for all
* other cases.
*/
-static inline int atomic_sub_and_test(int i, atomic_t *v)
+static __always_inline int atomic_sub_and_test(int i, atomic_t *v)
{
GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", "e");
}
@@ -87,7 +87,7 @@ static inline int atomic_sub_and_test(int i, atomic_t *v)
*
* Atomically increments @v by 1.
*/
-static inline void atomic_inc(atomic_t *v)
+static __always_inline void atomic_inc(atomic_t *v)
{
asm volatile(LOCK_PREFIX "incl %0"
: "+m" (v->counter));
@@ -99,7 +99,7 @@ static inline void atomic_inc(atomic_t *v)
*
* Atomically decrements @v by 1.
*/
-static inline void atomic_dec(atomic_t *v)
+static __always_inline void atomic_dec(atomic_t *v)
{
asm volatile(LOCK_PREFIX "decl %0"
: "+m" (v->counter));
@@ -113,7 +113,7 @@ static inline void atomic_dec(atomic_t *v)
* returns true if the result is 0, or false for all other
* cases.
*/
-static inline int atomic_dec_and_test(atomic_t *v)
+static __always_inline int atomic_dec_and_test(atomic_t *v)
{
GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e");
}
@@ -152,7 +152,7 @@ static inline int atomic_add_negative(int i, atomic_t *v)
*
* Atomically adds @i to @v and returns @i + @v
*/
-static inline int atomic_add_return(int i, atomic_t *v)
+static __always_inline int atomic_add_return(int i, atomic_t *v)
{
return i + xadd(&v->counter, i);
}
@@ -191,7 +191,7 @@ static inline int atomic_xchg(atomic_t *v, int new)
* Atomically adds @a to @v, so long as @v was not already @u.
* Returns the old value of @v.
*/
-static inline int __atomic_add_unless(atomic_t *v, int a, int u)
+static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
{
int c, old;
c = atomic_read(v);
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index f8d273e..b965f9e 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -40,7 +40,7 @@ static inline void atomic64_set(atomic64_t *v, long i)
*
* Atomically adds @i to @v.
*/
-static inline void atomic64_add(long i, atomic64_t *v)
+static __always_inline void atomic64_add(long i, atomic64_t *v)
{
asm volatile(LOCK_PREFIX "addq %1,%0"
: "=m" (v->counter)
@@ -81,7 +81,7 @@ static inline int atomic64_sub_and_test(long i, atomic64_t *v)
*
* Atomically increments @v by 1.
*/
-static inline void atomic64_inc(atomic64_t *v)
+static __always_inline void atomic64_inc(atomic64_t *v)
{
asm volatile(LOCK_PREFIX "incq %0"
: "=m" (v->counter)
@@ -94,7 +94,7 @@ static inline void atomic64_inc(atomic64_t *v)
*
* Atomically decrements @v by 1.
*/
-static inline void atomic64_dec(atomic64_t *v)
+static __always_inline void atomic64_dec(atomic64_t *v)
{
asm volatile(LOCK_PREFIX "decq %0"
: "=m" (v->counter)
@@ -148,7 +148,7 @@ static inline int atomic64_add_negative(long i, atomic64_t *v)
*
* Atomically adds @i to @v and returns @i + @v
*/
-static inline long atomic64_add_return(long i, atomic64_t *v)
+static __always_inline long atomic64_add_return(long i, atomic64_t *v)
{
return i + xadd(&v->counter, i);
}

Subject: Re: [PATCH RFC] x86: enforce inlining for atomics

On Wed, 22 Apr 2015, Hagen Paul Pfeifer wrote:
> On 22 April 2015 at 11:28, Ingo Molnar <[email protected]> wrote:
> > So I think the original patch makes sense (and I already applied it),
> > we want known-simple and performance critical methods (such as atomic
> > ops) always inlined.
>
> Right. As I wrote I will post a follow up patch for the remaining
> cases where inline is a must. To guarantee that performance critical
> functions are always inlined - no matter what configuration is used.

It would be nice to see those patches in -stable eventually, please...

--
"One disk to rule them all, One disk to find them. One disk to bring
them all and in the darkness grind them. In the Land of Redmond
where the shadows lie." -- The Silicon Valley Tarot
Henrique Holschuh

2015-07-13 18:25:25

by Denys Vlasenko

[permalink] [raw]
Subject: Re: [PATCH RFC] x86: enforce inlining for atomics

On Wed, Apr 22, 2015 at 11:20 AM, Markus Trippelsdorf
<[email protected]> wrote:
> On 2015.04.22 at 07:58 +0200, Hagen Paul Pfeifer wrote:
>> * Markus Trippelsdorf | 2015-04-22 07:24:41 [+0200]:
>> >I cannot reproduce this issue with my config with 4.8, 4.9 or 5. Could
>> >you please come up with a small testcase and open a gcc bug (with full
>> >gcc command line)?
>>
>> the attached config file should do the trick. Can you try it?
>
> It is definitely a -O2 vs. -Os issue. With your config:


It's true that it happens more often with -Os, but -O2 is not immune either.

See comments in the gcc bug:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122

2015-07-13 18:27:32

by Denys Vlasenko

[permalink] [raw]
Subject: Re: [PATCH RFC] x86: enforce inlining for atomics

On Wed, Apr 22, 2015 at 11:28 AM, Ingo Molnar <[email protected]> wrote:
> So I think the original patch makes sense (and I already applied it),
> we want known-simple and performance critical methods (such as atomic
> ops) always inlined.

I will send more such force-inlining patches your way then.

2015-07-13 19:29:55

by Hagen Paul Pfeifer

[permalink] [raw]
Subject: Re: [PATCH RFC] x86: enforce inlining for atomics

> On July 13, 2015 at 8:27 PM Denys Vlasenko <[email protected]>
> wrote:

> > So I think the original patch makes sense (and I already applied it),
> > we want known-simple and performance critical methods (such as atomic
> > ops) always inlined.
>
> I will send more such force-inlining patches your way then.

Now? We discussed this a month back or so? :-)

Anyway, if so I prefer to group the patchset in at least two patches to
simplify review:

- performance criticial group
- inline assembler group

The former group contains all inline-critical functions, and the latter
identifies functions where gcc has problems because its inline heuristic
for asm code is a toy.

Hagen

2015-07-13 21:06:24

by Denys Vlasenko

[permalink] [raw]
Subject: Re: [PATCH RFC] x86: enforce inlining for atomics

On Mon, Jul 13, 2015 at 9:20 PM, Hagen Paul Pfeifer <[email protected]> wrote:
>> On July 13, 2015 at 8:27 PM Denys Vlasenko <[email protected]>
>> wrote:
>
>> > So I think the original patch makes sense (and I already applied it),
>> > we want known-simple and performance critical methods (such as atomic
>> > ops) always inlined.
>>
>> I will send more such force-inlining patches your way then.
>
> Now? We discuss this a month back or so? :-)

Not *your* way.
I meant that I'll send patches to Ingo - I see that he agrees that
this worth fixing instead of waiting for a better compiler.

(Ideally, the compiler shouldn't do this, but gcc BZ comments
are not encouraging).

2015-07-13 21:12:33

by Hagen Paul Pfeifer

[permalink] [raw]
Subject: Re: [PATCH RFC] x86: enforce inlining for atomics

> On July 13, 2015 at 11:06 PM Denys Vlasenko <[email protected]>
> wrote:
>
> Not *your* way.
> I meant that I'll send patches to Ingo - I see that he agrees that
> this worth fixing instead of waiting for a better compiler.

Yeah, do it. Sorry it was not my intention to prevent you to fix this -
not at all. Go on, *every* patch will make things better.

> (Ideally, the compiler shouldn't do this, but gcc BZ comments
> are not encouraging).

Right, the inline heuristic - as Linus already noted - is quite bad for
some cases (especially inline asm).

Hagen