2015-08-04 14:15:22

by Denys Vlasenko

Subject: [PATCH 1/3] linux/bitmap: Force inlining of bitmap weight functions

With this config:
http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os

gcc-4.7.2 generates many copies of these tiny functions:

bitmap_weight (55 copies):
55 push %rbp
48 89 e5 mov %rsp,%rbp
e8 3f 3a 8b 00 callq __bitmap_weight
5d pop %rbp
c3 retq

hweight_long (23 copies):
55 push %rbp
e8 b5 65 8e 00 callq __sw_hweight64
48 89 e5 mov %rsp,%rbp
5d pop %rbp
c3 retq

See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122

This patch fixes this via s/inline/__always_inline/

While at it, replaced two "__inline__" with the usual "inline"
(the rest of the source file uses the latter).

text data bss dec hex filename
86971357 17195880 36659200 140826437 864d745 vmlinux.before
86971120 17195912 36659200 140826232 864d678 vmlinux
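
For readers not familiar with the kernel's inline macros: with
CONFIG_OPTIMIZE_INLINING=y, "inline" is only a hint which gcc's -Os cost
model is free to ignore, while __always_inline expands to roughly
inline __attribute__((__always_inline__)) and takes the decision away from
gcc. A minimal standalone sketch of the difference (file, macro and function
names below are invented for illustration; this is not kernel code):

/*
 * always_inline_sketch.c - standalone sketch, not kernel code.
 * Build with e.g.:  gcc -Os -c always_inline_sketch.c
 * and compare the two wrappers in "objdump -d always_inline_sketch.o".
 */
#include <stdio.h>

/* Roughly what the kernel's __always_inline boils down to. */
#define sketch_always_inline inline __attribute__((__always_inline__))

/* Only a hint: gcc 4.7 at -Os may keep this as an out-of-line copy plus a call. */
static inline unsigned long weight_hint(unsigned long w)
{
	return __builtin_popcountl(w);
}

/* Forced: gcc folds this into every caller, so no call/prologue overhead remains. */
static sketch_always_inline unsigned long weight_forced(unsigned long w)
{
	return __builtin_popcountl(w);
}

int main(void)
{
	printf("%lu %lu\n", weight_hint(0xffUL), weight_forced(0xffUL));
	return 0;
}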

Signed-off-by: Denys Vlasenko <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Thomas Graf <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: David Rientjes <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: [email protected]
---
include/linux/bitmap.h | 2 +-
include/linux/bitops.h | 6 +++---
2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index ea17cca..9653fdb 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -295,7 +295,7 @@ static inline int bitmap_full(const unsigned long *src, unsigned int nbits)
return find_first_zero_bit(src, nbits) == nbits;
}

-static inline int bitmap_weight(const unsigned long *src, unsigned int nbits)
+static __always_inline int bitmap_weight(const unsigned long *src, unsigned int nbits)
{
if (small_const_nbits(nbits))
return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 297f5bd..e635533 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -57,7 +57,7 @@ extern unsigned long __sw_hweight64(__u64 w);
(bit) < (size); \
(bit) = find_next_zero_bit((addr), (size), (bit) + 1))

-static __inline__ int get_bitmask_order(unsigned int count)
+static inline int get_bitmask_order(unsigned int count)
{
int order;

@@ -65,7 +65,7 @@ static __inline__ int get_bitmask_order(unsigned int count)
return order; /* We could be slightly more clever with -1 here... */
}

-static __inline__ int get_count_order(unsigned int count)
+static inline int get_count_order(unsigned int count)
{
int order;

@@ -75,7 +75,7 @@ static __inline__ int get_count_order(unsigned int count)
return order;
}

-static inline unsigned long hweight_long(unsigned long w)
+static __always_inline unsigned long hweight_long(unsigned long w)
{
return sizeof(w) == 4 ? hweight32(w) : hweight64(w);
}
--
1.8.1.4


2015-08-04 14:15:27

by Denys Vlasenko

Subject: [PATCH 2/3] x86/hweight: Force inlining of __arch_hweight{32,64}()

With this config:
http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os

gcc-4.7.2 generates many copies of these tiny functions:

__arch_hweight32 (35 copies):
55 push %rbp
e8 66 9b 4a 00 callq __sw_hweight32
48 89 e5 mov %rsp,%rbp
5d pop %rbp
c3 retq

__arch_hweight64 (8 copies):
55 push %rbp
e8 5e c2 8a 00 callq __sw_hweight64
48 89 e5 mov %rsp,%rbp
5d pop %rbp
c3 retq

See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122

This patch fixes this via s/inline/__always_inline/

To avoid touching the 32-bit case, where such a change was not tested to be
a win, reformat __arch_hweight64() to have completely disjoint 64-bit and
32-bit implementations. IOW: the #ifdef / #else / #endif blocks now contain
complete function definitions for 32 bits and 64 bits, instead of having
#ifdef / #else / #endif inside a single function body. Only the 64-bit
__arch_hweight64() is __always_inline'd.

text data bss dec hex filename
86971120 17195912 36659200 140826232 864d678 vmlinux.before
86970954 17195912 36659200 140826066 864d5d2 vmlinux
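
For clarity, this is how __arch_hweight64() reads after the patch
(reconstructed here by applying the hunks from the diff below); the 32-bit
and 64-bit variants no longer share a function body:

#ifdef CONFIG_X86_32
static inline unsigned long __arch_hweight64(__u64 w)
{
	return __arch_hweight32((u32)w) +
	       __arch_hweight32((u32)(w >> 32));
}
#else
static __always_inline unsigned long __arch_hweight64(__u64 w)
{
	unsigned long res = 0;

	asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT)
		     : "="REG_OUT (res)
		     : REG_IN (w));

	return res;
}
#endif /* CONFIG_X86_32 */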

Signed-off-by: Denys Vlasenko <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Thomas Graf <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: David Rientjes <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: [email protected]
---
arch/x86/include/asm/arch_hweight.h | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
index 9686c3d..259a7c1 100644
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -21,7 +21,7 @@
* ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective
* compiler switches.
*/
-static inline unsigned int __arch_hweight32(unsigned int w)
+static __always_inline unsigned int __arch_hweight32(unsigned int w)
{
unsigned int res = 0;

@@ -42,20 +42,23 @@ static inline unsigned int __arch_hweight8(unsigned int w)
return __arch_hweight32(w & 0xff);
}

+#ifdef CONFIG_X86_32
static inline unsigned long __arch_hweight64(__u64 w)
{
- unsigned long res = 0;
-
-#ifdef CONFIG_X86_32
return __arch_hweight32((u32)w) +
__arch_hweight32((u32)(w >> 32));
+}
#else
+static __always_inline unsigned long __arch_hweight64(__u64 w)
+{
+ unsigned long res = 0;
+
asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT)
: "="REG_OUT (res)
: REG_IN (w));
-#endif /* CONFIG_X86_32 */

return res;
}
+#endif /* CONFIG_X86_32 */

#endif
--
1.8.1.4

2015-08-04 14:15:29

by Denys Vlasenko

Subject: [PATCH 3/3] jiffies: Force inlining of {m,u}secs_to_jiffies()

With this config:
http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os

gcc-4.7.2 generates many copies of these tiny functions:

msecs_to_jiffies (45 copies):
55 push %rbp
48 89 e5 mov %rsp,%rbp
e8 59 ec 03 00 callq __msecs_to_jiffies
5d pop %rbp
c3 retq

usecs_to_jiffies (10 copies):
55 push %rbp
48 89 e5 mov %rsp,%rbp
e8 5d 54 5e ff callq __usecs_to_jiffies
5d pop %rbp
c3 retq

See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122

This patch fixes this via s/inline/__always_inline/

text data bss dec hex filename
86970954 17195912 36659200 140826066 864d5d2 vmlinux.before
86966150 17195912 36659200 140821262 864c30e vmlinux
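
The reason inlining matters here is the __builtin_constant_p() check visible
in the diff below: once the wrapper is folded into the caller, a constant
argument is converted entirely at compile time, and only variable arguments
reach the out-of-line helpers. A standalone sketch of that pattern (the names
and the HZ=1000 conversion below are invented for illustration; they are not
the kernel's definitions):

/* jiffies_sketch.c - standalone sketch, not kernel code.  Build: gcc -Os jiffies_sketch.c */
#include <stdio.h>

#define SKETCH_HZ 1000UL	/* pretend HZ=1000, so 1 ms maps to 1 jiffy */

/* Stands in for the kernel's out-of-line __msecs_to_jiffies() helper. */
unsigned long slow_msecs_to_jiffies(unsigned int m)
{
	return (m + (1000UL / SKETCH_HZ) - 1) / (1000UL / SKETCH_HZ);
}

static inline __attribute__((__always_inline__))
unsigned long sketch_msecs_to_jiffies(unsigned int m)
{
	if (__builtin_constant_p(m))	/* true after inlining when m is a compile-time constant */
		return (m + (1000UL / SKETCH_HZ) - 1) / (1000UL / SKETCH_HZ);
	return slow_msecs_to_jiffies(m);	/* runtime path for variable arguments */
}

int main(void)
{
	volatile unsigned int runtime_val = 250;

	printf("%lu\n", sketch_msecs_to_jiffies(100));		/* folds to a constant */
	printf("%lu\n", sketch_msecs_to_jiffies(runtime_val));	/* calls the helper at run time */
	return 0;
}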

Signed-off-by: Denys Vlasenko <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Thomas Graf <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: David Rientjes <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: [email protected]
---
include/linux/jiffies.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 535fd3b..1ba48a1 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -351,7 +351,7 @@ static inline unsigned long _msecs_to_jiffies(const unsigned int m)
* directly here and from __msecs_to_jiffies() in the case where
* constant folding is not possible.
*/
-static inline unsigned long msecs_to_jiffies(const unsigned int m)
+static __always_inline unsigned long msecs_to_jiffies(const unsigned int m)
{
if (__builtin_constant_p(m)) {
if ((int)m < 0)
@@ -405,7 +405,7 @@ static inline unsigned long _usecs_to_jiffies(const unsigned int u)
* directly here and from __msecs_to_jiffies() in the case where
* constant folding is not possible.
*/
-static inline unsigned long usecs_to_jiffies(const unsigned int u)
+static __always_inline unsigned long usecs_to_jiffies(const unsigned int u)
{
if (__builtin_constant_p(u)) {
if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
--
1.8.1.4

Subject: [tip:core/types] linux/bitmap: Force inlining of bitmap weight functions

Commit-ID: 1a1d48a4a8fde49aedc045d894efe67173d59fe0
Gitweb: http://git.kernel.org/tip/1a1d48a4a8fde49aedc045d894efe67173d59fe0
Author: Denys Vlasenko <[email protected]>
AuthorDate: Tue, 4 Aug 2015 16:15:14 +0200
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 5 Aug 2015 09:38:08 +0200

linux/bitmap: Force inlining of bitmap weight functions

With this config:

http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os

gcc-4.7.2 generates many copies of these tiny functions:

bitmap_weight (55 copies):
55 push %rbp
48 89 e5 mov %rsp,%rbp
e8 3f 3a 8b 00 callq __bitmap_weight
5d pop %rbp
c3 retq

hweight_long (23 copies):
55 push %rbp
e8 b5 65 8e 00 callq __sw_hweight64
48 89 e5 mov %rsp,%rbp
5d pop %rbp
c3 retq

See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122

This patch fixes this via s/inline/__always_inline/

While at it, replaced two "__inline__" with the usual "inline"
(the rest of the source file uses the latter).

text data bss dec filename
86971357 17195880 36659200 140826437 vmlinux.before
86971120 17195912 36659200 140826232 vmlinux

Signed-off-by: Denys Vlasenko <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: David Rientjes <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Thomas Graf <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/bitmap.h | 2 +-
include/linux/bitops.h | 6 +++---
2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index ea17cca..9653fdb 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -295,7 +295,7 @@ static inline int bitmap_full(const unsigned long *src, unsigned int nbits)
return find_first_zero_bit(src, nbits) == nbits;
}

-static inline int bitmap_weight(const unsigned long *src, unsigned int nbits)
+static __always_inline int bitmap_weight(const unsigned long *src, unsigned int nbits)
{
if (small_const_nbits(nbits))
return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 297f5bd..e635533 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -57,7 +57,7 @@ extern unsigned long __sw_hweight64(__u64 w);
(bit) < (size); \
(bit) = find_next_zero_bit((addr), (size), (bit) + 1))

-static __inline__ int get_bitmask_order(unsigned int count)
+static inline int get_bitmask_order(unsigned int count)
{
int order;

@@ -65,7 +65,7 @@ static __inline__ int get_bitmask_order(unsigned int count)
return order; /* We could be slightly more clever with -1 here... */
}

-static __inline__ int get_count_order(unsigned int count)
+static inline int get_count_order(unsigned int count)
{
int order;

@@ -75,7 +75,7 @@ static __inline__ int get_count_order(unsigned int count)
return order;
}

-static inline unsigned long hweight_long(unsigned long w)
+static __always_inline unsigned long hweight_long(unsigned long w)
{
return sizeof(w) == 4 ? hweight32(w) : hweight64(w);
}

Subject: [tip:core/types] x86/hweight: Force inlining of __arch_hweight{32,64}()

Commit-ID: d14edb1648221e59fc9fd47127fcc57bf26d759f
Gitweb: http://git.kernel.org/tip/d14edb1648221e59fc9fd47127fcc57bf26d759f
Author: Denys Vlasenko <[email protected]>
AuthorDate: Tue, 4 Aug 2015 16:15:15 +0200
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 5 Aug 2015 09:38:09 +0200

x86/hweight: Force inlining of __arch_hweight{32,64}()

With this config:

http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os

gcc-4.7.2 generates many copies of these tiny functions:

__arch_hweight32 (35 copies):
55 push %rbp
e8 66 9b 4a 00 callq __sw_hweight32
48 89 e5 mov %rsp,%rbp
5d pop %rbp
c3 retq

__arch_hweight64 (8 copies):
55 push %rbp
e8 5e c2 8a 00 callq __sw_hweight64
48 89 e5 mov %rsp,%rbp
5d pop %rbp
c3 retq

See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122

This patch fixes this via s/inline/__always_inline/

To avoid touching the 32-bit case, where such a change was not
tested to be a win, reformat __arch_hweight64() to have completely
disjoint 64-bit and 32-bit implementations. IOW: the #ifdef /
#else / #endif blocks now contain complete function definitions
for 32 bits and 64 bits, instead of having #ifdef / #else / #endif
inside a single function body. Only the 64-bit __arch_hweight64()
is __always_inline'd.

text data bss dec filename
86971120 17195912 36659200 140826232 vmlinux.before
86970954 17195912 36659200 140826066 vmlinux

Signed-off-by: Denys Vlasenko <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: David Rientjes <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Thomas Graf <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/include/asm/arch_hweight.h | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
index 9686c3d..259a7c1 100644
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -21,7 +21,7 @@
* ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective
* compiler switches.
*/
-static inline unsigned int __arch_hweight32(unsigned int w)
+static __always_inline unsigned int __arch_hweight32(unsigned int w)
{
unsigned int res = 0;

@@ -42,20 +42,23 @@ static inline unsigned int __arch_hweight8(unsigned int w)
return __arch_hweight32(w & 0xff);
}

+#ifdef CONFIG_X86_32
static inline unsigned long __arch_hweight64(__u64 w)
{
- unsigned long res = 0;
-
-#ifdef CONFIG_X86_32
return __arch_hweight32((u32)w) +
__arch_hweight32((u32)(w >> 32));
+}
#else
+static __always_inline unsigned long __arch_hweight64(__u64 w)
+{
+ unsigned long res = 0;
+
asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT)
: "="REG_OUT (res)
: REG_IN (w));
-#endif /* CONFIG_X86_32 */

return res;
}
+#endif /* CONFIG_X86_32 */

#endif

Subject: [tip:core/types] jiffies: Force inlining of {m,u}secs_to_jiffies()

Commit-ID: accd0b9ec015d611eb7783dd86f1bb31bf8d62ab
Gitweb: http://git.kernel.org/tip/accd0b9ec015d611eb7783dd86f1bb31bf8d62ab
Author: Denys Vlasenko <[email protected]>
AuthorDate: Tue, 4 Aug 2015 16:15:16 +0200
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 5 Aug 2015 09:38:09 +0200

jiffies: Force inlining of {m,u}secs_to_jiffies()

With this config:

http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os

gcc-4.7.2 generates many copies of these tiny functions:

msecs_to_jiffies (45 copies):
55 push %rbp
48 89 e5 mov %rsp,%rbp
e8 59 ec 03 00 callq __msecs_to_jiffies
5d pop %rbp
c3 retq

usecs_to_jiffies (10 copies):
55 push %rbp
48 89 e5 mov %rsp,%rbp
e8 5d 54 5e ff callq __usecs_to_jiffies
5d pop %rbp
c3 retq

See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122

This patch fixes this via s/inline/__always_inline/

text data bss dec filename
86970954 17195912 36659200 140826066 vmlinux.before
86966150 17195912 36659200 140821262 vmlinux

Signed-off-by: Denys Vlasenko <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: David Rientjes <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Thomas Graf <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/jiffies.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 535fd3b..1ba48a1 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -351,7 +351,7 @@ static inline unsigned long _msecs_to_jiffies(const unsigned int m)
* directly here and from __msecs_to_jiffies() in the case where
* constant folding is not possible.
*/
-static inline unsigned long msecs_to_jiffies(const unsigned int m)
+static __always_inline unsigned long msecs_to_jiffies(const unsigned int m)
{
if (__builtin_constant_p(m)) {
if ((int)m < 0)
@@ -405,7 +405,7 @@ static inline unsigned long _usecs_to_jiffies(const unsigned int u)
* directly here and from __msecs_to_jiffies() in the case where
* constant folding is not possible.
*/
-static inline unsigned long usecs_to_jiffies(const unsigned int u)
+static __always_inline unsigned long usecs_to_jiffies(const unsigned int u)
{
if (__builtin_constant_p(u)) {
if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))