In the overall kernel source there currently are
2544 msecs_to_jiffies
126 usecs_to_jiffies
and a few places that are using var * HZ / 1000 constructs
which are not always safe (no check of corner cases) and should
be switched to msecs_to_jiffies (roughly 25 left).
Allowing gcc to fold constants for these calls that in most
cases are passing in constants (roughly 95%) has some potential
to improve performance (and should save a few bytes).
size impact:
x86_64_defconfig:
without patches vmlinux 24628843
with patches vmlinux 24628797
ppc64_defconfig:
without patches 27412024
with patches 27412040 (no clue why it is bigger!)
multi_v7_defconfig:
without patches vmlinux.o 22901462
with patches vmlinux.o 22899843
As the changes to the top level Kbuild will impact every architecture
this is probably not enough testing - but should be suitable for a first
review.
Once this is clean a patch for usecs_to_jiffies will be provided as well
The patch set:
0001 moves timeconst.h from kernel/time/ to include/generated/ and makes
it available early enough so that the build can use the constants
for msecs_to_jiffies
0002 rename msecs_to_jiffies to __msecs_to_jiffies, the only modification
is that the < 0 is moved out.
modified version of msecs_to_jiffies that checks for constant via
call to __builtin_constant_p() and calls __msecs_to_jiffies if it
can't determine that the argument is constant.
0003 documentation update and reformatting to kernel-doc format
kernel/time/timeconst.h is moved to include/generated/ and generated in
an early build stage by top level Kbuild. This allows using timeconst.h
in an earlier stage of the build.
Signed-off-by: Nicholas Mc Guire <[email protected]>
---
Thanks to Joe Perches <[email protected]> for suggesting this approach and
catching the unconditional rebuild (should be fixed here now properly) as well
as for his review comments on the first attempts.
Some questions:
* Kbuild - is passing in arguments to .bc files via pipe and read();
rather than using multiple files acceptable or is there some reason
for the original multifile solution that Im overlooking ?
* conditional rebuild of timeconst.h with $(call filechk,gentimeconst)
not really clear if this is going to do all rebuilds that could be
necessary and not clear how to verify this. currently only checked by
1) defconfig -> build -> rebuild -> CHK is executed timeconst.h unmodified
2) defconfig -> build -> change HZ -> rebuild -> UPD timeconst.h resulting
in a rebuild of most files.
Patch was compile tested for x86_64_defconfig,multi_v7_defconfig,
ppc64_defconfig, and boot/run
tested on x86_64. Further a test-case with an invalid HZ value to trigger the
error output in kernel/time/timeconst.bc was run.
Patch is against 4.0-rc6 (localversion-next is -next-20150402)
Kbuild | 30 +++++++++++++++++++++++++-----
kernel/time/Makefile | 17 +----------------
kernel/time/time.c | 2 +-
kernel/time/timeconst.bc | 3 ++-
4 files changed, 29 insertions(+), 23 deletions(-)
diff --git a/Kbuild b/Kbuild
index 96d0629..4e1ed0b 100644
--- a/Kbuild
+++ b/Kbuild
@@ -2,8 +2,9 @@
# Kbuild for top-level directory of the kernel
# This file takes care of the following:
# 1) Generate bounds.h
-# 2) Generate asm-offsets.h (may need bounds.h)
-# 3) Check for missing system calls
+# 2) Generate timeconst.h
+# 3) Generate asm-offsets.h (may need bounds.h)
+# 4) Check for missing system calls
# Default sed regexp - multiline due to syntax constraints
define sed-y
@@ -47,7 +48,26 @@ $(obj)/$(bounds-file): kernel/bounds.s FORCE
$(call filechk,offsets,__LINUX_BOUNDS_H__)
#####
-# 2) Generate asm-offsets.h
+# 2) Generate timeconst.h
+
+timeconst-file := include/generated/timeconst.h
+
+always += $(timeconst-file)
+targets += $(timeconst-file)
+
+quiet_cmd_gentimeconst = GEN $@
+define cmd_gentimeconst
+ (echo $(CONFIG_HZ) | bc -q $< ) > $@
+endef
+define filechk_gentimeconst
+ (echo $(CONFIG_HZ) | bc -q $< )
+endef
+
+$(obj)/$(timeconst-file): kernel/time/timeconst.bc FORCE
+ $(call filechk,gentimeconst)
+
+#####
+# 3) Generate asm-offsets.h
#
offsets-file := include/generated/asm-offsets.h
@@ -66,7 +86,7 @@ $(obj)/$(offsets-file): arch/$(SRCARCH)/kernel/asm-offsets.s FORCE
$(call filechk,offsets,__ASM_OFFSETS_H__)
#####
-# 3) Check for missing system calls
+# 4) Check for missing system calls
#
always += missing-syscalls
@@ -79,4 +99,4 @@ missing-syscalls: scripts/checksyscalls.sh $(offsets-file) FORCE
$(call cmd,syscalls)
# Keep these two files during make clean
-no-clean-files := $(bounds-file) $(offsets-file)
+no-clean-files := $(bounds-file) $(offsets-file) $(timeconst-file)
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 01f0312..ffc4cc3 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -13,19 +13,4 @@ obj-$(CONFIG_TIMER_STATS) += timer_stats.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
-$(obj)/time.o: $(obj)/timeconst.h
-
-quiet_cmd_hzfile = HZFILE $@
- cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
-
-targets += hz.bc
-$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
- $(call if_changed,hzfile)
-
-quiet_cmd_bc = BC $@
- cmd_bc = bc -q $(filter-out FORCE,$^) > $@
-
-targets += timeconst.h
-$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
- $(call if_changed,bc)
-
+$(obj)/time.o: $(objtree)/include/config/
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 2c85b77..4fa1d26 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -41,7 +41,7 @@
#include <asm/uaccess.h>
#include <asm/unistd.h>
-#include "timeconst.h"
+#include <generated/timeconst.h>
#include "timekeeping.h"
/*
diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc
index 511bdf2..c7388de 100644
--- a/kernel/time/timeconst.bc
+++ b/kernel/time/timeconst.bc
@@ -50,7 +50,7 @@ define timeconst(hz) {
print "#include <linux/types.h>\n\n"
print "#if HZ != ", hz, "\n"
- print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n"
+ print "#error \qinclude/generated/timeconst.h has the wrong HZ value!\q\n"
print "#endif\n\n"
if (hz < 2) {
@@ -105,4 +105,5 @@ define timeconst(hz) {
halt
}
+hz = read();
timeconst(hz)
--
1.7.10.4
The majority of the msecs_to_jiffies() users in the kernel are passing in
constants which would allow gcc to do constant folding by checking with
__builtin_constant_p() in msecs_to_jiffies().
The original msecs_to_jiffies is renamed to __msecs_to_jiffies and aside
from the removal of the check for negative values being moved out, is
unaltered.
Signed-off-by: Nicholas Mc Guire <[email protected]>
---
Thanks to Joe Perches <[email protected]> for suggesting this approach and
for his review comments on the first attempts.
Patch was compile and boot tested (x86_64) and a few msecs_to_jiffies()
locations verified by inspection of the .lst files.
Patch is against 4.0-rc6 (localversion-next is -next-20150402)
include/linux/jiffies.h | 29 ++++++++++++++++++++++++++++-
kernel/time/time.c | 13 +++++--------
2 files changed, 33 insertions(+), 9 deletions(-)
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index c367cbd..dcd8ba5 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -7,6 +7,7 @@
#include <linux/time.h>
#include <linux/timex.h>
#include <asm/param.h> /* for HZ */
+#include <generated/timeconst.h>
/*
* The following defines establish the engineering parameters of the PLL
@@ -288,7 +289,33 @@ static inline u64 jiffies_to_nsecs(const unsigned long j)
return (u64)jiffies_to_usecs(j) * NSEC_PER_USEC;
}
-extern unsigned long msecs_to_jiffies(const unsigned int m);
+extern unsigned long __msecs_to_jiffies(const unsigned int m);
+static inline unsigned long msecs_to_jiffies(const unsigned int m)
+{
+ /*
+ * Negative value, means infinite timeout:
+ */
+ if ((int)m < 0)
+ return MAX_JIFFY_OFFSET;
+
+ if (__builtin_constant_p(m)) {
+#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
+ return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
+#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
+ if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
+ return MAX_JIFFY_OFFSET;
+ return m * (HZ / MSEC_PER_SEC);
+#else
+ if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
+ return MAX_JIFFY_OFFSET;
+
+ return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
+ >> MSEC_TO_HZ_SHR32;
+#endif
+ } else
+ return __msecs_to_jiffies(m);
+}
+
extern unsigned long usecs_to_jiffies(const unsigned int u);
extern unsigned long timespec_to_jiffies(const struct timespec *value);
extern void jiffies_to_timespec(const unsigned long jiffies,
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 4fa1d26..3797540 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -488,6 +488,8 @@ EXPORT_SYMBOL(ns_to_timespec64);
* the following way:
*
* - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
+ * negative values are handled in msecs_to_jiffies in
+ * include/linux/jiffies.h
*
* - 'too large' values [that would result in larger than
* MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
@@ -496,15 +498,10 @@ EXPORT_SYMBOL(ns_to_timespec64);
* the input value by a factor or dividing it with a factor
*
* We must also be careful about 32-bit overflows.
+ *
*/
-unsigned long msecs_to_jiffies(const unsigned int m)
+unsigned long __msecs_to_jiffies(const unsigned int m)
{
- /*
- * Negative value, means infinite timeout:
- */
- if ((int)m < 0)
- return MAX_JIFFY_OFFSET;
-
#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
/*
* HZ is equal to or smaller than 1000, and 1000 is a nice
@@ -537,7 +534,7 @@ unsigned long msecs_to_jiffies(const unsigned int m)
>> MSEC_TO_HZ_SHR32;
#endif
}
-EXPORT_SYMBOL(msecs_to_jiffies);
+EXPORT_SYMBOL(__msecs_to_jiffies);
unsigned long usecs_to_jiffies(const unsigned int u)
{
--
1.7.10.4
update the documentation of msecs_to_jiffies and move to kernel-doc format
Signed-off-by: Nicholas Mc Guire <[email protected]>
---
Patch was compile with x86_64_defconfig
Patch is against 4.0-rc6 (localversion-next is -next-20150402)
include/linux/jiffies.h | 23 +++++++++++++++++++++++
kernel/time/time.c | 19 ++++++++++---------
2 files changed, 33 insertions(+), 9 deletions(-)
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index dcd8ba5..a75158e 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -290,6 +290,29 @@ static inline u64 jiffies_to_nsecs(const unsigned long j)
}
extern unsigned long __msecs_to_jiffies(const unsigned int m);
+
+/**
+ * msecs_to_jiffies: - convert milliseconds to jiffies
+ * @m: time in millisecons
+ *
+ * conversion is done as follows:
+ *
+ * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
+ *
+ * - 'too large' values [that would result in larger than
+ * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
+ *
+ * - all other values are converted to jiffies by either multiplying
+ * the input value by a factor or dividing it with a factor and
+ * handling any 32-bit overflows.
+ * for the details see __msecs_to_jiffies()
+ *
+ * msecs_to_jiffies() checks for the passed in value being a constant
+ * via __builtin_constant_p() allowing gcc to eliminate most of the
+ * code, __msecs_to_jiffies() is called if the value passed does not
+ * allow constant folding and the actual conversion must be done at
+ * runtime.
+ */
static inline unsigned long msecs_to_jiffies(const unsigned int m)
{
/*
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 3797540..5d97610 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -483,22 +483,23 @@ struct timespec64 ns_to_timespec64(const s64 nsec)
}
EXPORT_SYMBOL(ns_to_timespec64);
#endif
-/*
- * When we convert to jiffies then we interpret incoming values
- * the following way:
+
+/**
+ * __msecs_to_jiffies: - convert milliseconds to jiffies
+ * @m: time in millisecons
*
- * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
- * negative values are handled in msecs_to_jiffies in
- * include/linux/jiffies.h
+ * conversion is done as follows:
*
* - 'too large' values [that would result in larger than
* MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
*
* - all other values are converted to jiffies by either multiplying
- * the input value by a factor or dividing it with a factor
- *
- * We must also be careful about 32-bit overflows.
+ * the input value by a factor or dividing it with a factor and
+ * handling any 32-bit overflows.
*
+ * __msecs_to_jiffies() is called if the value passed to
+ * msecs_to_jiffies() does not allow constant folding and the actual
+ * conversion must be done at runtime.
*/
unsigned long __msecs_to_jiffies(const unsigned int m)
{
--
1.7.10.4
On Sun, 2015-04-05 at 09:23 +0200, Nicholas Mc Guire wrote:
> Allowing gcc to fold constants for these calls that in most
> cases are passing in constants (roughly 95%) has some potential
> to improve performance (and should save a few bytes).
Thanks for working on this Nicholas.
On Sun, 2015-04-05 at 09:23 +0200, Nicholas Mc Guire wrote:
> The majority of the msecs_to_jiffies() users in the kernel are passing in
> constants which would allow gcc to do constant folding by checking with
> __builtin_constant_p() in msecs_to_jiffies().
>
> The original msecs_to_jiffies is renamed to __msecs_to_jiffies and aside
> from the removal of the check for negative values being moved out, is
> unaltered.
At least for gcc 4.9, this doesn't allow the compiler
to optimize / precalculation msecs_to_jiffies calls
with a constant.
This does: (on top of your patch x86-64 defconfig)
$ size vmlinux.o.*
text data bss dec hex filename
11770523 1505971 1018454 14294948 da1fa4 vmlinux.o.next-b0a12fb5bc8
11770530 1505971 1018454 14294955 da1fab vmlinux.o.next-b0a12fb5bc8-inline
11768734 1505971 1018454 14293159 da18a7 vmlinux.o.next-b0a12fb5bc8-macro
I think this should still move the if (m) < 0 back into the
original __msecs_to_jiffies function.
---
include/linux/jiffies.h | 71 ++++++++++++++++++++++++++++++++-----------------
1 file changed, 46 insertions(+), 25 deletions(-)
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index a75158e..f8fe9f7 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -291,6 +291,39 @@ static inline u64 jiffies_to_nsecs(const unsigned long j)
extern unsigned long __msecs_to_jiffies(const unsigned int m);
+#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
+#define const_msecs_to_jiffies(m) \
+({ \
+ unsigned long j; \
+ if ((int)m < 0) \
+ j = MAX_JIFFY_OFFSET; \
+ else \
+ j = (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); \
+ j; \
+})
+#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
+#define const_msecs_to_jiffies(m) \
+({ \
+ unsigned long j; \
+ if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) \
+ j = MAX_JIFFY_OFFSET; \
+ else \
+ j = m * (HZ / MSEC_PER_SEC); \
+ j; \
+})
+#else
+#define const_msecs_to_jiffies(m) \
+({ \
+ unsigned long j; \
+ if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))\
+ j = MAX_JIFFY_OFFSET; \
+ else \
+ j = (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) \
+ >> MSEC_TO_HZ_SHR32; \
+ j; \
+})
+#endif
+
/**
* msecs_to_jiffies: - convert milliseconds to jiffies
* @m: time in millisecons
@@ -313,31 +346,19 @@ extern unsigned long __msecs_to_jiffies(const unsigned int m);
* allow constant folding and the actual conversion must be done at
* runtime.
*/
-static inline unsigned long msecs_to_jiffies(const unsigned int m)
-{
- /*
- * Negative value, means infinite timeout:
- */
- if ((int)m < 0)
- return MAX_JIFFY_OFFSET;
-
- if (__builtin_constant_p(m)) {
-#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
- return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
-#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
- if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
- return MAX_JIFFY_OFFSET;
- return m * (HZ / MSEC_PER_SEC);
-#else
- if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
- return MAX_JIFFY_OFFSET;
-
- return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
- >> MSEC_TO_HZ_SHR32;
-#endif
- } else
- return __msecs_to_jiffies(m);
-}
+#define msecs_to_jiffies(m) \
+({ \
+ unsigned long j; \
+ if (__builtin_constant_p(m)) { \
+ if ((int)m < 0) \
+ j = MAX_JIFFY_OFFSET; \
+ else \
+ j = const_msecs_to_jiffies(m); \
+ } else { \
+ j = __msecs_to_jiffies(m); \
+ } \
+ j; \
+})
extern unsigned long usecs_to_jiffies(const unsigned int u);
extern unsigned long timespec_to_jiffies(const struct timespec *value);
On Sun, 05 Apr 2015, Joe Perches wrote:
> On Sun, 2015-04-05 at 09:23 +0200, Nicholas Mc Guire wrote:
> > The majority of the msecs_to_jiffies() users in the kernel are passing in
> > constants which would allow gcc to do constant folding by checking with
> > __builtin_constant_p() in msecs_to_jiffies().
> >
> > The original msecs_to_jiffies is renamed to __msecs_to_jiffies and aside
> > from the removal of the check for negative values being moved out, is
> > unaltered.
>
> At least for gcc 4.9, this doesn't allow the compiler
> to optimize / precalculation msecs_to_jiffies calls
> with a constant.
>
> This does: (on top of your patch x86-64 defconfig)
>
> $ size vmlinux.o.*
> text data bss dec hex filename
> 11770523 1505971 1018454 14294948 da1fa4 vmlinux.o.next-b0a12fb5bc8
> 11770530 1505971 1018454 14294955 da1fab vmlinux.o.next-b0a12fb5bc8-inline
> 11768734 1505971 1018454 14293159 da18a7 vmlinux.o.next-b0a12fb5bc8-macro
>
> I think this should still move the if (m) < 0 back into the
> original __msecs_to_jiffies function.
>
could you check if you can reproduce the results below ?
my assumption was that gcc would always optimize out an
if(CONST < 0) return CONST; reducing it to the return CONST;
only and thus this should not make any difference but Im not
that familiar with gcc.
gcc versions here are:
for x86 gcc version 4.7.2 (Debian 4.7.2-5)
for powerpc it is a gcc version 4.9.2 (crosstool-NG 1.20.0)
for arm gcc version 4.9.2 20140904 (prerelease) (crosstool-NG linaro-1.13.1-4.9-2014.09 - Linaro GCC 4.9-2014.09)
Procedure used:
root@debian:~/linux-next# make distclean
root@debian:~/linux-next# make defconfig
root@debian:~/linux-next# make drivers/net/wireless/p54/p54usb.lst
root@debian:~/linux-next# make drivers/net/wireless/p54/p54usb.s
same setup in unpatched /usr/src/linux-next/
e.g:
root@debian:/usr/src/linux-next# grep msecs_to_jiffies drivers/net/wireless/p54/p54usb.c
timeout = jiffies + msecs_to_jiffies(1000);
timeout = jiffies + msecs_to_jiffies(1000);
So both calls are constants and should be optimized out if it works as
expected.
without the patch applied:
root@debian:/usr/src/linux-next# grep msecs_to_jiffies drivers/net/wireless/p54/p54usb.s
call msecs_to_jiffies #
call msecs_to_jiffies #
root@debian:/usr/src/linux-next# grep msecs_to_jiffies drivers/net/wireless/p54/p54usb.lst
timeout = jiffies + msecs_to_jiffies(1000);
e19: R_X86_64_PC32 msecs_to_jiffies+0xfffffffffffffffc
timeout = jiffies + msecs_to_jiffies(1000);
timeout = jiffies + msecs_to_jiffies(1000);
fd8: R_X86_64_PC32 msecs_to_jiffies+0xfffffffffffffffc
timeout = jiffies + msecs_to_jiffies(1000);
with the patch applied this then gives me:
root@debian:~/linux-next# grep msecs_to_jiffies drivers/net/wireless/p54/p54usb.s
root@debian:~/linux-next# grep msecs_to_jiffies drivers/net/wireless/p54/p54usb.lst
timeout = jiffies + msecs_to_jiffies(1000);
timeout = jiffies + msecs_to_jiffies(1000);
timeout = jiffies + msecs_to_jiffies(1000);
timeout = jiffies + msecs_to_jiffies(1000);
Conversely in kernel/sched/core.c the msecs_to_jiffies is not a constant
and the result is that it calls __msecs_to_jiffies
patched:
root@debian:~/linux-next# grep msecs_to_jiffies kernel/sched/core.s
call __msecs_to_jiffies #
unpatched:
root@debian:/usr/src/linux-next# grep msecs_to_jiffies kernel/sched/core.s
call msecs_to_jiffies #
Could you check if you get these results for this test-case ?
If this really were compiler dependant that would be very bad.
I did move the < 0 check - but that did not change the situation here.
but it well may be that there are some cases where this does make a
difference
thx!
hofrat
On Mon, 2015-04-06 at 03:00 +0200, Nicholas Mc Guire wrote:
> On Sun, 05 Apr 2015, Joe Perches wrote:
>
> > On Sun, 2015-04-05 at 09:23 +0200, Nicholas Mc Guire wrote:
> > > The majority of the msecs_to_jiffies() users in the kernel are passing in
> > > constants which would allow gcc to do constant folding by checking with
> > > __builtin_constant_p() in msecs_to_jiffies().
> > >
> > > The original msecs_to_jiffies is renamed to __msecs_to_jiffies and aside
> > > from the removal of the check for negative values being moved out, is
> > > unaltered.
> >
> > At least for gcc 4.9, this doesn't allow the compiler
> > to optimize / precalculation msecs_to_jiffies calls
> > with a constant.
> >
> > This does: (on top of your patch x86-64 defconfig)
> >
> > $ size vmlinux.o.*
> > text data bss dec hex filename
> > 11770523 1505971 1018454 14294948 da1fa4 vmlinux.o.next-b0a12fb5bc8
> > 11770530 1505971 1018454 14294955 da1fab vmlinux.o.next-b0a12fb5bc8-inline
> > 11768734 1505971 1018454 14293159 da18a7 vmlinux.o.next-b0a12fb5bc8-macro
> >
> > I think this should still move the if (m) < 0 back into the
> > original __msecs_to_jiffies function.
> >
>
> could you check if you can reproduce the results below ?
> my assumption was that gcc would always optimize out an
> if(CONST < 0) return CONST; reducing it to the return CONST;
> only and thus this should not make any difference but Im not
> that familiar with gcc.
>
> gcc versions here are:
> for x86 gcc version 4.7.2 (Debian 4.7.2-5)
> for powerpc it is a gcc version 4.9.2 (crosstool-NG 1.20.0)
> for arm gcc version 4.9.2 20140904 (prerelease) (crosstool-NG linaro-1.13.1-4.9-2014.09 - Linaro GCC 4.9-2014.09)
>
> Procedure used:
> root@debian:~/linux-next# make distclean
> root@debian:~/linux-next# make defconfig
> root@debian:~/linux-next# make drivers/net/wireless/p54/p54usb.lst
> root@debian:~/linux-next# make drivers/net/wireless/p54/p54usb.s
>
> same setup in unpatched /usr/src/linux-next/
>
> e.g:
> root@debian:/usr/src/linux-next# grep msecs_to_jiffies drivers/net/wireless/p54/p54usb.c
> timeout = jiffies + msecs_to_jiffies(1000);
> timeout = jiffies + msecs_to_jiffies(1000);
>
> So both calls are constants and should be optimized out if it works as
> expected.
>
> without the patch applied:
>
> root@debian:/usr/src/linux-next# grep msecs_to_jiffies drivers/net/wireless/p54/p54usb.s
> call msecs_to_jiffies #
> call msecs_to_jiffies #
> root@debian:/usr/src/linux-next# grep msecs_to_jiffies drivers/net/wireless/p54/p54usb.lst
> timeout = jiffies + msecs_to_jiffies(1000);
> e19: R_X86_64_PC32 msecs_to_jiffies+0xfffffffffffffffc
> timeout = jiffies + msecs_to_jiffies(1000);
> timeout = jiffies + msecs_to_jiffies(1000);
> fd8: R_X86_64_PC32 msecs_to_jiffies+0xfffffffffffffffc
> timeout = jiffies + msecs_to_jiffies(1000);
>
>
> with the patch applied this then gives me:
>
> root@debian:~/linux-next# grep msecs_to_jiffies drivers/net/wireless/p54/p54usb.s
> root@debian:~/linux-next# grep msecs_to_jiffies drivers/net/wireless/p54/p54usb.lst
> timeout = jiffies + msecs_to_jiffies(1000);
> timeout = jiffies + msecs_to_jiffies(1000);
> timeout = jiffies + msecs_to_jiffies(1000);
> timeout = jiffies + msecs_to_jiffies(1000);
>
> Conversely in kernel/sched/core.c the msecs_to_jiffies is not a constant
> and the result is that it calls __msecs_to_jiffies
>
> patched:
> root@debian:~/linux-next# grep msecs_to_jiffies kernel/sched/core.s
> call __msecs_to_jiffies #
>
> unpatched:
> root@debian:/usr/src/linux-next# grep msecs_to_jiffies kernel/sched/core.s
> call msecs_to_jiffies #
>
>
> Could you check if you get these results for this test-case ?
> If this really were compiler dependant that would be very bad.
Hi Nicholas.
Your inline version has not worked with any of
x86-64 gcc 4.4, 4.6, 4.7, or 4.9
I suggest you add some lines to
lib/test_module.c/test_module_init like:
unsigned int m;
for (m = 10; m < 200; m += 10)
pr_info("msecs_to_jiffies(%u) is %lu\n",
m, msecs_to_jiffies(m));
pr_info("msecs_to_jiffies(%u) is %lu\n",
10, msecs_to_jiffies(10));
pr_info("msecs_to_jiffies(%u) is %lu\n",
100, msecs_to_jiffies(100));
pr_info("msecs_to_jiffies(%u) is %lu\n",
1000, msecs_to_jiffies(1000));
Then it's pretty easy to look at the assembly/.lst file
Your inline function doesn't allow gcc to precompute
the msecs_to_jiffies value. The macro one does for all
those gcc versions.
Try it and look at the generated .lst files with and
without the patch I sent.
cheers, Joe
On Sun, 05 Apr 2015, Joe Perches wrote:
> On Mon, 2015-04-06 at 03:00 +0200, Nicholas Mc Guire wrote:
> > On Sun, 05 Apr 2015, Joe Perches wrote:
> >
> > > On Sun, 2015-04-05 at 09:23 +0200, Nicholas Mc Guire wrote:
> > > > The majority of the msecs_to_jiffies() users in the kernel are passing in
> > > > constants which would allow gcc to do constant folding by checking with
> > > > __builtin_constant_p() in msecs_to_jiffies().
> > > >
> > > > The original msecs_to_jiffies is renamed to __msecs_to_jiffies and aside
> > > > from the removal of the check for negative values being moved out, is
> > > > unaltered.
> > >
> > > At least for gcc 4.9, this doesn't allow the compiler
> > > to optimize / precalculation msecs_to_jiffies calls
> > > with a constant.
> > >
> > > This does: (on top of your patch x86-64 defconfig)
> > >
> > > $ size vmlinux.o.*
> > > text data bss dec hex filename
> > > 11770523 1505971 1018454 14294948 da1fa4 vmlinux.o.next-b0a12fb5bc8
> > > 11770530 1505971 1018454 14294955 da1fab vmlinux.o.next-b0a12fb5bc8-inline
> > > 11768734 1505971 1018454 14293159 da18a7 vmlinux.o.next-b0a12fb5bc8-macro
> > >
> > > I think this should still move the if (m) < 0 back into the
> > > original __msecs_to_jiffies function.
> > >
> >
> > could you check if you can reproduce the results below ?
> > my assumption was that gcc would always optimize out an
> > if(CONST < 0) return CONST; reducing it to the return CONST;
> > only and thus this should not make any difference but Im not
> > that familiar with gcc.
> >
> > gcc versions here are:
> > for x86 gcc version 4.7.2 (Debian 4.7.2-5)
> > for powerpc it is a gcc version 4.9.2 (crosstool-NG 1.20.0)
> > for arm gcc version 4.9.2 20140904 (prerelease) (crosstool-NG linaro-1.13.1-4.9-2014.09 - Linaro GCC 4.9-2014.09)
> >
> > Procedure used:
> > root@debian:~/linux-next# make distclean
> > root@debian:~/linux-next# make defconfig
> > root@debian:~/linux-next# make drivers/net/wireless/p54/p54usb.lst
> > root@debian:~/linux-next# make drivers/net/wireless/p54/p54usb.s
> >
> > same setup in unpatched /usr/src/linux-next/
> >
> > e.g:
> > root@debian:/usr/src/linux-next# grep msecs_to_jiffies drivers/net/wireless/p54/p54usb.c
> > timeout = jiffies + msecs_to_jiffies(1000);
> > timeout = jiffies + msecs_to_jiffies(1000);
> >
> > So both calls are constants and should be optimized out if it works as
> > expected.
> >
> > without the patch applied:
> >
> > root@debian:/usr/src/linux-next# grep msecs_to_jiffies drivers/net/wireless/p54/p54usb.s
> > call msecs_to_jiffies #
> > call msecs_to_jiffies #
> > root@debian:/usr/src/linux-next# grep msecs_to_jiffies drivers/net/wireless/p54/p54usb.lst
> > timeout = jiffies + msecs_to_jiffies(1000);
> > e19: R_X86_64_PC32 msecs_to_jiffies+0xfffffffffffffffc
> > timeout = jiffies + msecs_to_jiffies(1000);
> > timeout = jiffies + msecs_to_jiffies(1000);
> > fd8: R_X86_64_PC32 msecs_to_jiffies+0xfffffffffffffffc
> > timeout = jiffies + msecs_to_jiffies(1000);
> >
> >
> > with the patch applied this then gives me:
> >
> > root@debian:~/linux-next# grep msecs_to_jiffies drivers/net/wireless/p54/p54usb.s
> > root@debian:~/linux-next# grep msecs_to_jiffies drivers/net/wireless/p54/p54usb.lst
> > timeout = jiffies + msecs_to_jiffies(1000);
> > timeout = jiffies + msecs_to_jiffies(1000);
> > timeout = jiffies + msecs_to_jiffies(1000);
> > timeout = jiffies + msecs_to_jiffies(1000);
> >
> > Conversely in kernel/sched/core.c the msecs_to_jiffies is not a constant
> > and the result is that it calls __msecs_to_jiffies
> >
> > patched:
> > root@debian:~/linux-next# grep msecs_to_jiffies kernel/sched/core.s
> > call __msecs_to_jiffies #
> >
> > unpatched:
> > root@debian:/usr/src/linux-next# grep msecs_to_jiffies kernel/sched/core.s
> > call msecs_to_jiffies #
> >
> >
> > Could you check if you get these results for this test-case ?
> > If this really were compiler dependant that would be very bad.
>
> Hi Nicholas.
>
> Your inline version has not worked with any of
> x86-64 gcc 4.4, 4.6, 4.7, or 4.9
>
> I suggest you add some lines to
> lib/test_module.c/test_module_init like:
>
> unsigned int m;
>
> for (m = 10; m < 200; m += 10)
> pr_info("msecs_to_jiffies(%u) is %lu\n",
> m, msecs_to_jiffies(m));
>
> pr_info("msecs_to_jiffies(%u) is %lu\n",
> 10, msecs_to_jiffies(10));
> pr_info("msecs_to_jiffies(%u) is %lu\n",
> 100, msecs_to_jiffies(100));
> pr_info("msecs_to_jiffies(%u) is %lu\n",
> 1000, msecs_to_jiffies(1000));
>
> Then it's pretty easy to look at the assembly/.lst file
>
> Your inline function doesn't allow gcc to precompute
> the msecs_to_jiffies value. The macro one does for all
> those gcc versions.
>
> Try it and look at the generated .lst files with and
> without the patch I sent.
>
will do that - thanks !
Managed to fool my self - the difference in the .lst/.s files is
simply due to chaning msecs_to_jiffies being inline
(which it was not) and thus it "vanished" - kind of stupid - sorry
back to gcc manual first - need to understand __builtin_constant_p
better and the constraints - from all that I understood it should
be doable both as macro and inline.
thx!
hofrat
On Mon, 2015-04-06 at 06:26 +0200, Nicholas Mc Guire wrote:
> On Sun, 05 Apr 2015, Joe Perches wrote:
> > Try it and look at the generated .lst files with and
> > without the patch I sent.
[]
> from all that I understood it should
> be doable both as macro and inline.
I think it _should_ be doable too but I also think
the only reason gcc doesn't optimize the inline
is because gcc's optimizer isn't good enough yet.
On Sun, 05 Apr 2015, Joe Perches wrote:
> On Mon, 2015-04-06 at 06:26 +0200, Nicholas Mc Guire wrote:
> > On Sun, 05 Apr 2015, Joe Perches wrote:
> > > Try it and look at the generated .lst files with and
> > > without the patch I sent.
> []
> > from all that I understood it should
> > be doable both as macro and inline.
>
> I think it _should_ be doable too but I also think
> the only reason gcc doesn't optimize the inline
> is because gcc's optimizer isn't good enough yet.
>
"unfortunately" I can't blame it on gcc - here is the initial toy-case
- test.c and either testi.h or testm.h included
- m = TIMEOUT or m = atoi(argv[1]);
both in the inline and the macro case gcc reduced the code to a single
load mediate or register instruction for the constant - so the optimizer
is doing its job.
test.c:
#include <stdio.h>
#define HZ 100
#define MSECS_PER_SEC 1000
#define TIMEOUT 100
#include "testi.h" /* inline msecs_to_jiffies */
//#include "testm.h" /* macro versions */
int main(int argc, char **argv) {
//int m = atoi(argv[0]); /* non-const */
int m = TIMEOUT; /* const */
printf("%lu\n",msecs_to_jiffies(m));
return 0;
}
testm.h:
#define msecs_to_jiffies(m) \
(__builtin_constant_p (m) \
? ((m) * HZ / MSECS_PER_SEC ) : __msecs_to_jiffies(m))
unsigned long __msecs_to_jiffies(int m)
{
return m * HZ / MSECS_PER_SEC ;
}
first case with a non-const
main:
.LFB12:
.cfi_startproc
subq $8, %rsp #,
.cfi_def_cfa_offset 16
movq 8(%rsi), %rdi # MEM[(char * *)argv_2(D) + 8B], MEM[(char * *)argv_2(D) + 8B]
xorl %eax, %eax #
call atoi #
movl $1717986919, %edx #, tmp69
movl %eax, %ecx #, m
movl $.LC0, %edi #,
imull %edx # tmp69
sarl $31, %ecx #, tmp71
xorl %eax, %eax #
sarl $2, %edx #, tmp67
subl %ecx, %edx # tmp71, tmp67
movslq %edx, %rsi # tmp67, tmp72
call printf #
o
second with a constant:
main:
.LFB12:
.cfi_startproc
subq $8, %rsp #,
.cfi_def_cfa_offset 16
movl $10, %esi #,
movl $.LC0, %edi #,
xorl %eax, %eax #
call printf #
inline:
-------
testi.h:
static inline unsigned long __msecs_to_jiffies(int m)
{
return m * HZ / MSECS_PER_SEC;
}
static inline unsigned long msecs_to_jiffies(int m)
{
return __builtin_constant_p (m) ?
(m) * HZ / MSECS_PER_SEC : __msecs_to_jiffies(m);
}
first case with a non-const
main:
.LFB13:
.cfi_startproc
subq $8, %rsp #,
.cfi_def_cfa_offset 16
movq (%rsi), %rdi # *argv_1(D),
xorl %eax, %eax #
call atoi #
movl $1717986919, %edx #, tmp68
movl %eax, %ecx #, m
movl $.LC0, %edi #,
imull %edx # tmp68
sarl $31, %ecx #, tmp70
xorl %eax, %eax #
sarl $2, %edx #, tmp66
subl %ecx, %edx # tmp70, tmp66
movslq %edx, %rsi # tmp66, tmp71
call printf #
second with a constant:
main:
.LFB13:
.cfi_startproc
subq $8, %rsp #,
.cfi_def_cfa_offset 16
xorl %esi, %esi #
movl $.LC0, %edi #,
xorl %eax, %eax #
call printf #
giving it another run from scratch somewhere I simply screwed up or
overlooked some detail.
thx!
hofrat
On Mon, 2015-04-06 at 08:40 +0200, Nicholas Mc Guire wrote:
> #define msecs_to_jiffies(m) \
> (__builtin_constant_p (m) \
> ? ((m) * HZ / MSECS_PER_SEC ) : __msecs_to_jiffies(m))
[]
> main:
> .LFB12:
> .cfi_startproc
> subq $8, %rsp #,
> .cfi_def_cfa_offset 16
> movl $10, %esi #,
> movl $.LC0, %edi #,
> xorl %eax, %eax #
> call printf #
vs:
> static inline unsigned long msecs_to_jiffies(int m)
> {
> return __builtin_constant_p (m) ?
> (m) * HZ / MSECS_PER_SEC : __msecs_to_jiffies(m);
> }
[]
> main:
> .LFB13:
> .cfi_startproc
> subq $8, %rsp #,
> .cfi_def_cfa_offset 16
> xorl %esi, %esi #
> movl $.LC0, %edi #,
> xorl %eax, %eax #
> call printf #
>
> giving it another run from scratch somewhere I simply screwed up or
> overlooked some detail.
If the optimizer was doing it's job properly, wouldn't
the macro and inline output object code be the same?
On Mon, 06 Apr 2015, Joe Perches wrote:
> On Mon, 2015-04-06 at 08:40 +0200, Nicholas Mc Guire wrote:
> > #define msecs_to_jiffies(m) \
> > (__builtin_constant_p (m) \
> > ? ((m) * HZ / MSECS_PER_SEC ) : __msecs_to_jiffies(m))
> []
> > main:
> > .LFB12:
> > .cfi_startproc
> > subq $8, %rsp #,
> > .cfi_def_cfa_offset 16
> > movl $10, %esi #,
> > movl $.LC0, %edi #,
> > xorl %eax, %eax #
> > call printf #
>
> vs:
>
> > static inline unsigned long msecs_to_jiffies(int m)
> > {
> > return __builtin_constant_p (m) ?
> > (m) * HZ / MSECS_PER_SEC : __msecs_to_jiffies(m);
> > }
> []
> > main:
> > .LFB13:
> > .cfi_startproc
> > subq $8, %rsp #,
> > .cfi_def_cfa_offset 16
> > xorl %esi, %esi #
> > movl $.LC0, %edi #,
> > xorl %eax, %eax #
> > call printf #
> >
> > giving it another run from scratch somewhere I simply screwed up or
> > overlooked some detail.
>
> If the optimizer was doing it's job properly, wouldn't
> the macro and inline output object code be the same?
>
yes - and they are - that was my mistake I grabed the
wrong asm snippet - here is the complete test case
also made a mess of the code while trimming down
the mail - so here is the single test case showing,
I think, that inline works as well and as expected.
testi.h:
#define HZ 100
#define MSECS_PER_SEC 1000
#define TIMEOUT 100
extern inline unsigned long __msecs_to_jiffies(int m);
unsigned long msecs_to_jiffies(int m)
{
return __builtin_constant_p(m) ? ((m) * HZ / MSECS_PER_SEC ) : __msecs_to_jiffies(m);
}
test.c:
#include <stdio.h>
#include "testi.h"
unsigned long __msecs_to_jiffies(int m)
{
return (m * HZ / MSECS_PER_SEC);
}
int main(int argc, char **argv) {
//int m = atoi(argv[1]);
int m = TIMEOUT;
printf("%lu\n",msecs_to_jiffies(m));
return 0;
}
compiled with:
gcc -O2 -S --verbose-asm test.c
<snip>
main:
.LFB13:
.cfi_startproc
subq $8, %rsp #,
.cfi_def_cfa_offset 16
movl $10, %esi #,
movl $.LC0, %edi #,
xorl %eax, %eax #
call printf #
<snip>
need to cleanup here :)
thx!
hofrat
> Your inline version has not worked with any of
> x86-64 gcc 4.4, 4.6, 4.7, or 4.9
>
> I suggest you add some lines to
> lib/test_module.c/test_module_init like:
>
> unsigned int m;
>
> for (m = 10; m < 200; m += 10)
> pr_info("msecs_to_jiffies(%u) is %lu\n",
> m, msecs_to_jiffies(m));
>
> pr_info("msecs_to_jiffies(%u) is %lu\n",
> 10, msecs_to_jiffies(10));
> pr_info("msecs_to_jiffies(%u) is %lu\n",
> 100, msecs_to_jiffies(100));
> pr_info("msecs_to_jiffies(%u) is %lu\n",
> 1000, msecs_to_jiffies(1000));
>
> Then it's pretty easy to look at the assembly/.lst file
>
> Your inline function doesn't allow gcc to precompute
> the msecs_to_jiffies value. The macro one does for all
> those gcc versions.
>
> Try it and look at the generated .lst files with and
> without the patch I sent.
>
I have checked it with the testcase you proposed - and I quite sure
it is working, find the .s file snippets from the test_module.c modified
as you suggested below. HZ is set to 300 so that it is easy to differenciate
the parameter to msecs_to_jiffies and the printed const.
there is no call and no calculation for the constant cases - just load as
it should be.
with the patch applied:
test_module_init:
pushq %rbp #
movq %rsp, %rbp #,
pushq %rbx #
movl $10, %ebx #, m
pushq %rcx #
.L4:
movl %ebx, %edi # m,
call __msecs_to_jiffies #
movl %ebx, %esi # m,
movq %rax, %rdx # D.14515,
movq $.LC1, %rdi #,
xorl %eax, %eax #
addl $10, %ebx #, m
call printk #
cmpl $200, %ebx #, m
jne .L4 #, <---end of for loop
movl $3, %edx #, <---msecs_to_jiffies(10)
movl $10, %esi #,
movq $.LC1, %rdi #,
xorl %eax, %eax #
call printk #
movl $30, %edx #, <---msecs_to_jiffies(100)
movl $100, %esi #,
movq $.LC1, %rdi #,
xorl %eax, %eax #
call printk #
movl $300, %edx #, <---msecs_to_jiffies(1000)
movl $1000, %esi #,
movq $.LC1, %rdi #,
xorl %eax, %eax #
call printk #
without the patch applied:
pushq %rbp #
movq %rsp, %rbp #,
pushq %rbx #
movl $10, %ebx #, m
pushq %rcx #
.L2:
movl %ebx, %edi # m,
call msecs_to_jiffies #
movl %ebx, %esi # m,
movq %rax, %rdx # D.14503,
movq $.LC0, %rdi #,
xorl %eax, %eax #
addl $10, %ebx #, m
call printk #
cmpl $200, %ebx #, m
jne .L2 #,
movl $10, %edi #,
call msecs_to_jiffies #
movl $10, %esi #,
movq %rax, %rdx # D.14504,
movq $.LC0, %rdi #,
xorl %eax, %eax #
call printk #
movl $100, %edi #,
call msecs_to_jiffies #
movl $100, %esi #,
movq %rax, %rdx # D.14505,
movq $.LC0, %rdi #,
xorl %eax, %eax #
call printk #
movl $1000, %edi #,
could you check the .s file for you test module ?
I'm a bit lost on why you are not seeing this - also
checked with cross-build (multi_v7_defconfig) and it
looks like thats working as well.
thx!
hofrat