On the x86 architecture the get_order() function can be optimized by
using the bsr assembler instruction.

This is the second version of the patch, in which gcc precomputes the
result for constant arguments.
Signed-off-by: Maksym Planeta <[email protected]>
---
arch/x86/include/asm/getorder.h | 48 +++++++++++++++++++++++++++++++++++++++
arch/x86/include/asm/page.h | 2 +-
2 files changed, 49 insertions(+), 1 deletions(-)
create mode 100644 arch/x86/include/asm/getorder.h
diff --git a/arch/x86/include/asm/getorder.h b/arch/x86/include/asm/getorder.h
new file mode 100644
index 0000000..b0c6f57
--- /dev/null
+++ b/arch/x86/include/asm/getorder.h
@@ -0,0 +1,48 @@
+#ifndef __ASM_GENERIC_GETORDER_H
+#define __ASM_GENERIC_GETORDER_H
+
+#ifndef __ASSEMBLY__
+
+#include <linux/compiler.h>
+
+#ifdef CONFIG_X86_CMOV
+#define ASM_CMOVZ(op, dest) \
+	"cmovzl %" #op ",%" #dest ";\n\t"
+#else
+#define ASM_CMOVZ(op, dest) \
+	"jnz 1f;\n\t" \
+	"movl %" #op ", %" #dest ";\n\t" \
+	"1: "
+#endif
+
+static __always_inline int __get_order(unsigned long size)
+{
+	int order;
+
+	size = (size - 1) >> (PAGE_SHIFT - 1);
+	asm("bsr %1, %0\n\t"
+	    ASM_CMOVZ(2, 0)
+	    : "=&r" (order) : "rm" (size), "rm" (0));
+	return order;
+}
+
+/* Pure 2^n version of get_order */
+static inline __attribute_const__ int get_order(unsigned long size)
+{
+	int order;
+
+	if (__builtin_constant_p(size)) {
+		size = (size - 1) >> (PAGE_SHIFT - 1);
+		order = -1;
+		do {
+			size >>= 1;
+			order++;
+		} while (size);
+		return order;
+	}
+	return __get_order(size);
+}
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __ASM_GENERIC_GETORDER_H */
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 8ca8283..10e4c45 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -63,7 +63,7 @@ extern bool __virt_addr_valid(unsigned long kaddr);
 #endif /* __ASSEMBLY__ */
 
 #include <asm-generic/memory_model.h>
-#include <asm-generic/getorder.h>
+#include <asm/getorder.h>
 
 #define __HAVE_ARCH_GATE_AREA 1
--
1.7.2.3
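For reference, the effect of the bsr-based __get_order() above can be
expressed in portable C. This is a minimal sketch, not part of the patch,
assuming PAGE_SHIFT is 12 as on x86; the while loop stands in for the bsr
instruction, and returning order 0 for a zero input mirrors the
ASM_CMOVZ() fallback:

#include <stdio.h>

#define PAGE_SHIFT 12

/* Hypothetical helper, equivalent in effect to the asm above */
static int get_order_sketch(unsigned long size)
{
	int order = 0;

	size = (size - 1) >> (PAGE_SHIFT - 1);
	while (size >>= 1)	/* index of the highest set bit == bsr */
		order++;
	/* bsr leaves its result undefined for 0; the patch's cmovz
	   substitutes 0 there, which order == 0 matches here */
	return order;
}

int main(void)
{
	printf("%d\n", get_order_sketch(4096));	/* 0: one page */
	printf("%d\n", get_order_sketch(8192));	/* 1: two pages */
	printf("%d\n", get_order_sketch(8193));	/* 2: rounds up */
	return 0;
}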
* Maksym Planeta <[email protected]> wrote:
> On the x86 architecture the get_order() function can be optimized by
> using the bsr assembler instruction.
>
> This is the second version of the patch, in which gcc precomputes the
> result for constant arguments.
>
> Signed-off-by: Maksym Planeta <[email protected]>
> ---
> arch/x86/include/asm/getorder.h | 48 +++++++++++++++++++++++++++++++++++++++
> arch/x86/include/asm/page.h | 2 +-
> 2 files changed, 49 insertions(+), 1 deletions(-)
> create mode 100644 arch/x86/include/asm/getorder.h
>
> diff --git a/arch/x86/include/asm/getorder.h b/arch/x86/include/asm/getorder.h
> new file mode 100644
> index 0000000..b0c6f57
> --- /dev/null
> +++ b/arch/x86/include/asm/getorder.h
> @@ -0,0 +1,48 @@
> +#ifndef __ASM_GENERIC_GETORDER_H
> +#define __ASM_GENERIC_GETORDER_H
> +
> +#ifndef __ASSEMBLY__
> +
> +#include <linux/compiler.h>
> +
> +#ifdef CONFIG_X86_CMOV
> +#define ASM_CMOVZ(op, dest) \
> +	"cmovzl %" #op ",%" #dest ";\n\t"
> +#else
> +#define ASM_CMOVZ(op, dest) \
> +	"jnz 1f;\n\t" \
> +	"movl %" #op ", %" #dest ";\n\t" \
> +	"1: "
> +#endif
> +
> +static __always_inline int __get_order(unsigned long size)
> +{
> +	int order;
> +
> +	size = (size - 1) >> (PAGE_SHIFT - 1);
> +	asm("bsr %1, %0\n\t"
> +	    ASM_CMOVZ(2, 0)
> +	    : "=&r" (order) : "rm" (size), "rm" (0));
> +	return order;
> +}
> +
> +/* Pure 2^n version of get_order */
> +static inline __attribute_const__ int get_order(unsigned long size)
> +{
> +	int order;
> +
> +	if (__builtin_constant_p(size)) {
> +		size = (size - 1) >> (PAGE_SHIFT - 1);
> +		order = -1;
> +		do {
> +			size >>= 1;
> +			order++;
> +		} while (size);
> +		return order;
> +	}
> +	return __get_order(size);
> +}
> +
> +#endif /* __ASSEMBLY__ */
> +
> +#endif /* __ASM_GENERIC_GETORDER_H */
> diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
> index 8ca8283..10e4c45 100644
> --- a/arch/x86/include/asm/page.h
> +++ b/arch/x86/include/asm/page.h
> @@ -63,7 +63,7 @@ extern bool __virt_addr_valid(unsigned long kaddr);
>  #endif /* __ASSEMBLY__ */
>  
>  #include <asm-generic/memory_model.h>
> -#include <asm-generic/getorder.h>
> +#include <asm/getorder.h>
Just wondering, what's the before/after 'size vmlinux' effect on a 'make
defconfig' x86 kernel? Does the optimization make the kernel smaller as well,
besides making it faster?
Thanks,
Ingo
On Sunday 27 March 2011 at 13:33:23, Ingo Molnar wrote:
>
> Just wondering, what's the before/after 'size vmlinux' effect on a 'make
> defconfig' x86 kernel? Does the optimization make the kernel smaller as
> well, besides making it faster?
>
> Thanks,
>
> Ingo
Hi,
I compiled Linus' tree (40471856) on my x86_64 machine, once without the
patch and once with the patch.
size vmlinux-with*
text data bss dec hex filename
7996901 1236036 1118208 10351145 9df229 vmlinux-withoutpatch
7996554 1235988 1118208 10350750 9df09e vmlinux-withpatch
ls -la vmlinux-with*
-rwxr-xr-x 1 peter users 18921237 27. Mär 17:40 vmlinux-withoutpatch
-rwxr-xr-x 1 peter users 18921333 27. Mär 17:58 vmlinux-withpatch
Peter
On Sat, 27/03/2011 at 13:33 +0200, Ingo Molnar wrote:
> Just wondering, what's the before/after 'size vmlinux' effect on a 'make
> defconfig' x86 kernel? Does the optimization make the kernel smaller as well,
> besides making it faster?
Thank you for the advice. I hadn't really checked that. So, without my patch:
size vmlinux
text data bss dec hex filename
7915025 1253060 1122304 10290389 9d04d5 vmlinux
And with it:
size vmlinux
text data bss dec hex filename
7919150 1251364 1122304 10292818 9d0e52 vmlinux
The size increased. But I discovered that if I replace "inline" with
"__always_inline" in get_order(), the size is as follows:
size vmlinux
text data bss dec hex filename
7914481 1249252 1122304 10286037 9cf3d5 vmlinux
And this is less than with the same modification in asm-generic:
size vmlinux
text data bss dec hex filename
7914713 1249268 1122304 10286285 9cf4cd vmlinux
With my patch and "__always_inline" instead of plain "inline", the size
is the smallest.
Signed-off-by: Maksym Planeta <[email protected]>
---
arch/x86/include/asm/getorder.h | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/arch/x86/include/asm/getorder.h b/arch/x86/include/asm/getorder.h
index b0c6f57..6220783 100644
--- a/arch/x86/include/asm/getorder.h
+++ b/arch/x86/include/asm/getorder.h
@@ -27,7 +27,7 @@ static __always_inline int __get_order(unsigned long size)
 }
 
 /* Pure 2^n version of get_order */
-static inline __attribute_const__ int get_order(unsigned long size)
+static __always_inline __attribute_const__ int get_order(unsigned long size)
 {
 	int order;
 
--
Thanks,
Maksym Planeta
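A standalone userspace sketch of why forcing the inline matters (not from
the patch; PAGE_SHIFT of 12 is assumed, and the raw gcc attribute is
spelled out instead of the kernel's __always_inline wrapper):
__builtin_constant_p() only sees a constant argument once the call is
actually inlined, so when gcc declines to honor a plain "inline", even
constant sizes may go through an out-of-line copy.

#define PAGE_SHIFT 12

static inline __attribute__((always_inline)) int order_of(unsigned long size)
{
	if (__builtin_constant_p(size)) {
		int order = -1;

		size = (size - 1) >> (PAGE_SHIFT - 1);
		do {
			size >>= 1;
			order++;
		} while (size);
		return order;	/* folds to a constant at compile time */
	}
	return -1;	/* stand-in for the runtime bsr path */
}

/* With the attribute, gcc emits "mov $0x1,%eax" directly at a call
   site like order_of(8192); with plain inline it may instead emit a
   call to an out-of-line clone, losing the constant folding. */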
* Maksym Planeta <[email protected]> wrote:
> On Sat, 27/03/2011 at 13:33 +0200, Ingo Molnar wrote:
> > Just wondering, what's the before/after 'size vmlinux' effect on a 'make
> > defconfig' x86 kernel? Does the optimization make the kernel smaller as well,
> > besides making it faster?
>
> Thank you for the advice. I hadn't really checked that. So, without my patch:
>
> size vmlinux
> text data bss dec hex filename
> 7915025 1253060 1122304 10290389 9d04d5 vmlinux
>
> And with it:
>
> size vmlinux
> text data bss dec hex filename
> 7919150 1251364 1122304 10292818 9d0e52 vmlinux
>
> The size increased. But I discovered that if I replace "inline" with
> "__always_inline" in get_order(), the size is as follows:
>
> size vmlinux
> text data bss dec hex filename
> 7914481 1249252 1122304 10286037 9cf3d5 vmlinux
>
> And this is less than with the same modification in asm-generic:
>
> size vmlinux
> text data bss dec hex filename
> 7914713 1249268 1122304 10286285 9cf4cd vmlinux
>
> With my patch and "__always_inline" instead of plain "inline", the size
> is the smallest.
Weird, that's an unexpected result.
Have you looked at the disassembly, why does the size increase? I'd expect such
a straight assembly optimization to result in smaller code: in the non-constant
case it should be the same size as before, in the constant case it should be
smaller, because BSR should be smaller than an open-coded search loop, right?
One sidenote, defconfig turns these on:
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
CONFIG_OPTIMIZE_INLINING=y
And some versions of GCC aren't very good with these.
Thanks,
Ingo
On Mon, 28/03/2011 at 07:08 +0200, Ingo Molnar wrote:
> Have you looked at the disassembly, why does the size increase? I'd expect such
> a straight assembly optimization to result in smaller code: in the non-constant
> case it should be the same size as before, in the constant case it should be
> smaller, because BSR should be smaller than an open-coded search loop, right?
Here is disassembly of patched get_order() with "inline" from
"kernel/kexec.c":
a6c: 48 8b 5d c8 mov -0x38(%rbp),%rbx
a70: e8 0b fd ff ff callq 780 <get_order.clone.7>
0000000000000780 <get_order.clone.7>:
780: 55 push %rbp
781: b8 01 00 00 00 mov $0x1,%eax
786: 48 89 e5 mov %rsp,%rbp
789: c9 leaveq
78a: c3 retq
My version of gcc is gcc (Debian 4.5.2-4) 4.5.2; I should probably
upgrade my gcc for better inline expansion.
--
Thanks,
Maksym Planeta
On 03/28/2011 12:33 PM, Maksym Planeta wrote:
>
> Here is disassembly of patched get_order() with "inline" from
> "kernel/kexec.c":
>
> a6c: 48 8b 5d c8 mov -0x38(%rbp),%rbx
> a70: e8 0b fd ff ff callq 780 <get_order.clone.7>
>
> 0000000000000780 <get_order.clone.7>:
> 780: 55 push %rbp
> 781: b8 01 00 00 00 mov $0x1,%eax
> 786: 48 89 e5 mov %rsp,%rbp
> 789: c9 leaveq
> 78a: c3 retq
>
> My version of gcc is gcc (Debian 4.5.2-4) 4.5.2; I should probably
> upgrade my gcc for better inline expansion.
>
With what options?
-hpa
On 03/27/2011 01:45 AM, Maksym Planeta wrote:
> On the x86 architecture the get_order() function can be optimized by
> using the bsr assembler instruction.
>
> This is the second version of the patch, in which gcc precomputes the
> result for constant arguments.
>
> Signed-off-by: Maksym Planeta <[email protected]>
gcc 4.x has an intrinsic, __builtin_clz(), which does the opposite of
the bsr instruction; specifically:
__builtin_clz(x) ^ 31
... generates a bsrl instruction if x is variable. This tends to
generate much better code than any assembly hacks.
-hpa
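A minimal sketch of what such a __builtin_clz()-based variant could look
like (an illustration of the suggestion above, not a posted patch; a
64-bit build with PAGE_SHIFT 12 is assumed, using the long variant
__builtin_clzl()):

#define PAGE_SHIFT	12
#define BITS_PER_LONG	64

static inline int get_order_clz(unsigned long size)
{
	size = (size - 1) >> PAGE_SHIFT;
	/* BITS_PER_LONG - 1 - __builtin_clzl(x) is the bit index bsr
	   would return (clz ^ 31 in the 32-bit case); adding 1 turns
	   it into fls(), i.e. the classic get_order() formula.
	   __builtin_clzl() is undefined for 0, hence the guard. */
	return size ? BITS_PER_LONG - __builtin_clzl(size) : 0;
}

For constant arguments gcc should fold the intrinsic at compile time
itself, which would presumably make a separate __builtin_constant_p()
branch unnecessary.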
* H. Peter Anvin <[email protected]> wrote:
> On 03/27/2011 01:45 AM, Maksym Planeta wrote:
> > On the x86 architecture the get_order() function can be optimized by
> > using the bsr assembler instruction.
> >
> > This is the second version of the patch, in which gcc precomputes the
> > result for constant arguments.
> >
> > Signed-off-by: Maksym Planeta <[email protected]>
>
> gcc 4.x has an intrinsic, __builtin_clz(), which does the opposite of
> the bsr instruction; specifically:
>
> __builtin_clz(x) ^ 31
>
> ... generates a bsrl instruction if x is variable. This tends to
> generate much better code than any assembly hacks.
Indeed, that should work better and should be tried - and it can probably
propagate the flags result sensibly (which GCC's asm() cannot, unfortunately).
Thanks,
Ingo
On Mon, 28/03/2011 at 12:44 -0700, H. Peter Anvin wrote:
> With what options?
>
> -hpa
$ gcc -v
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/4.5.2/lto-wrapper
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Debian 4.5.2-4'
--with-bugurl=file:///usr/share/doc/gcc-4.5/README.Bugs
--enable-languages=c,c++,fortran,objc,obj-c++ --prefix=/usr
--program-suffix=-4.5 --enable-shared --enable-multiarch
--enable-linker-build-id --with-system-zlib --libexecdir=/usr/lib
--without-included-gettext --enable-threads=posix
--with-gxx-include-dir=/usr/include/c++/4.5 --libdir=/usr/lib
--enable-nls --enable-clocale=gnu --enable-libstdcxx-debug
--enable-libstdcxx-time=yes --enable-plugin --enable-gold
--enable-ld=default --with-plugin-ld=ld.gold --enable-objc-gc
--with-arch-32=i586 --with-tune=generic --enable-checking=release
--build=x86_64-linux-gnu --host=x86_64-linux-gnu
--target=x86_64-linux-gnu
Thread model: posix
gcc version 4.5.2 (Debian 4.5.2-4)
For compiling the kernel I used: make defconfig && make -j3
--
Thanks,
Maksym Planeta