2013-06-28 10:54:40

by Wedson Almeida Filho

Subject: [PATCH] x86: Use asm-goto to implement mutex fast path on x86-64

The new implementation allows the compiler to better optimize the code; the
original implementation is still used when the kernel is compiled with older
versions of gcc that don't support asm-goto.

Compiling with gcc 4.7.3, the original mutex_lock() is 60 bytes with the fast
path taking 16 instructions; the new mutex_lock() is 42 bytes, with the fast
path taking 12 instructions.

The original mutex_unlock() is 24 bytes with the fast path taking 7
instructions; the new mutex_unlock() is 25 bytes (because the compiler used
a 2-byte ret) with the fast path taking 4 instructions.

The two versions of the functions are included below for reference.

Old:
ffffffff817742a0 <mutex_lock>:
ffffffff817742a0: 55 push %rbp
ffffffff817742a1: 48 89 e5 mov %rsp,%rbp
ffffffff817742a4: 48 83 ec 10 sub $0x10,%rsp
ffffffff817742a8: 48 89 5d f0 mov %rbx,-0x10(%rbp)
ffffffff817742ac: 48 89 fb mov %rdi,%rbx
ffffffff817742af: 4c 89 65 f8 mov %r12,-0x8(%rbp)
ffffffff817742b3: e8 28 15 00 00 callq ffffffff817757e0 <_cond_resched>
ffffffff817742b8: 48 89 df mov %rbx,%rdi
ffffffff817742bb: f0 ff 0f lock decl (%rdi)
ffffffff817742be: 79 05 jns ffffffff817742c5 <mutex_lock+0x25>
ffffffff817742c0: e8 cb 04 00 00 callq ffffffff81774790 <__mutex_lock_slowpath>
ffffffff817742c5: 65 48 8b 04 25 c0 b7 mov %gs:0xb7c0,%rax
ffffffff817742cc: 00 00
ffffffff817742ce: 4c 8b 65 f8 mov -0x8(%rbp),%r12
ffffffff817742d2: 48 89 43 18 mov %rax,0x18(%rbx)
ffffffff817742d6: 48 8b 5d f0 mov -0x10(%rbp),%rbx
ffffffff817742da: c9 leaveq
ffffffff817742db: c3 retq

ffffffff81774250 <mutex_unlock>:
ffffffff81774250: 55 push %rbp
ffffffff81774251: 48 c7 47 18 00 00 00 movq $0x0,0x18(%rdi)
ffffffff81774258: 00
ffffffff81774259: 48 89 e5 mov %rsp,%rbp
ffffffff8177425c: f0 ff 07 lock incl (%rdi)
ffffffff8177425f: 7f 05 jg ffffffff81774266 <mutex_unlock+0x16>
ffffffff81774261: e8 ea 04 00 00 callq ffffffff81774750 <__mutex_unlock_slowpath>
ffffffff81774266: 5d pop %rbp
ffffffff81774267: c3 retq

New:
ffffffff81774920 <mutex_lock>:
ffffffff81774920: 55 push %rbp
ffffffff81774921: 48 89 e5 mov %rsp,%rbp
ffffffff81774924: 53 push %rbx
ffffffff81774925: 48 89 fb mov %rdi,%rbx
ffffffff81774928: e8 a3 0e 00 00 callq ffffffff817757d0 <_cond_resched>
ffffffff8177492d: f0 ff 0b lock decl (%rbx)
ffffffff81774930: 79 08 jns ffffffff8177493a <mutex_lock+0x1a>
ffffffff81774932: 48 89 df mov %rbx,%rdi
ffffffff81774935: e8 16 fe ff ff callq ffffffff81774750 <__mutex_lock_slowpath>
ffffffff8177493a: 65 48 8b 04 25 c0 b7 mov %gs:0xb7c0,%rax
ffffffff81774941: 00 00
ffffffff81774943: 48 89 43 18 mov %rax,0x18(%rbx)
ffffffff81774947: 5b pop %rbx
ffffffff81774948: 5d pop %rbp
ffffffff81774949: c3 retq

ffffffff81774730 <mutex_unlock>:
ffffffff81774730: 48 c7 47 18 00 00 00 movq $0x0,0x18(%rdi)
ffffffff81774737: 00
ffffffff81774738: f0 ff 07 lock incl (%rdi)
ffffffff8177473b: 7f 0a jg ffffffff81774747 <mutex_unlock+0x17>
ffffffff8177473d: 55 push %rbp
ffffffff8177473e: 48 89 e5 mov %rsp,%rbp
ffffffff81774741: e8 aa ff ff ff callq ffffffff817746f0 <__mutex_unlock_slowpath>
ffffffff81774746: 5d pop %rbp
ffffffff81774747: f3 c3 repz retq

Signed-off-by: Wedson Almeida Filho <[email protected]>
---
arch/x86/include/asm/mutex_64.h | 30 ++++++++++++++++++++++++++++++
1 file changed, 30 insertions(+)

diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h
index 68a87b0..ac3517b 100644
--- a/arch/x86/include/asm/mutex_64.h
+++ b/arch/x86/include/asm/mutex_64.h
@@ -16,6 +16,20 @@
*
* Atomically decrements @v and calls <fail_fn> if the result is negative.
*/
+#if defined(CC_HAVE_ASM_GOTO)
+static inline void __mutex_fastpath_lock(atomic_t *v,
+ void (*fail_fn)(atomic_t *))
+{
+ asm volatile goto(LOCK_PREFIX " decl %0\n"
+ " jns %l[exit]\n"
+ : : "m" (v->counter)
+ : "memory", "cc"
+ : exit);
+ fail_fn(v);
+exit:
+ return;
+}
+#else
#define __mutex_fastpath_lock(v, fail_fn) \
do { \
unsigned long dummy; \
@@ -32,6 +46,7 @@ do { \
: "rax", "rsi", "rdx", "rcx", \
"r8", "r9", "r10", "r11", "memory"); \
} while (0)
+#endif

/**
* __mutex_fastpath_lock_retval - try to take the lock by moving the count
@@ -59,6 +74,20 @@ static inline int __mutex_fastpath_lock_retval(atomic_t *count,
*
* Atomically increments @v and calls <fail_fn> if the result is nonpositive.
*/
+#if defined(CC_HAVE_ASM_GOTO)
+static inline void __mutex_fastpath_unlock(atomic_t *v,
+ void (*fail_fn)(atomic_t *))
+{
+ asm volatile goto(LOCK_PREFIX " incl %0\n"
+ " jg %l[exit]\n"
+ : : "m" (v->counter)
+ : "memory", "cc"
+ : exit);
+ fail_fn(v);
+exit:
+ return;
+}
+#else
#define __mutex_fastpath_unlock(v, fail_fn) \
do { \
unsigned long dummy; \
@@ -75,6 +104,7 @@ do { \
: "rax", "rsi", "rdx", "rcx", \
"r8", "r9", "r10", "r11", "memory"); \
} while (0)
+#endif

#define __mutex_slowpath_needs_to_unlock() 1

--
1.7.9.5


2013-06-28 11:19:56

by Ingo Molnar

Subject: Re: [PATCH] x86: Use asm-goto to implement mutex fast path on x86-64


* Wedson Almeida Filho <[email protected]> wrote:

> The new implementation allows the compiler to better optimize the code; the
> original implementation is still used when the kernel is compiled with older
> versions of gcc that don't support asm-goto.
>
> Compiling with gcc 4.7.3, the original mutex_lock() is 60 bytes with the fast
> path taking 16 instructions; the new mutex_lock() is 42 bytes, with the fast
> path taking 12 instructions.
>
> The original mutex_unlock() is 24 bytes with the fast path taking 7
> instructions; the new mutex_unlock() is 25 bytes (because the compiler used
> a 2-byte ret) with the fast path taking 4 instructions.
>
> The two versions of the functions are included below for reference.
>
> Old:
> ffffffff817742a0 <mutex_lock>:
> ffffffff817742a0: 55 push %rbp
> ffffffff817742a1: 48 89 e5 mov %rsp,%rbp
> ffffffff817742a4: 48 83 ec 10 sub $0x10,%rsp
> ffffffff817742a8: 48 89 5d f0 mov %rbx,-0x10(%rbp)
> ffffffff817742ac: 48 89 fb mov %rdi,%rbx
> ffffffff817742af: 4c 89 65 f8 mov %r12,-0x8(%rbp)
> ffffffff817742b3: e8 28 15 00 00 callq ffffffff817757e0 <_cond_resched>
> ffffffff817742b8: 48 89 df mov %rbx,%rdi
> ffffffff817742bb: f0 ff 0f lock decl (%rdi)
> ffffffff817742be: 79 05 jns ffffffff817742c5 <mutex_lock+0x25>
> ffffffff817742c0: e8 cb 04 00 00 callq ffffffff81774790 <__mutex_lock_slowpath>
> ffffffff817742c5: 65 48 8b 04 25 c0 b7 mov %gs:0xb7c0,%rax
> ffffffff817742cc: 00 00
> ffffffff817742ce: 4c 8b 65 f8 mov -0x8(%rbp),%r12
> ffffffff817742d2: 48 89 43 18 mov %rax,0x18(%rbx)
> ffffffff817742d6: 48 8b 5d f0 mov -0x10(%rbp),%rbx
> ffffffff817742da: c9 leaveq
> ffffffff817742db: c3 retq
>
> ffffffff81774250 <mutex_unlock>:
> ffffffff81774250: 55 push %rbp
> ffffffff81774251: 48 c7 47 18 00 00 00 movq $0x0,0x18(%rdi)
> ffffffff81774258: 00
> ffffffff81774259: 48 89 e5 mov %rsp,%rbp
> ffffffff8177425c: f0 ff 07 lock incl (%rdi)
> ffffffff8177425f: 7f 05 jg ffffffff81774266 <mutex_unlock+0x16>
> ffffffff81774261: e8 ea 04 00 00 callq ffffffff81774750 <__mutex_unlock_slowpath>
> ffffffff81774266: 5d pop %rbp
> ffffffff81774267: c3 retq
>
> New:
> ffffffff81774920 <mutex_lock>:
> ffffffff81774920: 55 push %rbp
> ffffffff81774921: 48 89 e5 mov %rsp,%rbp
> ffffffff81774924: 53 push %rbx
> ffffffff81774925: 48 89 fb mov %rdi,%rbx
> ffffffff81774928: e8 a3 0e 00 00 callq ffffffff817757d0 <_cond_resched>
> ffffffff8177492d: f0 ff 0b lock decl (%rbx)
> ffffffff81774930: 79 08 jns ffffffff8177493a <mutex_lock+0x1a>
> ffffffff81774932: 48 89 df mov %rbx,%rdi
> ffffffff81774935: e8 16 fe ff ff callq ffffffff81774750 <__mutex_lock_slowpath>
> ffffffff8177493a: 65 48 8b 04 25 c0 b7 mov %gs:0xb7c0,%rax
> ffffffff81774941: 00 00
> ffffffff81774943: 48 89 43 18 mov %rax,0x18(%rbx)
> ffffffff81774947: 5b pop %rbx
> ffffffff81774948: 5d pop %rbp
> ffffffff81774949: c3 retq
>
> ffffffff81774730 <mutex_unlock>:
> ffffffff81774730: 48 c7 47 18 00 00 00 movq $0x0,0x18(%rdi)
> ffffffff81774737: 00
> ffffffff81774738: f0 ff 07 lock incl (%rdi)
> ffffffff8177473b: 7f 0a jg ffffffff81774747 <mutex_unlock+0x17>
> ffffffff8177473d: 55 push %rbp
> ffffffff8177473e: 48 89 e5 mov %rsp,%rbp
> ffffffff81774741: e8 aa ff ff ff callq ffffffff817746f0 <__mutex_unlock_slowpath>
> ffffffff81774746: 5d pop %rbp
> ffffffff81774747: f3 c3 repz retq

Very nice!

One detail:

> +#if defined(CC_HAVE_ASM_GOTO)
> +#if defined(CC_HAVE_ASM_GOTO)

Please change these to #ifdef.
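
I.e., in diff form, the requested change is just:

-#if defined(CC_HAVE_ASM_GOTO)
+#ifdef CC_HAVE_ASM_GOTO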

Thanks,

Ingo

2013-06-28 14:09:45

by Borislav Petkov

Subject: Re: [PATCH] x86: Use asm-goto to implement mutex fast path on x86-64

On Fri, Jun 28, 2013 at 01:19:48PM +0200, Ingo Molnar wrote:
>
> * Wedson Almeida Filho <[email protected]> wrote:
>
> > The new implementation allows the compiler to better optimize the code; the
> > original implementation is still used when the kernel is compiled with older
> > versions of gcc that don't support asm-goto.
> >
> > Compiling with gcc 4.7.3, the original mutex_lock() is 60 bytes with the fast
> > path taking 16 instructions; the new mutex_lock() is 42 bytes, with the fast
> > path taking 12 instructions.
> >
> > The original mutex_unlock() is 24 bytes with the fast path taking 7
> > instructions; the new mutex_unlock() is 25 bytes (because the compiler used
> > a 2-byte ret) with the fast path taking 4 instructions.
> >
> > The two versions of the functions are included below for reference.

Btw, do we have any perf data showing any improvements from this patch?

[ … ]

> One detail:
>
> > +#if defined(CC_HAVE_ASM_GOTO)
> > +#if defined(CC_HAVE_ASM_GOTO)
>
> Please change these to #ifdef.

Cool - so we have scripts/gcc-goto.sh, which checks for asm goto
support.

Our testing for asm goto otherwise is a bit more, hmm, hands-on in
arch/x86/include/asm/cpufeature.h:

#if __GNUC__ > 4 || __GNUC_MINOR__ >= 5

Maybe I should change that to the more explicit CC_HAVE_ASM_GOTO then.
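
(For the record, the probe just test-compiles a trivial "asm goto"
statement and prints "y" on success. Roughly like this - a sketch from
memory, not quoted verbatim:

#!/bin/sh
# Probe the compiler ($@) for "asm goto" support; kbuild adds
# -DCC_HAVE_ASM_GOTO to KBUILD_CFLAGS when this prints "y".
echo "int main(void) { entry: asm goto(\"\" : : : : entry); return 0; }" \
	| $@ -x c - -c -o /dev/null >/dev/null 2>&1 && echo "y"

so the define tracks what the compiler can actually do rather than its
version number.)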

--
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

2013-06-28 14:12:59

by H. Peter Anvin

Subject: Re: [PATCH] x86: Use asm-goto to implement mutex fast path on x86-64

On 06/28/2013 07:09 AM, Borislav Petkov wrote:
>
> Our testing for asm goto otherwise is a bit more, hmm, hands-on in
> arch/x86/include/asm/cpufeature.h:
>
> #if __GNUC__ > 4 || __GNUC_MINOR__ >= 5
>
> Maybe I should change that to the more explicit CC_HAVE_ASM_GOTO then.
>

We should... we didn't have the asm goto test at the time that code was
written, but I think the tracing guys added it either because some
distros backported asm goto or to deal with it on non-x86 architectures.

-hpa

2013-06-28 15:15:47

by Peter Zijlstra

Subject: Re: [PATCH] x86: Use asm-goto to implement mutex fast path on x86-64

On Fri, Jun 28, 2013 at 07:12:18AM -0700, H. Peter Anvin wrote:
> On 06/28/2013 07:09 AM, Borislav Petkov wrote:
> >
> > Our testing for asm goto otherwise is a bit more, hmm, hands-on in
> > arch/x86/include/asm/cpufeature.h:
> >
> > #if __GNUC__ > 4 || __GNUC_MINOR__ >= 5
> >
> > Maybe I should change that to the more explicit CC_HAVE_ASM_GOTO then.
> >
>
> We should... we didn't have the asm goto test at the time that code was
> written, but I think the tracing guys added it either because some
> distros backported asm goto or to deal with it on non-x86 architectures.

gcc backport in distro iirc

2013-06-28 16:41:48

by Borislav Petkov

Subject: [PATCH] x86, cpufeature: Use new CC_HAVE_ASM_GOTO

From: Borislav Petkov <[email protected]>

... for checking for "asm goto" compiler support. It is more explicit
this way and we cover the cases where distros have backported that
support even to gcc versions < 4.5.

Signed-off-by: Borislav Petkov <[email protected]>
---
arch/x86/include/asm/cpufeature.h | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 47538a61c91b..d3f5c63078d8 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -366,9 +366,10 @@ extern bool __static_cpu_has_safe(u16 bit);
*/
static __always_inline __pure bool __static_cpu_has(u16 bit)
{
-#if __GNUC__ > 4 || __GNUC_MINOR__ >= 5
+#ifdef CC_HAVE_ASM_GOTO

#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS
+
/*
* Catch too early usage of this before alternatives
* have run.
@@ -384,6 +385,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
".previous\n"
/* skipping size check since replacement size = 0 */
: : "i" (X86_FEATURE_ALWAYS) : : t_warn);
+
#endif

asm goto("1: jmp %l[t_no]\n"
@@ -406,7 +408,9 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
warn_pre_alternatives();
return false;
#endif
-#else /* GCC_VERSION >= 40500 */
+
+#else /* CC_HAVE_ASM_GOTO */
+
u8 flag;
/* Open-coded due to __stringify() in ALTERNATIVE() */
asm volatile("1: movb $0,%0\n"
@@ -427,7 +431,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
".previous\n"
: "=qm" (flag) : "i" (bit));
return flag;
-#endif
+
+#endif /* CC_HAVE_ASM_GOTO */
}

#define static_cpu_has(bit) \
@@ -441,7 +446,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)

static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
{
-#if __GNUC__ > 4 || __GNUC_MINOR__ >= 5
+#ifdef CC_HAVE_ASM_GOTO
/*
* We need to spell the jumps to the compiler because, depending on the offset,
* the replacement jump can be bigger than the original jump, and this we cannot
@@ -475,7 +480,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
return false;
t_dynamic:
return __static_cpu_has_safe(bit);
-#else /* GCC_VERSION >= 40500 */
+#else
u8 flag;
/* Open-coded due to __stringify() in ALTERNATIVE() */
asm volatile("1: movb $2,%0\n"
@@ -511,7 +516,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
: "=qm" (flag)
: "i" (bit), "i" (X86_FEATURE_ALWAYS));
return (flag == 2 ? __static_cpu_has_safe(bit) : flag);
-#endif
+#endif /* CC_HAVE_ASM_GOTO */
}

#define static_cpu_has_safe(bit) \
--
1.8.3

2013-06-29 23:56:32

by Wedson Almeida Filho

Subject: Re: [PATCH] x86: Use asm-goto to implement mutex fast path on x86-64

On Fri, Jun 28, 2013 at 7:09 AM, Borislav Petkov <[email protected]> wrote:

> Btw, do we have any perf data showing any improvements from this patch?

I wrote a simple test that measures the time it takes to acquire and
release an uncontended mutex (i.e., we always take the fast path)
100k times. I ran it a few times; the original code averages
2.743436ms and the new code averages 2.101098ms, so it's about a 23% improvement.
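
A minimal sketch of the test, reconstructed from the description above
(not the original source; all names are made up), as a throwaway module
timing 100k uncontended lock/unlock pairs:

#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/ktime.h>

static DEFINE_MUTEX(bench_mutex);

static int __init bench_init(void)
{
	ktime_t t0, t1;
	int i;

	t0 = ktime_get();
	for (i = 0; i < 100000; i++) {
		/* never contended, so both calls stay on the fast path */
		mutex_lock(&bench_mutex);
		mutex_unlock(&bench_mutex);
	}
	t1 = ktime_get();

	pr_info("100k lock/unlock pairs: %lld ns\n",
		ktime_to_ns(ktime_sub(t1, t0)));
	return 0;
}

static void __exit bench_exit(void)
{
}

module_init(bench_init);
module_exit(bench_exit);
MODULE_LICENSE("GPL");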

I also think the code looks cleaner this way.

2013-06-30 22:00:14

by Borislav Petkov

Subject: Re: [PATCH] x86: Use asm-goto to implement mutex fast path on x86-64

On Sat, Jun 29, 2013 at 04:56:30PM -0700, Wedson Almeida Filho wrote:
> On Fri, Jun 28, 2013 at 7:09 AM, Borislav Petkov <[email protected]> wrote:
>
> > Btw, do we have any perf data showing any improvements from this patch?
>
> I wrote a simple test that measures the time it takes to acquire and
> release an uncontended mutex (i.e., we always take the fast path)
> 100k times. I ran it a few times; the original code averages
> 2.743436ms and the new code averages 2.101098ms, so it's about a 23% improvement.

Microbenchmark results tend to be misleading in such situations. Rather,
it would be much closer to reality if you traced a real workload like a
simple kernel build, for example, with and without your patch.

I.e., something like

perf stat --repeat 5 ./build-kernel.sh

and take a look at what the perfcounters are saying in both cases.

> I also think the code looks cleaner this way.

No doubt.

--
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

2013-07-01 07:50:51

by Ingo Molnar

Subject: Re: [PATCH] x86: Use asm-goto to implement mutex fast path on x86-64


* Borislav Petkov <[email protected]> wrote:

> On Sat, Jun 29, 2013 at 04:56:30PM -0700, Wedson Almeida Filho wrote:
> > On Fri, Jun 28, 2013 at 7:09 AM, Borislav Petkov <[email protected]> wrote:
> >
> > > Btw, do we have any perf data showing any improvements from this patch?
> >
> > I wrote a simple test that measures the time it takes to acquire and
> > release an uncontended mutex (i.e., we always take the fast path)
> > 100k times. I ran it a few times; the original code averages
> > 2.743436ms and the new code averages 2.101098ms, so it's about a 23% improvement.
>
> Microbenchmark results tend to be misleading in such situations. Rather,
> it would be much closer to reality if you traced a real workload like a
> simple kernel build, for example, with and without your patch.

Not sure - the main thing we want to know is whether it gets faster. The
_amount_ will depend on things like precise usage patterns, caching, etc.
- but rarely does a real workload turn a win like this into a loss.

> I.e., something like
>
> perf stat --repeat 5 ./build-kernel.sh
>
> and take a look at what the perfcounters are saying in both cases.

Hm, the noise of such a workload will very likely drown out improvements
that are in the cycle scale.

Thanks,

Ingo

2013-07-01 10:23:15

by Borislav Petkov

Subject: Re: [PATCH] x86: Use asm-goto to implement mutex fast path on x86-64

On Mon, Jul 01, 2013 at 09:50:46AM +0200, Ingo Molnar wrote:
> Not sure - the main thing we want to know is whether it gets faster.
> The _amount_ will depend on things like precise usage patterns,
> caching, etc. - but rarely does a real workload turn a win like this
> into a loss.

Yep, and it does get faster by a whopping 6 seconds!

Almost all standard counters go down a bit.

Interestingly, branch misses get a slight increase and the asm goto
thing does actually jump to the fail_fn from within the asm so maybe
this could puzzle the branch predictor a bit. Although the instructions
look the same and jumps are both forward.

Oh well, we don't know where those additional misses happened so it
could be somewhere else entirely, or it is simply noise.

In any case, we're getting faster, so not worth investigating I guess.


plain 3.10
==========

Performance counter stats for '../build-kernel.sh' (5 runs):

1312558.712266 task-clock # 5.961 CPUs utilized ( +- 0.02% )
1,036,629 context-switches # 0.790 K/sec ( +- 0.24% )
55,118 cpu-migrations # 0.042 K/sec ( +- 0.25% )
46,505,184 page-faults # 0.035 M/sec ( +- 0.00% )
4,768,420,289,997 cycles # 3.633 GHz ( +- 0.02% ) [83.79%]
3,424,161,066,397 stalled-cycles-frontend # 71.81% frontend cycles idle ( +- 0.02% ) [83.78%]
2,483,143,574,419 stalled-cycles-backend # 52.07% backend cycles idle ( +- 0.04% ) [67.40%]
3,091,612,061,933 instructions # 0.65 insns per cycle
# 1.11 stalled cycles per insn ( +- 0.01% ) [83.93%]
677,787,215,988 branches # 516.386 M/sec ( +- 0.01% ) [83.77%]
25,438,736,368 branch-misses # 3.75% of all branches ( +- 0.02% ) [83.78%]

220.191740778 seconds time elapsed ( +- 0.32% )

+ patch
========

Performance counter stats for '../build-kernel.sh' (5 runs):

1309995.427337 task-clock # 6.106 CPUs utilized ( +- 0.09% )
1,033,446 context-switches # 0.789 K/sec ( +- 0.23% )
55,228 cpu-migrations # 0.042 K/sec ( +- 0.28% )
46,484,992 page-faults # 0.035 M/sec ( +- 0.00% )
4,759,631,961,013 cycles # 3.633 GHz ( +- 0.09% ) [83.78%]
3,415,933,806,156 stalled-cycles-frontend # 71.77% frontend cycles idle ( +- 0.12% ) [83.78%]
2,476,066,765,933 stalled-cycles-backend # 52.02% backend cycles idle ( +- 0.10% ) [67.38%]
3,089,317,073,397 instructions # 0.65 insns per cycle
# 1.11 stalled cycles per insn ( +- 0.02% ) [83.95%]
677,623,252,827 branches # 517.271 M/sec ( +- 0.01% ) [83.79%]
25,444,376,740 branch-misses # 3.75% of all branches ( +- 0.02% ) [83.79%]

214.533868029 seconds time elapsed ( +- 0.36% )

--
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

2013-07-01 11:11:28

by Ingo Molnar

Subject: Re: [PATCH] x86: Use asm-goto to implement mutex fast path on x86-64


* Borislav Petkov <[email protected]> wrote:

> On Mon, Jul 01, 2013 at 09:50:46AM +0200, Ingo Molnar wrote:
> > Not sure - the main thing we want to know is whether it gets faster.
> > The _amount_ will depend on things like precise usage patterns,
> > caching, etc. - but rarely does a real workload turn a win like this
> > into a loss.
>
> Yep, and it does get faster by a whopping 6 seconds!
>
> Almost all standard counters go down a bit.
>
> Interestingly, branch misses get a slight increase and the asm goto
> thing does actually jump to the fail_fn from within the asm so maybe
> this could puzzle the branch predictor a bit. Although the instructions
> look the same and jumps are both forward.
>
> Oh well, we don't know where those additional misses happened so it
> could be somewhere else entirely, or it is simply noise.
>
> In any case, we're getting faster, so not worth investigating I guess.
>
>
> plain 3.10
> ==========
>
> Performance counter stats for '../build-kernel.sh' (5 runs):
>
> 1312558.712266 task-clock # 5.961 CPUs utilized ( +- 0.02% )
> 1,036,629 context-switches # 0.790 K/sec ( +- 0.24% )
> 55,118 cpu-migrations # 0.042 K/sec ( +- 0.25% )
> 46,505,184 page-faults # 0.035 M/sec ( +- 0.00% )
> 4,768,420,289,997 cycles # 3.633 GHz ( +- 0.02% ) [83.79%]
> 3,424,161,066,397 stalled-cycles-frontend # 71.81% frontend cycles idle ( +- 0.02% ) [83.78%]
> 2,483,143,574,419 stalled-cycles-backend # 52.07% backend cycles idle ( +- 0.04% ) [67.40%]
> 3,091,612,061,933 instructions # 0.65 insns per cycle
> # 1.11 stalled cycles per insn ( +- 0.01% ) [83.93%]
> 677,787,215,988 branches # 516.386 M/sec ( +- 0.01% ) [83.77%]
> 25,438,736,368 branch-misses # 3.75% of all branches ( +- 0.02% ) [83.78%]
>
> 220.191740778 seconds time elapsed ( +- 0.32% )
>
> + patch
> ========
>
> Performance counter stats for '../build-kernel.sh' (5 runs):
>
> 1309995.427337 task-clock # 6.106 CPUs utilized ( +- 0.09% )
> 1,033,446 context-switches # 0.789 K/sec ( +- 0.23% )
> 55,228 cpu-migrations # 0.042 K/sec ( +- 0.28% )
> 46,484,992 page-faults # 0.035 M/sec ( +- 0.00% )
> 4,759,631,961,013 cycles # 3.633 GHz ( +- 0.09% ) [83.78%]
> 3,415,933,806,156 stalled-cycles-frontend # 71.77% frontend cycles idle ( +- 0.12% ) [83.78%]
> 2,476,066,765,933 stalled-cycles-backend # 52.02% backend cycles idle ( +- 0.10% ) [67.38%]
> 3,089,317,073,397 instructions # 0.65 insns per cycle
> # 1.11 stalled cycles per insn ( +- 0.02% ) [83.95%]
> 677,623,252,827 branches # 517.271 M/sec ( +- 0.01% ) [83.79%]
> 25,444,376,740 branch-misses # 3.75% of all branches ( +- 0.02% ) [83.79%]
>
> 214.533868029 seconds time elapsed ( +- 0.36% )

Hm, a 6 seconds win looks _way_ too much - we don't execute that much
mutex code, let alone a portion of it.

This could perhaps be a bootup-to-bootup cache layout systematic jitter
artifact, which isn't captured by stddev observations?

Doing something like this with a relatively fresh version of perf:

perf stat --repeat 10 -a --sync \
--pre 'make -s O=defconfig-build/ clean; echo 1 > /proc/sys/vm/drop_caches' \
make -s -j64 O=defconfig-build/ bzImage

... might do the trick (untested!). (Also note the use of -a: this should
run on an otherwise quiescent system.)

As a sidenote, we could add this as a convenience feature, triggered via:

perf stat --flush-vm-caches

... or so, in addition to the already existing --sync option.

Thanks,

Ingo

2013-07-01 12:30:05

by Borislav Petkov

Subject: Re: [PATCH] x86: Use asm-goto to implement mutex fast path on x86-64

On Mon, Jul 01, 2013 at 01:11:22PM +0200, Ingo Molnar wrote:
> Hm, a 6 seconds win looks _way_ too much - we don't execute that much
> mutex code, let alone a portion of it.
>
> This could perhaps be a bootup-to-bootup cache layout systematic jitter
> artifact, which isn't captured by stddev observations?
>
> Doing something like this with a relatively fresh version of perf:
>
> perf stat --repeat 10 -a --sync \
> --pre 'make -s O=defconfig-build/ clean; echo 1 > /proc/sys/vm/drop_caches' \
> make -s -j64 O=defconfig-build/ bzImage
>
> ... might do the trick (untested!). (Also note the use of -a: this should
> run on an otherwise quiescent system.)

Yep, I didn't run -a since I wanted to trace only the build process.
Btw, the build-kernel.sh script looks like this:

#!/bin/bash

NUM_CPUS=$(cat /proc/cpuinfo | grep processor | wc -l)
MAKE_OPTS=-j$(($NUM_CPUS+1))

echo 3 > /proc/sys/vm/drop_caches
make $MAKE_OPTS mrproper
make $MAKE_OPTS oldconfig
make $MAKE_OPTS
<EOF>

Let me try your perf tracing variant.

> As a sidenote, we could add this as a convenience feature, triggered via:
>
> perf stat --flush-vm-caches
>
> ... or so, in addition to the already existing --sync option.

Is this something which we want to use a lot? Also, there's 1, 2 and 3
as arg to drop_caches:

drop_caches

Writing to this will cause the kernel to drop clean caches, dentries and
inodes from memory, causing that memory to become free.

To free pagecache:
echo 1 > /proc/sys/vm/drop_caches
To free dentries and inodes:
echo 2 > /proc/sys/vm/drop_caches
To free pagecache, dentries and inodes:
echo 3 > /proc/sys/vm/drop_caches

--
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

2013-07-01 12:50:51

by Ingo Molnar

Subject: Re: [PATCH] x86: Use asm-goto to implement mutex fast path on x86-64


* Borislav Petkov <[email protected]> wrote:

> On Mon, Jul 01, 2013 at 01:11:22PM +0200, Ingo Molnar wrote:
> > Hm, a 6 seconds win looks _way_ too much - we don't execute that much
> > mutex code, let alone a portion of it.
> >
> > This could perhaps be a bootup-to-bootup cache layout systematic jitter
> > artifact, which isn't captured by stddev observations?
> >
> > Doing something like this with a relatively fresh version of perf:
> >
> > perf stat --repeat 10 -a --sync \
> > --pre 'make -s O=defconfig-build/ clean; echo 1 > /proc/sys/vm/drop_caches' \
> > make -s -j64 O=defconfig-build/ bzImage
> >
> > ... might do the trick (untested!). (Also note the use of -a: this should
> > run on an otherwise quiescent system.)
>
> Yep, I didn't run -a since I wanted to trace only the build process.
> Btw, the build-kernel.sh script looks like this:
>
> #!/bin/bash
>
> NUM_CPUS=$(cat /proc/cpuinfo | grep processor | wc -l)
> MAKE_OPTS=-j$(($NUM_CPUS+1))
>
> echo 3 > /proc/sys/vm/drop_caches
> make $MAKE_OPTS mrproper
> make $MAKE_OPTS oldconfig
> make $MAKE_OPTS
> <EOF>
>
> Let me try your perf tracing variant.
>
> > As a sidenote, we could add this as a convenience feature, triggered via:
> >
> > perf stat --flush-vm-caches
> >
> > ... or so, in addition to the already existing --sync option.
>
> Is this something which we want to use a lot? [...]

For cache-cold measurements I'm sure it's handy.

> [...] Also, there's 1, 2 and 3 as arg to drop_caches:
>
> drop_caches
>
> Writing to this will cause the kernel to drop clean caches, dentries and
> inodes from memory, causing that memory to become free.
>
> To free pagecache:
> echo 1 > /proc/sys/vm/drop_caches
> To free dentries and inodes:
> echo 2 > /proc/sys/vm/drop_caches
> To free pagecache, dentries and inodes:
> echo 3 > /proc/sys/vm/drop_caches

Yeah, so it would have to be a --vm-drop-caches <N> option I guess.

Thanks,

Ingo

2013-07-01 14:33:08

by H. Peter Anvin

Subject: Re: [PATCH] x86: Use asm-goto to implement mutex fast path on x86-64

Unconditional branches don't need prediction. The branch predictor is used for conditional branches and in some hardware designs for indirect branches. Unconditional direct branches never go through the branch predictor simply because the front end can know with 100% certainty where the flow of control will be.

Borislav Petkov <[email protected]> wrote:

>On Mon, Jul 01, 2013 at 09:50:46AM +0200, Ingo Molnar wrote:
>> Not sure - the main thing we want to know is whether it gets faster.
>> The _amount_ will depend on things like precise usage patterns,
>> caching, etc. - but rarely does a real workload turn a win like this
>> into a loss.
>
>Yep, and it does get faster by a whopping 6 seconds!
>
>Almost all standard counters go down a bit.
>
>Interestingly, branch misses get a slight increase and the asm goto
>thing does actually jump to the fail_fn from within the asm so maybe
>this could puzzle the branch predictor a bit. Although the instructions
>look the same and jumps are both forward.
>
>Oh well, we don't know where those additional misses happened so it
>could be somewhere else entirely, or it is simply noise.
>
>In any case, we're getting faster, so not worth investigating I guess.
>
>
>plain 3.10
>==========
>
> Performance counter stats for '../build-kernel.sh' (5 runs):
>
>1312558.712266 task-clock # 5.961 CPUs utilized ( +- 0.02% )
>1,036,629 context-switches # 0.790 K/sec ( +- 0.24% )
>55,118 cpu-migrations # 0.042 K/sec ( +- 0.25% )
>46,505,184 page-faults # 0.035 M/sec ( +- 0.00% )
>4,768,420,289,997 cycles # 3.633 GHz ( +- 0.02% ) [83.79%]
>3,424,161,066,397 stalled-cycles-frontend # 71.81% frontend cycles idle ( +- 0.02% ) [83.78%]
>2,483,143,574,419 stalled-cycles-backend # 52.07% backend cycles idle ( +- 0.04% ) [67.40%]
> 3,091,612,061,933 instructions # 0.65 insns per cycle
> # 1.11 stalled cycles per insn ( +- 0.01% ) [83.93%]
>677,787,215,988 branches # 516.386 M/sec ( +- 0.01% ) [83.77%]
>25,438,736,368 branch-misses # 3.75% of all branches ( +- 0.02% ) [83.78%]
>
>220.191740778 seconds time elapsed ( +- 0.32% )
>
> + patch
>========
>
> Performance counter stats for '../build-kernel.sh' (5 runs):
>
>1309995.427337 task-clock # 6.106 CPUs utilized ( +- 0.09% )
>1,033,446 context-switches # 0.789 K/sec ( +- 0.23% )
>55,228 cpu-migrations # 0.042 K/sec ( +- 0.28% )
>46,484,992 page-faults # 0.035 M/sec ( +- 0.00% )
>4,759,631,961,013 cycles # 3.633 GHz ( +- 0.09% ) [83.78%]
>3,415,933,806,156 stalled-cycles-frontend # 71.77% frontend cycles idle ( +- 0.12% ) [83.78%]
>2,476,066,765,933 stalled-cycles-backend # 52.02% backend cycles idle ( +- 0.10% ) [67.38%]
> 3,089,317,073,397 instructions # 0.65 insns per cycle
> # 1.11 stalled cycles per insn ( +- 0.02% ) [83.95%]
>677,623,252,827 branches # 517.271 M/sec ( +- 0.01% ) [83.79%]
>25,444,376,740 branch-misses # 3.75% of all branches ( +- 0.02% ) [83.79%]
>
>214.533868029 seconds time elapsed ( +- 0.36% )

--
Sent from my mobile phone. Please excuse brevity and lack of formatting.

2013-07-01 14:36:59

by Borislav Petkov

Subject: Re: [PATCH] x86: Use asm-goto to implement mutex fast path on x86-64

On Mon, Jul 01, 2013 at 07:30:14AM -0700, H. Peter Anvin wrote:
> Unconditional branches don't need prediction. The branch predictor
> is used for conditional branches and in some hardware designs for
> indirect branches. Unconditional direct branches never go through
> the branch predictor simply because the front end can know with 100%
> certainty where the flow of control will be.

I meant the conditional Jcc branches:

ffffffff81774920 <mutex_lock>:

...

ffffffff8177492d: f0 ff 0b lock decl (%rbx)
ffffffff81774930: 79 08 jns ffffffff8177493a <mutex_lock+0x1a>



ffffffff81774730 <mutex_unlock>:

...

ffffffff81774738: f0 ff 07 lock incl (%rdi)
ffffffff8177473b: 7f 0a jg ffffffff81774747 <mutex_unlock+0x17>

--
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

2013-07-01 14:46:40

by H. Peter Anvin

Subject: Re: [PATCH] x86: Use asm-goto to implement mutex fast path on x86-64

Right... brain not awake yet, sorry, and responding from my phone.

I am not too worried about trading off a bit of additional branch mispredicts if everything else wins, but Ingo does have a valid question about whether we are measuring the right thing.

The effect is definitely big enough that it would be good to understand it one way or the other.

And regardless of the source... nice job!

Borislav Petkov <[email protected]> wrote:

>On Mon, Jul 01, 2013 at 07:30:14AM -0700, H. Peter Anvin wrote:
>> Unconditional branches don't need prediction. The branch predictor
>> is used for conditional branches and in some hardware designs for
>> indirect branches. Unconditional direct branches never go through
>> the branch predictor simply because the front end can know with 100%
>> certainty where the flow of control will be.
>
>I meant the conditional Jcc branches:
>
>ffffffff81774920 <mutex_lock>:
>
>...
>
>ffffffff8177492d: f0 ff 0b lock decl (%rbx)
>ffffffff81774930: 79 08 jns ffffffff8177493a <mutex_lock+0x1a>
>
>
>
>ffffffff81774730 <mutex_unlock>:
>
>...
>
>ffffffff81774738: f0 ff 07 lock incl (%rdi)
>ffffffff8177473b: 7f 0a jg ffffffff81774747 <mutex_unlock+0x17>

--
Sent from my mobile phone. Please excuse brevity and lack of formatting.

2013-07-01 14:49:00

by Borislav Petkov

Subject: Re: [PATCH] x86: Use asm-goto to implement mutex fast path on x86-64

On Mon, Jul 01, 2013 at 02:50:46PM +0200, Ingo Molnar wrote:
> > Yep, I didn't run -a since I wanted to trace only the build process.
> > Btw, the build-kernel.sh script looks like this:
> >
> > #!/bin/bash
> >
> > NUM_CPUS=$(cat /proc/cpuinfo | grep processor | wc -l)
> > MAKE_OPTS=-j$(($NUM_CPUS+1))
> >
> > echo 3 > /proc/sys/vm/drop_caches
> > make $MAKE_OPTS mrproper
> > make $MAKE_OPTS oldconfig
> > make $MAKE_OPTS
> > <EOF>
> >
> > Let me try your perf tracing variant.

Ok, I removed the output directory from your trace command because it
was complaining about a missing .config even though I put one both in
the source tree and in defconfig-build/, so I ended up with:

perf stat --repeat 10 -a --sync --pre 'make -s clean; echo 1 > /proc/sys/vm/drop_caches' make -s -j64 bzImage

which should be basically the same.

And yes, this way we don't see the speedup - numbers are almost the
same. Now on to find out why I see a speedup with my way of running
the trace.

plain 3.10
==========

Performance counter stats for 'make -s -j64 bzImage' (10 runs):

960155.777354 task-clock # 7.996 CPUs utilized ( +- 0.13% ) [100.00%]
600,993 context-switches # 0.626 K/sec ( +- 0.29% ) [100.00%]
32,764 cpu-migrations # 0.034 K/sec ( +- 0.40% ) [100.00%]
25,449,932 page-faults # 0.027 M/sec ( +- 0.00% )
3,146,429,834,505 cycles # 3.277 GHz ( +- 0.12% ) [83.27%]
2,402,804,186,892 stalled-cycles-frontend # 76.37% frontend cycles idle ( +- 0.10% ) [83.30%]
1,844,806,444,182 stalled-cycles-backend # 58.63% backend cycles idle ( +- 0.16% ) [66.61%]
1,801,184,009,281 instructions # 0.57 insns per cycle
# 1.33 stalled cycles per insn ( +- 0.15% ) [83.27%]
402,482,696,262 branches # 419.185 M/sec ( +- 0.04% ) [83.58%]
17,550,736,725 branch-misses # 4.36% of all branches ( +- 0.09% ) [83.25%]

120.072676076 seconds time elapsed ( +- 0.13% )

+ patch
=======

Performance counter stats for 'make -s -j64 bzImage' (10 runs):

960212.466648 task-clock # 7.996 CPUs utilized ( +- 0.11% ) [100.00%]
600,078 context-switches # 0.625 K/sec ( +- 0.23% ) [100.00%]
32,708 cpu-migrations # 0.034 K/sec ( +- 0.51% ) [100.00%]
25,450,046 page-faults # 0.027 M/sec ( +- 0.00% )
3,141,378,247,404 cycles # 3.272 GHz ( +- 0.09% ) [83.32%]
2,398,997,896,542 stalled-cycles-frontend # 76.37% frontend cycles idle ( +- 0.16% ) [83.39%]
1,841,987,157,784 stalled-cycles-backend # 58.64% backend cycles idle ( +- 0.19% ) [66.70%]
1,798,363,791,924 instructions # 0.57 insns per cycle
# 1.33 stalled cycles per insn ( +- 0.12% ) [83.39%]
403,257,285,840 branches # 419.967 M/sec ( +- 0.07% ) [83.39%]
17,552,193,349 branch-misses # 4.35% of all branches ( +- 0.07% ) [83.20%]

120.079804432 seconds time elapsed ( +- 0.11% )

--
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

2013-07-01 14:50:59

by Borislav Petkov

Subject: Re: [PATCH] x86: Use asm-goto to implement mutex fast path on x86-64

On Mon, Jul 01, 2013 at 07:45:41AM -0700, H. Peter Anvin wrote:
> Right... brain not awake yet, sorry, and responding from my phone.
>
> I am not too worried about trading off a bit of additional branch mispredicts if everything else wins, but Ingo does have a valid question about whether we are measuring the right thing.
>
> The effect is definitely big enough that it would be good to understand it one way or the other.
>
> And regardless of the source... nice job!

Yeah, see my other reply. Ingo's way of tracing this doesn't show any
speedup. So now I need to find out why I see the speedup the way I
run it.

Keep you posted.

--
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

2013-07-01 22:28:12

by Borislav Petkov

Subject: Re: [PATCH] x86: Use asm-goto to implement mutex fast path on x86-64

On Mon, Jul 01, 2013 at 04:48:51PM +0200, Borislav Petkov wrote:
> And yes, this way we don't see the speedup - numbers are almost the
> same. Now on to find out why I see a speedup with my way of running
> the trace.

Ok, I think I know what happens:

When I do:

perf stat --repeat 10 -a --sync --pre 'make -s clean; echo 1 > /proc/sys/vm/drop_caches' make -s -j64 bzImage

I get:

Performance counter stats for 'make -s -j64 bzImage' (10 runs):

961485.910628 task-clock # 7.996 CPUs utilized ( +- 0.13% ) [100.00%]
603,572 context-switches # 0.628 K/sec ( +- 0.30% ) [100.00%]
33,044 cpu-migrations # 0.034 K/sec ( +- 0.42% ) [100.00%]
25,450,364 page-faults # 0.026 M/sec ( +- 0.00% )
3,143,626,158,370 cycles # 3.270 GHz ( +- 0.12% ) [83.38%]
2,405,039,723,306 stalled-cycles-frontend # 76.51% frontend cycles idle ( +- 0.09% ) [83.25%]
1,844,508,780,556 stalled-cycles-backend # 58.67% backend cycles idle ( +- 0.19% ) [66.75%]
1,799,457,879,494 instructions # 0.57 insns per cycle
# 1.34 stalled cycles per insn ( +- 0.15% ) [83.36%]
403,458,465,170 branches # 419.620 M/sec ( +- 0.06% ) [83.38%]
17,545,329,408 branch-misses # 4.35% of all branches ( +- 0.11% ) [83.25%]

120.239128672 seconds time elapsed ( +- 0.13% )


VS when I do

perf stat --repeat 10 -a --sync ../build-kernel.sh

where the script contains the same commands:

$ cat ../build-kernel.sh
#!/bin/bash

make -s clean
echo 1 > /proc/sys/vm/drop_caches
make -s -j64 bzImage
$

I get:

Performance counter stats for '../build-kernel.sh' (10 runs):

1032358.179282 task-clock # 7.996 CPUs utilized ( +- 0.09% ) [100.00%]
635,967 context-switches # 0.616 K/sec ( +- 0.15% ) [100.00%]
37,220 cpu-migrations # 0.036 K/sec ( +- 0.27% ) [100.00%]
26,005,286 page-faults # 0.025 M/sec ( +- 0.00% )
3,164,022,396,373 cycles # 3.065 GHz ( +- 0.10% ) [83.37%]
2,434,722,583,577 stalled-cycles-frontend # 76.95% frontend cycles idle ( +- 0.11% ) [83.34%]
1,865,760,946,076 stalled-cycles-backend # 58.97% backend cycles idle ( +- 0.18% ) [66.76%]
1,810,237,888,844 instructions # 0.57 insns per cycle
# 1.34 stalled cycles per insn ( +- 0.10% ) [83.40%]
406,259,324,254 branches # 393.526 M/sec ( +- 0.12% ) [83.32%]
17,610,395,405 branch-misses # 4.33% of all branches ( +- 0.09% ) [83.21%]

129.102139999 seconds time elapsed ( +- 0.09% )

The difference is that in the second case we're also tracing these two:

make -s clean
echo 1 > /proc/sys/vm/drop_caches

which could be responsible for the variance in timings. I'll run those
tomorrow to confirm.

--
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

2013-07-01 22:35:49

by Wedson Almeida Filho

Subject: Re: [PATCH] x86: Use asm-goto to implement mutex fast path on x86-64

On Mon, Jul 1, 2013 at 3:28 PM, Borislav Petkov <[email protected]> wrote:
>
> perf stat --repeat 10 -a --sync --pre 'make -s clean; echo 1 > /proc/sys/vm/drop_caches' make -s -j64 bzImage

How many CPUs do you have in your system? Maybe -j64 vs -jNUM_CPUs
affects your measurements as well.

2013-07-01 22:44:31

by Borislav Petkov

Subject: Re: [PATCH] x86: Use asm-goto to implement mutex fast path on x86-64

On Mon, Jul 01, 2013 at 03:35:47PM -0700, Wedson Almeida Filho wrote:
> On Mon, Jul 1, 2013 at 3:28 PM, Borislav Petkov <[email protected]> wrote:
> >
> > perf stat --repeat 10 -a --sync --pre 'make -s clean; echo 1 > /proc/sys/vm/drop_caches' make -s -j64 bzImage
>
> How many CPUs do you have in your system? Maybe -j64 vs -jNUM_CPUs
> affects your measurements as well.

8. But that shouldn't matter since I made the non-differing measurements two
mails back with -j64.

Also -j9, i.e. -j$(($NUM_CPUS+1)) gives "121.613217871 seconds time
elapsed" because with -j9 the probability of some core not executing a
make thread for whatever reason is higher than with -j64. But the penalty
is only about an additional 1s with this workload.

I think with -j64 Ingo meant to saturate the scheduler, making sure
there are always more runnable threads than available cores so that we
can maximize core utilization with threads running our workload.

--
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

2013-07-02 06:39:18

by Ingo Molnar

Subject: Re: [PATCH] x86: Use asm-goto to implement mutex fast path on x86-64


* Borislav Petkov <[email protected]> wrote:

> On Mon, Jul 01, 2013 at 03:35:47PM -0700, Wedson Almeida Filho wrote:
> > On Mon, Jul 1, 2013 at 3:28 PM, Borislav Petkov <[email protected]> wrote:
> > >
> > > perf stat --repeat 10 -a --sync --pre 'make -s clean; echo 1 > /proc/sys/vm/drop_caches' make -s -j64 bzImage
> >
> > How many CPUs do you have in your system? Maybe -j64 vs -jNUM_CPUs
> > affects your measurements as well.
>
> 8. But that shouldn't matter since I made the non-differing measurements
> two mails back with -j64.
>
> Also -j9, i.e. -j$(($NUM_CPUS+1)) gives "121.613217871 seconds time
> elapsed" because with -j9 the probability of some core not executing a
> make thread for whatever reason is higher than with -j64. But the penalty
> is only about an additional 1s with this workload.
>
> I think with -j64 Ingo meant to saturate the scheduler, making sure
> there are always more runnable threads than available cores so that we
> can maximize core utilization with threads running our workload.

Yeah - I didn't know your CPU count, -j64 is what I use.

Also, just in case it wasn't clear: thanks for the measurements - and I'd
be in favor of merging this patch if it shows any improvement or if
measurements lie within noise, because per asm review the change should be
a win.

Thanks,

Ingo

2013-07-02 10:29:23

by Borislav Petkov

Subject: Re: [PATCH] x86: Use asm-goto to implement mutex fast path on x86-64

On Tue, Jul 02, 2013 at 08:39:12AM +0200, Ingo Molnar wrote:
> Yeah - I didn't know your CPU count, -j64 is what I use.

Right, but the -j make jobs argument - whenever it is higher than the
core count - shouldn't matter too much to the workload because all those
threads remain runnable but simply wait to get a shot to run.

Maybe the overhead of setting up more threads than necessary could be
an issue, although those measurements didn't show that. They actually
showed that the -j64 build is a second faster on average than
-j(core_count+1).

> Also, just in case it wasn't clear: thanks for the measurements

I thank you guys for listening - it is so much fun playing with this! :)

> - and I'd be in favor of merging this patch if it shows any
> improvement or if measurements lie within noise, because per asm
> review the change should be a win.

Right, so we can say for sure that machine utilization drops a bit:

- 600,993 context-switches
+ 600,078 context-switches

- 3,146,429,834,505 cycles
+ 3,141,378,247,404 cycles

- 2,402,804,186,892 stalled-cycles-frontend
+ 2,398,997,896,542 stalled-cycles-frontend

- 1,844,806,444,182 stalled-cycles-backend
+ 1,841,987,157,784 stalled-cycles-backend

- 1,801,184,009,281 instructions
+ 1,798,363,791,924 instructions

and a couple more.

Considering the simple change, this is clearly a win albeit a small one.

Disadvantages:

- 25,449,932 page-faults
+ 25,450,046 page-faults

- 402,482,696,262 branches
+ 403,257,285,840 branches

- 17,550,736,725 branch-misses
+ 17,552,193,349 branch-misses

It looks to me like this way we're a wee bit less predictable to the
machine, but it seems to recover at some point. Again, considering they
don't hurt runtime or some other aspect more gravely, we can accept
them.

The moral of the story: never ever use prerequisite stuff like

echo <N> > .../drop_caches

in the to-be-traced workload because it lies to ya:

$ cat ../build-kernel.sh
#!/bin/bash

make -s clean
echo 1 > /proc/sys/vm/drop_caches

$ perf stat --repeat 10 -a --sync --pre '../build-kernel.sh' make -s -j64 bzImage

Performance counter stats for 'make -s -j64 bzImage' (10 runs):

960601.373972 task-clock # 7.996 CPUs utilized ( +- 0.19% ) [100.00%]
601,511 context-switches # 0.626 K/sec ( +- 0.16% ) [100.00%]
32,780 cpu-migrations # 0.034 K/sec ( +- 0.31% ) [100.00%]
25,449,646 page-faults # 0.026 M/sec ( +- 0.00% )
3,142,081,058,378 cycles # 3.271 GHz ( +- 0.11% ) [83.40%]
2,401,261,614,189 stalled-cycles-frontend # 76.42% frontend cycles idle ( +- 0.08% ) [83.39%]
1,845,047,843,816 stalled-cycles-backend # 58.72% backend cycles idle ( +- 0.14% ) [66.65%]
1,797,566,509,722 instructions # 0.57 insns per cycle
# 1.34 stalled cycles per insn ( +- 0.10% ) [83.43%]
403,531,133,058 branches # 420.082 M/sec ( +- 0.09% ) [83.37%]
17,562,347,910 branch-misses # 4.35% of all branches ( +- 0.10% ) [83.20%]

120.128371521 seconds time elapsed ( +- 0.19% )


VS


$ cat ../build-kernel.sh
#!/bin/bash

make -s clean
echo 1 > /proc/sys/vm/drop_caches
make -s -j64 bzImage


$ perf stat --repeat 10 -a --sync ../build-kernel.sh

Performance counter stats for '../build-kernel.sh' (10 runs):

1032946.552711 task-clock # 7.996 CPUs utilized ( +- 0.09% ) [100.00%]
636,651 context-switches # 0.616 K/sec ( +- 0.13% ) [100.00%]
37,443 cpu-migrations # 0.036 K/sec ( +- 0.31% ) [100.00%]
26,005,318 page-faults # 0.025 M/sec ( +- 0.00% )
3,164,715,146,894 cycles # 3.064 GHz ( +- 0.10% ) [83.38%]
2,436,459,399,308 stalled-cycles-frontend # 76.99% frontend cycles idle ( +- 0.10% ) [83.35%]
1,877,644,323,184 stalled-cycles-backend # 59.33% backend cycles idle ( +- 0.20% ) [66.52%]
1,815,075,000,778 instructions # 0.57 insns per cycle
# 1.34 stalled cycles per insn ( +- 0.09% ) [83.19%]
406,020,700,850 branches # 393.070 M/sec ( +- 0.07% ) [83.40%]
17,578,808,228 branch-misses # 4.33% of all branches ( +- 0.12% ) [83.35%]

129.176026516 seconds time elapsed ( +- 0.09% )


--
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

Subject: [tip:x86/cpu] x86, cpufeature: Use new CC_HAVE_ASM_GOTO

Commit-ID: 62122fd7dadac09704782d8bc051fb898a0272bd
Gitweb: http://git.kernel.org/tip/62122fd7dadac09704782d8bc051fb898a0272bd
Author: Borislav Petkov <[email protected]>
AuthorDate: Fri, 28 Jun 2013 18:41:41 +0200
Committer: H. Peter Anvin <[email protected]>
CommitDate: Fri, 28 Jun 2013 15:26:48 -0700

x86, cpufeature: Use new CC_HAVE_ASM_GOTO

... for checking for "asm goto" compiler support. It is more explicit
this way and we cover the cases where distros have backported that
support even to gcc versions < 4.5.

Signed-off-by: Borislav Petkov <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: H. Peter Anvin <[email protected]>
---
arch/x86/include/asm/cpufeature.h | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 47538a6..d3f5c63 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -366,9 +366,10 @@ extern bool __static_cpu_has_safe(u16 bit);
*/
static __always_inline __pure bool __static_cpu_has(u16 bit)
{
-#if __GNUC__ > 4 || __GNUC_MINOR__ >= 5
+#ifdef CC_HAVE_ASM_GOTO

#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS
+
/*
* Catch too early usage of this before alternatives
* have run.
@@ -384,6 +385,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
".previous\n"
/* skipping size check since replacement size = 0 */
: : "i" (X86_FEATURE_ALWAYS) : : t_warn);
+
#endif

asm goto("1: jmp %l[t_no]\n"
@@ -406,7 +408,9 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
warn_pre_alternatives();
return false;
#endif
-#else /* GCC_VERSION >= 40500 */
+
+#else /* CC_HAVE_ASM_GOTO */
+
u8 flag;
/* Open-coded due to __stringify() in ALTERNATIVE() */
asm volatile("1: movb $0,%0\n"
@@ -427,7 +431,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
".previous\n"
: "=qm" (flag) : "i" (bit));
return flag;
-#endif
+
+#endif /* CC_HAVE_ASM_GOTO */
}

#define static_cpu_has(bit) \
@@ -441,7 +446,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)

static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
{
-#if __GNUC__ > 4 || __GNUC_MINOR__ >= 5
+#ifdef CC_HAVE_ASM_GOTO
/*
* We need to spell the jumps to the compiler because, depending on the offset,
* the replacement jump can be bigger than the original jump, and this we cannot
@@ -475,7 +480,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
return false;
t_dynamic:
return __static_cpu_has_safe(bit);
-#else /* GCC_VERSION >= 40500 */
+#else
u8 flag;
/* Open-coded due to __stringify() in ALTERNATIVE() */
asm volatile("1: movb $2,%0\n"
@@ -511,7 +516,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
: "=qm" (flag)
: "i" (bit), "i" (X86_FEATURE_ALWAYS));
return (flag == 2 ? __static_cpu_has_safe(bit) : flag);
-#endif
+#endif /* CC_HAVE_ASM_GOTO */
}

#define static_cpu_has_safe(bit) \