Introduce arch_try_cmpxchg64_local() for 64-bit and 32-bit targets
to improve code using cmpxchg64_local(). On 64-bit targets, the
generated assembly improves from:
3e28: 31 c0 xor %eax,%eax
3e2a: 4d 0f b1 7d 00 cmpxchg %r15,0x0(%r13)
3e2f: 48 85 c0 test %rax,%rax
3e32: 0f 85 9f 00 00 00 jne 3ed7 <...>
to:
3e28: 31 c0 xor %eax,%eax
3e2a: 4d 0f b1 7d 00 cmpxchg %r15,0x0(%r13)
3e2f: 0f 85 9f 00 00 00 jne 3ed4 <...>
where a TEST instruction after CMPXCHG is saved. The improvements
for 32-bit targets are even more noticeable, because double-word
compare after CMPXCHG8B gets eliminated.
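For example, a typical call site can be converted as in the sketch below
(an illustrative fragment only; the variable names and the fail() handler
are made up and are not part of this patch):

  u64 old = READ_ONCE(*ptr);

  /* cmpxchg64_local(): the returned old value needs an explicit compare */
  if (cmpxchg64_local(ptr, old, new) != old)
          fail();

  /* arch_try_cmpxchg64_local(): the flags set by CMPXCHG drive the branch
   * directly; on failure 'old' is updated with the current memory value
   */
  if (!arch_try_cmpxchg64_local(ptr, &old, new))
          fail();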
Signed-off-by: Uros Bizjak <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Peter Zijlstra <[email protected]>
---
arch/x86/include/asm/cmpxchg_32.h | 34 +++++++++++++++++++++++++++++++
arch/x86/include/asm/cmpxchg_64.h | 6 ++++++
2 files changed, 40 insertions(+)
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index 9e0d330dd5d0..9dedc13d5a77 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -64,6 +64,11 @@ static __always_inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 ne
return __arch_try_cmpxchg64(ptr, oldp, new, LOCK_PREFIX);
}
+static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, u64 new)
+{
+ return __arch_try_cmpxchg64(ptr, oldp, new,);
+}
+
#ifdef CONFIG_X86_CMPXCHG64
#define arch_cmpxchg64 __cmpxchg64
@@ -72,6 +77,8 @@ static __always_inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 ne
#define arch_try_cmpxchg64 __try_cmpxchg64
+#define arch_try_cmpxchg64_local __try_cmpxchg64_local
+
#else
/*
@@ -150,6 +157,33 @@ static __always_inline bool arch_try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64
}
#define arch_try_cmpxchg64 arch_try_cmpxchg64
+#define __arch_try_cmpxchg64_emu_local(_ptr, _oldp, _new) \
+({ \
+ union __u64_halves o = { .full = *(_oldp), }, \
+ n = { .full = (_new), }; \
+ bool ret; \
+ \
+ asm volatile(ALTERNATIVE("call cmpxchg8b_emu", \
+ "cmpxchg8b %[ptr]", X86_FEATURE_CX8) \
+ CC_SET(e) \
+ : CC_OUT(e) (ret), \
+ [ptr] "+m" (*(_ptr)), \
+ "+a" (o.low), "+d" (o.high) \
+ : "b" (n.low), "c" (n.high), "S" (_ptr) \
+ : "memory"); \
+ \
+ if (unlikely(!ret)) \
+ *(_oldp) = o.full; \
+ \
+ likely(ret); \
+})
+
+static __always_inline bool arch_try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, u64 new)
+{
+ return __arch_try_cmpxchg64_emu_local(ptr, oldp, new);
+}
+#define arch_try_cmpxchg64_local arch_try_cmpxchg64_local
+
#endif
#define system_has_cmpxchg64() boot_cpu_has(X86_FEATURE_CX8)
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index c1d6cd58f809..5e241306db26 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -20,6 +20,12 @@
arch_try_cmpxchg((ptr), (po), (n)); \
})
+#define arch_try_cmpxchg64_local(ptr, po, n) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ arch_try_cmpxchg_local((ptr), (po), (n)); \
+})
+
union __u128_halves {
u128 full;
struct {
--
2.42.0
The following commit has been merged into the locking/core branch of tip:
Commit-ID: d26e46f6bf329cfcc469878709baa41d3bfc7cc3
Gitweb: https://git.kernel.org/tip/d26e46f6bf329cfcc469878709baa41d3bfc7cc3
Author: Uros Bizjak <[email protected]>
AuthorDate: Sun, 14 Apr 2024 18:12:43 +02:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Sun, 14 Apr 2024 22:40:54 +02:00
locking/atomic/x86: Introduce arch_try_cmpxchg64_local()
Introduce arch_try_cmpxchg64_local() for 64-bit and 32-bit targets
to improve code using cmpxchg64_local(). On 64-bit targets, the
generated assembly improves from:
3e28: 31 c0 xor %eax,%eax
3e2a: 4d 0f b1 7d 00 cmpxchg %r15,0x0(%r13)
3e2f: 48 85 c0 test %rax,%rax
3e32: 0f 85 9f 00 00 00 jne 3ed7 <...>
to:
3e28: 31 c0 xor %eax,%eax
3e2a: 4d 0f b1 7d 00 cmpxchg %r15,0x0(%r13)
3e2f: 0f 85 9f 00 00 00 jne 3ed4 <...>
where a TEST instruction after CMPXCHG is saved. The improvements
for 32-bit targets are even more noticeable, because double-word
compare after CMPXCHG8B gets eliminated.
Signed-off-by: Uros Bizjak <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Waiman Long <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
arch/x86/include/asm/cmpxchg_32.h | 34 +++++++++++++++++++++++++++++++
arch/x86/include/asm/cmpxchg_64.h | 6 ++++++
2 files changed, 40 insertions(+)
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index 9e0d330..9dedc13 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -64,6 +64,11 @@ static __always_inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 ne
return __arch_try_cmpxchg64(ptr, oldp, new, LOCK_PREFIX);
}
+static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, u64 new)
+{
+ return __arch_try_cmpxchg64(ptr, oldp, new,);
+}
+
#ifdef CONFIG_X86_CMPXCHG64
#define arch_cmpxchg64 __cmpxchg64
@@ -72,6 +77,8 @@ static __always_inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 ne
#define arch_try_cmpxchg64 __try_cmpxchg64
+#define arch_try_cmpxchg64_local __try_cmpxchg64_local
+
#else
/*
@@ -150,6 +157,33 @@ static __always_inline bool arch_try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64
}
#define arch_try_cmpxchg64 arch_try_cmpxchg64
+#define __arch_try_cmpxchg64_emu_local(_ptr, _oldp, _new) \
+({ \
+ union __u64_halves o = { .full = *(_oldp), }, \
+ n = { .full = (_new), }; \
+ bool ret; \
+ \
+ asm volatile(ALTERNATIVE("call cmpxchg8b_emu", \
+ "cmpxchg8b %[ptr]", X86_FEATURE_CX8) \
+ CC_SET(e) \
+ : CC_OUT(e) (ret), \
+ [ptr] "+m" (*(_ptr)), \
+ "+a" (o.low), "+d" (o.high) \
+ : "b" (n.low), "c" (n.high), "S" (_ptr) \
+ : "memory"); \
+ \
+ if (unlikely(!ret)) \
+ *(_oldp) = o.full; \
+ \
+ likely(ret); \
+})
+
+static __always_inline bool arch_try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, u64 new)
+{
+ return __arch_try_cmpxchg64_emu_local(ptr, oldp, new);
+}
+#define arch_try_cmpxchg64_local arch_try_cmpxchg64_local
+
#endif
#define system_has_cmpxchg64() boot_cpu_has(X86_FEATURE_CX8)
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index c1d6cd5..5e24130 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -20,6 +20,12 @@
arch_try_cmpxchg((ptr), (po), (n)); \
})
+#define arch_try_cmpxchg64_local(ptr, po, n) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ arch_try_cmpxchg_local((ptr), (po), (n)); \
+})
+
union __u128_halves {
u128 full;
struct {
On Sun, Apr 14, 2024 at 08:46:39PM -0000, tip-bot2 for Uros Bizjak wrote:
> The following commit has been merged into the locking/core branch of tip:
>
> Commit-ID: d26e46f6bf329cfcc469878709baa41d3bfc7cc3
> Gitweb: https://git.kernel.org/tip/d26e46f6bf329cfcc469878709baa41d3bfc7cc3
> Author: Uros Bizjak <[email protected]>
> AuthorDate: Sun, 14 Apr 2024 18:12:43 +02:00
> Committer: Ingo Molnar <[email protected]>
> CommitterDate: Sun, 14 Apr 2024 22:40:54 +02:00
>
> locking/atomic/x86: Introduce arch_try_cmpxchg64_local()
>
> Introduce arch_try_cmpxchg64_local() for 64-bit and 32-bit targets
> to improve code using cmpxchg64_local(). On 64-bit targets, the
> generated assembly improves from:
>
> 3e28: 31 c0 xor %eax,%eax
> 3e2a: 4d 0f b1 7d 00 cmpxchg %r15,0x0(%r13)
> 3e2f: 48 85 c0 test %rax,%rax
> 3e32: 0f 85 9f 00 00 00 jne 3ed7 <...>
>
> to:
>
> 3e28: 31 c0 xor %eax,%eax
> 3e2a: 4d 0f b1 7d 00 cmpxchg %r15,0x0(%r13)
> 3e2f: 0f 85 9f 00 00 00 jne 3ed4 <...>
>
> where a TEST instruction after CMPXCHG is saved. The improvements
> for 32-bit targets are even more noticeable, because double-word
> compare after CMPXCHG8B gets eliminated.
>
> Signed-off-by: Uros Bizjak <[email protected]>
> Signed-off-by: Ingo Molnar <[email protected]>
> Cc: Linus Torvalds <[email protected]>
> Cc: Waiman Long <[email protected]>
> Link: https://lore.kernel.org/r/[email protected]
> ---
> arch/x86/include/asm/cmpxchg_32.h | 34 +++++++++++++++++++++++++++++++
> arch/x86/include/asm/cmpxchg_64.h | 6 ++++++
> 2 files changed, 40 insertions(+)
Ok, maybe I'm missing the point here or maybe the commit message doesn't
explain but how does this big diffstat justify one less insn?
And no, 32-bit doesn't matter.
This looks like too crazy a micro-optimization to me to be worth the 40
insertions.
But I could be wrong and I'd gladly read explanations for why I am.
Thx.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Mon, Apr 15, 2024 at 10:12 PM Borislav Petkov <[email protected]> wrote:
> > locking/atomic/x86: Introduce arch_try_cmpxchg64_local()
> >
> > Introduce arch_try_cmpxchg64_local() for 64-bit and 32-bit targets
> > to improve code using cmpxchg64_local(). On 64-bit targets, the
> > generated assembly improves from:
> >
> > 3e28: 31 c0 xor %eax,%eax
> > 3e2a: 4d 0f b1 7d 00 cmpxchg %r15,0x0(%r13)
> > 3e2f: 48 85 c0 test %rax,%rax
> > 3e32: 0f 85 9f 00 00 00 jne 3ed7 <...>
> >
> > to:
> >
> > 3e28: 31 c0 xor %eax,%eax
> > 3e2a: 4d 0f b1 7d 00 cmpxchg %r15,0x0(%r13)
> > 3e2f: 0f 85 9f 00 00 00 jne 3ed4 <...>
> >
> > where a TEST instruction after CMPXCHG is saved. The improvements
> > for 32-bit targets are even more noticeable, because double-word
> > compare after CMPXCHG8B gets eliminated.
>
> Ok, maybe I'm missing the point here or maybe the commit message doesn't
> explain but how does this big diffstat justify one less insn?
We are dealing with locking primitives, probably the hottest part of
the kernel. For 64-bits, the patch is effectively a couple of lines,
reusing and extending existing macros. The line count for a modern
32-bit target is also a couple of lines, but there the saved insn
count is much higher, around 10 instructions.
What bothers you is the line count of the cmpxchg8b emulation, necessary
to handle !CONFIG_X86_CMPXCHG64 targets. Those targets are still alive, and the
cmpxchg8b emulation code was recently improved (not by me!) to
correctly handle the ZF flag. The added code just builds on that.
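To spell out the contract both ALTERNATIVE arms rely on, here is a rough
pseudo-C model (illustration only; the real fallback is the asm routine in
arch/x86/lib/cmpxchg8b_emu.S, which runs with interrupts disabled on !CX8 CPUs):

  /* What the macro expects from either CMPXCHG8B or cmpxchg8b_emu:
   * ZF set on success; on failure ZF clear and edx:eax (here *old)
   * updated with the current memory value.
   */
  static bool cmpxchg8b_model(u64 *ptr, u64 *old, u64 new)
  {
          if (*ptr == *old) {
                  *ptr = new;
                  return true;            /* ZF set */
          }
          *old = *ptr;                    /* current value back to the caller */
          return false;                   /* ZF clear */
  }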
> And no, 32-bit doesn't matter.
Really? Was this decision reached by the community consensus?
The Linux kernel has many uses, and its use on servers by the big
company you are the voice of is only one part of the whole picture.
I'm sure that 32-bit is quite relevant for embedded users and more
than relevant to a student or an enthusiast in some remote part of the
world. As a maintainer, you should also take care of the communities
that are somehow neglected, where unilateral decisions like the one
above can have unwanted consequences.
> This looks like too crazy a micro-optimization to me to be worth the 40
> insertions.
If the line count is the problem, I can easily parametrize new and
existing big macro descriptions in a follow-up patch. However, I was
advised to not mix everything together in one patch, but rest assured,
the creation and testing of the follow-up patch would take me less
time than writing the message you are reading.
> But I could be wrong and I'd gladly read explanations for why I am.
I would understand if someone asked you to write some new
functionality for a seemingly obsolete target. Everybody would
understand your rejection. But this improvement was written by me on my
own time, and it demonstrates some benefit to the existing code. The patch
is almost trivial and it took maybe 5 minutes of the maintainer's time
to approve it. It brings no future maintenance burden, but it perhaps
improves someone's life a tiny bit.
Last, but not least, I'm bringing some ideas from the compiler
development community, where the attitude to redundant instructions is
totally different. It could take weeks of effort and considerable
rewrite of compiler functionality just to remove one instruction ;)
Micro-optimizations add up!
Thanks for reading this,
Uros.
On Wed, Apr 17, 2024 at 06:24:21PM +0200, Uros Bizjak wrote:
> We are dealing with locking primitives, probably the hottest part of
> the kernel. For 64-bits, the patch is effectively a couple of lines,
> reusing and extending existing macros.
Ok.
> The line count for a modern 32-bit target is also a couple of lines,
> but there the saved insn count is much higher, around 10 instructions.
Yah, that __arch_try_cmpxchg64_emu_local() thing with yet another
alternative in there. So that's not a couple of lines - it is yet
another cryptic alternative we need to pay attention to.
> Really? Was this decision reached by the community consensus?
Nothing official. Unofficially, we don't care.
> The Linux kernel has many uses, and its use on servers by the big
> company you are the voice of,
No, here I'm wearing my maintainer hat.
> I'm sure that 32-bit is quite relevant for embedded users and more
People keep dangling those "embedded users" each time. Which users are
those? I haven't seen anyone complaining about 32-bit kernels being
broken or testing them. Because we keep breaking them and no one
notices. Maybe that's a sign of how much they're used.
Although, I broke 32-bit recently and people caught it so there are some
stragglers from time to time. But that's very seldom. And each time we
tell them to switch to 64-bit.
> than relevant to a student or an enthusiast in some remote part of the
> world.
Trust me, they have 64-bit CPUs. Most of the 32-bit CPUs they had are
probably dead already. Like mine.
32-bit only CPUs like P4, for example, should be trashed just because
they're contributing to global warming. :-P
> As a maintainer, you should also take care of the communities
> that are somehow neglected, where unilateral decisions like the one
> above can have unwanted consequences.
We still keep 32-bit kernels alive - no one has dropped them yet - we
just don't add new features.
> If the line count is the problem, I can easily parametrize new and
> existing big macro descriptions in a follow-up patch. However, I was
> advised to not mix everything together in one patch, but rest assured,
> the creation and testing of the follow-up patch would take me less
> time than writing the message you are reading.
I'm simply making sure we're not going off the rails with
micro-optimizing for no apparent reason.
Saving a
test %rax,%rax
doesn't need fixing in my book. Because I don't think you'll be able to
even measure it.
> It brings no future maintenance burden, but it perhaps improves
> someone's life a tiny bit.
This is where you and I disagree: touching that alternative in
__arch_try_cmpxchg64_emu_local() does bring one, as we tend to change
those from time to time, especially in recent times.
And I wouldn't mind touching it but if it is there to save 10 insns on
32-bit - which doesn't matter - then why bother?
Or do you have a relevant 32-bit workload which brings any improvement
by this change?
> Last, but not least, I'm bringing some ideas from the compiler
> development community, where the attitude to redundant instructions is
> totally different. It could take weeks of effort and considerable
> rewrite of compiler functionality just to remove one instruction ;)
> Micro-optimizations add up!
I'm sure but they all need to be weighed in. Zapping a TEST REG,REG is
not worth it. On most machines, that ALU insn executes in 1 cycle.
I wanna say, such "optimizations" should be checked by benchmarks to see
whether they even give any improvements but we can't check every patch.
IOW, all the patches we're adding should answer the "Is it really worth
the effort?" question. And don't forget that "it brings no future
maintenance burden" is wrong. It brings a maintenance burden every time
we refactor the kernel. And we do that all the time. So the more sites
you have to touch, the more it adds up.
So even if your patch saves 10 insns but there's not a single workload
where it matters, then don't bother. There are other, lower hanging
fruits we need to tackle first.
> Thanks for reading this,
Thanks for taking the time to explain how you're seeing it.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Wed, Apr 17, 2024 at 8:41 PM Borislav Petkov <[email protected]> wrote:
> > If the line count is the problem, I can easily parametrize new and
> > existing big macro descriptions in a follow-up patch. However, I was
> > advised to not mix everything together in one patch, but rest assured,
> > the creation and testing of the follow-up patch would take me less
> > time than writing the message you are reading.
>
> I'm simply making sure we're not going off the rails with
> micro-optimizing for no apparent reason.
>
> Saving a
>
> test %rax,%rax
>
> doesn't need fixing in my book. Because I don't think you'll be able to
> even measure it.
The above is perhaps a little unfortunate example taken from
if (cmpxchg64(...))
where the check is against zero. The compiler can optimize the check
to a TEST insn in this particular case, but otherwise CMP will be
emitted for different usages. Not a big difference, but a register has
to be kept live across cmpxchg8b.
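Schematically, the three code shapes look like this (a hypothetical call
site, just to illustrate; the last form is what the try_cmpxchg64*() family
enables):

  /* check against zero: the compiler reuses the zeroed %rax and can
   * emit TEST after the CMPXCHG
   */
  if (cmpxchg64(&slot, 0, new))
          return -EBUSY;

  /* general case: 'old' must stay live so the returned value can be
   * CMPed against it
   */
  if (cmpxchg64(&slot, old, new) != old)
          return -EBUSY;

  /* try variant: ZF from CMPXCHG decides the branch, no compare and
   * no extra live register needed
   */
  if (!try_cmpxchg64(&slot, &old, new))
          return -EBUSY;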
> > It brings no future maintenance burden, but it perhaps improves
> > someone's life a tiny bit.
>
> This is where you and I disagree: touching that alternative in
> __arch_try_cmpxchg64_emu_local() does bring one, as we tend to change
> those from time to time, especially in recent times.
>
> And I wouldn't mind touching it but if it is there to save 10 insns on
> 32-bit - which doesn't matter - then why bother?
>
> Or do you have a relevant 32-bit workload which brings any improvement
> by this change?
There is one important issue. When a register (or two for double-word
values) has to be kept live for a compare, the register pressure on
32-bit targets around cmpxchg8b goes through the roof, and when using
the frame pointer (and maybe some fixed register, e.g. PIC), the
register allocator runs out of available registers. The number of
spills around cmpxchg8b signals the trouble the register allocator goes
through to "fix" everything, so from the compiler PoV any relief is
more than welcome here. Even in GCC internal libraries, we have had to
take a special approach with this insn to avoid internal compiler
errors. The kernel was quite lucky here ;)
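As a rough illustration of where that pressure comes from, compare the two
(made-up) update loops below; this is a sketch, not real kernel code, and it
assumes the generic try_cmpxchg64_local() wrapper picks up the new arch_
primitive:

  static void counter_add(u64 *counter, u64 delta)
  {
          u64 old = READ_ONCE(*counter);

          /* Classic loop: on 32-bit, 'old' (two registers) must survive
           * cmpxchg8b so 'prev' can be compared against it, on top of
           * edx:eax, ecx:ebx and the pointer the instruction already needs.
           */
          for (;;) {
                  u64 prev = cmpxchg64_local(counter, old, old + delta);
                  if (prev == old)
                          break;
                  old = prev;
          }
  }

  static void counter_add_try(u64 *counter, u64 delta)
  {
          u64 old = READ_ONCE(*counter);

          /* try variant: ZF decides the branch and edx:eax already hold
           * the updated 'old' on failure, so nothing extra stays live
           * across the instruction.
           */
          do {
          } while (!try_cmpxchg64_local(counter, &old, old + delta));
  }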
Thanks and best regards,
Uros.
On April 17, 2024 9:21:29 PM GMT+02:00, Uros Bizjak <[email protected]> wrote:
>The above is perhaps a little unfortunate example taken from
>
>if (cmpxchg64(...))
>
>where the check is against zero. The compiler can optimize the check
>to a TEST insn in this particular case, but otherwise CMP will be
>emitted for different usages. Not a big difference, but a register has
>to be kept live across cmpxchg8b.
>
...
>
>There is one important issue. When a register (or two for double-word
>values) has to be kept live for a compare, the register pressure on
>32-bit targets around cmpxchg8b goes through the roof, and when using
>the frame pointer (and maybe some fixed register, e.g. PIC), the
>register allocator runs out of available registers. The number of
>spills around cmpxchg8b signals the trouble the register allocator goes
>through to "fix" everything, so from the compiler PoV any relief is
>more than welcome here. Even in GCC internal libraries, we have had to
>take a special approach with this insn to avoid internal compiler
>errors. The kernel was quite lucky here ;)
That would've been a much better reason to justify the change. I think you should put those things in the commit messages.
Thx.
--
Sent from a small device: formatting sucks and brevity is inevitable.