2022-05-10 20:35:32

by Uros Bizjak

Subject: [PATCH] locking/atomic/x86: Introduce try_cmpxchg64

This patch adds try_cmpxchg64 to improve code around cmpxchg8b. While
the resulting code improvements on x86_64 are minor (a compare and a move saved),
the improvements on x86_32 are quite noticeable. The code improves from:

84: 89 74 24 30 mov %esi,0x30(%esp)
88: 89 fe mov %edi,%esi
8a: 0f b7 0c 02 movzwl (%edx,%eax,1),%ecx
8e: c1 e1 08 shl $0x8,%ecx
91: 0f b7 c9 movzwl %cx,%ecx
94: 89 4c 24 34 mov %ecx,0x34(%esp)
98: 8b 96 24 1e 00 00 mov 0x1e24(%esi),%edx
9e: 8b 86 20 1e 00 00 mov 0x1e20(%esi),%eax
a4: 8b 5c 24 34 mov 0x34(%esp),%ebx
a8: 8b 7c 24 30 mov 0x30(%esp),%edi
ac: 89 44 24 38 mov %eax,0x38(%esp)
b0: 0f b6 44 24 38 movzbl 0x38(%esp),%eax
b5: 8b 4c 24 38 mov 0x38(%esp),%ecx
b9: 89 54 24 3c mov %edx,0x3c(%esp)
bd: 83 e0 fd and $0xfffffffd,%eax
c0: 89 5c 24 64 mov %ebx,0x64(%esp)
c4: 8b 54 24 3c mov 0x3c(%esp),%edx
c8: 89 4c 24 60 mov %ecx,0x60(%esp)
cc: 8b 4c 24 34 mov 0x34(%esp),%ecx
d0: 88 44 24 60 mov %al,0x60(%esp)
d4: 8b 44 24 38 mov 0x38(%esp),%eax
d8: c6 44 24 62 f2 movb $0xf2,0x62(%esp)
dd: 8b 5c 24 60 mov 0x60(%esp),%ebx
e1: f0 0f c7 0f lock cmpxchg8b (%edi)
e5: 89 d1 mov %edx,%ecx
e7: 8b 54 24 3c mov 0x3c(%esp),%edx
eb: 33 44 24 38 xor 0x38(%esp),%eax
ef: 31 ca xor %ecx,%edx
f1: 09 c2 or %eax,%edx
f3: 75 a3 jne 98 <t+0x98>

to:

84: 0f b7 0c 02 movzwl (%edx,%eax,1),%ecx
88: c1 e1 08 shl $0x8,%ecx
8b: 0f b7 c9 movzwl %cx,%ecx
8e: 8b 86 20 1e 00 00 mov 0x1e20(%esi),%eax
94: 8b 96 24 1e 00 00 mov 0x1e24(%esi),%edx
9a: 89 4c 24 64 mov %ecx,0x64(%esp)
9e: 89 c3 mov %eax,%ebx
a0: 89 44 24 60 mov %eax,0x60(%esp)
a4: 83 e3 fd and $0xfffffffd,%ebx
a7: c6 44 24 62 f2 movb $0xf2,0x62(%esp)
ac: 88 5c 24 60 mov %bl,0x60(%esp)
b0: 8b 5c 24 60 mov 0x60(%esp),%ebx
b4: f0 0f c7 0f lock cmpxchg8b (%edi)
b8: 75 d4 jne 8e <t+0x8e>

The implementation extends the existing cmpxchg64 implementation.
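
As a rough illustration of the intended usage (not taken from the patch
itself; the function and variable names below are made up), a caller that
currently loops on cmpxchg64:

	u64 old = READ_ONCE(*ptr);

	for (;;) {
		u64 prev = cmpxchg64(ptr, old, new);
		if (prev == old)
			break;
		old = prev;	/* reload the expected value and retry */
	}

can instead be written with try_cmpxchg64 as:

	u64 old = READ_ONCE(*ptr);

	/* on failure, 'old' is updated with the current value of *ptr */
	while (!try_cmpxchg64(ptr, &old, new))
		;

which is what allows the compiler to drop the explicit compare and the
extra move of the previous value.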

Signed-off-by: Uros Bizjak <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Will Deacon <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Boqun Feng <[email protected]>
Cc: Mark Rutland <[email protected]>
Cc: "Paul E. McKenney" <[email protected]>
Cc: Marco Elver <[email protected]>
---
arch/x86/include/asm/cmpxchg_32.h | 43 ++++++++++++++++++++++
arch/x86/include/asm/cmpxchg_64.h | 6 +++
include/linux/atomic/atomic-instrumented.h | 40 +++++++++++++++++++-
scripts/atomic/gen-atomic-instrumented.sh | 2 +-
4 files changed, 89 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index 0a7fe0321613..e874ff7f7529 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -42,6 +42,9 @@ static inline void set_64bit(volatile u64 *ptr, u64 value)
#define arch_cmpxchg64_local(ptr, o, n) \
((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \
(unsigned long long)(n)))
+#define arch_try_cmpxchg64(ptr, po, n) \
+ ((__typeof__(*(ptr)))__try_cmpxchg64((ptr), (unsigned long long *)(po), \
+ (unsigned long long)(n)))
#endif

static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
@@ -70,6 +73,25 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
return prev;
}

+static inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *pold, u64 new)
+{
+ bool success;
+ u64 prev;
+ asm volatile(LOCK_PREFIX "cmpxchg8b %2"
+ CC_SET(z)
+ : CC_OUT(z) (success),
+ "=A" (prev),
+ "+m" (*ptr)
+ : "b" ((u32)new),
+ "c" ((u32)(new >> 32)),
+ "1" (*pold)
+ : "memory");
+
+ if (unlikely(!success))
+ *pold = prev;
+ return success;
+}
+
#ifndef CONFIG_X86_CMPXCHG64
/*
* Building a kernel capable running on 80386 and 80486. It may be necessary
@@ -108,6 +130,27 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
: "memory"); \
__ret; })

+#define arch_try_cmpxchg64(ptr, po, n) \
+({ \
+ bool success; \
+ __typeof__(*(ptr)) __prev; \
+ __typeof__(ptr) _old = (__typeof__(ptr))(po); \
+ __typeof__(*(ptr)) __old = *_old; \
+ __typeof__(*(ptr)) __new = (n); \
+ alternative_io(LOCK_PREFIX_HERE \
+ "call cmpxchg8b_emu", \
+ "lock; cmpxchg8b (%%esi)" , \
+ X86_FEATURE_CX8, \
+ "=A" (__prev), \
+ "S" ((ptr)), "0" (__old), \
+ "b" ((unsigned int)__new), \
+ "c" ((unsigned int)(__new>>32)) \
+ : "memory"); \
+ success = (__prev == __old); \
+ if (unlikely(!success)) \
+ *_old = __prev; \
+ likely(success); \
+})
#endif

#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX8)
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 072e5459fe2f..250187ac8248 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -19,6 +19,12 @@ static inline void set_64bit(volatile u64 *ptr, u64 val)
arch_cmpxchg_local((ptr), (o), (n)); \
})

+#define arch_try_cmpxchg64(ptr, po, n) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ arch_try_cmpxchg((ptr), (po), (n)); \
+})
+
#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16)

#endif /* _ASM_X86_CMPXCHG_64_H */
diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h
index 5d69b143c28e..7a139ec030b0 100644
--- a/include/linux/atomic/atomic-instrumented.h
+++ b/include/linux/atomic/atomic-instrumented.h
@@ -2006,6 +2006,44 @@ atomic_long_dec_if_positive(atomic_long_t *v)
arch_try_cmpxchg_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

+#define try_cmpxchg64(ptr, oldp, ...) \
+({ \
+ typeof(ptr) __ai_ptr = (ptr); \
+ typeof(oldp) __ai_oldp = (oldp); \
+ kcsan_mb(); \
+ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+ instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+ arch_try_cmpxchg64(__ai_ptr, __ai_oldp, __VA_ARGS__); \
+})
+
+#define try_cmpxchg64_acquire(ptr, oldp, ...) \
+({ \
+ typeof(ptr) __ai_ptr = (ptr); \
+ typeof(oldp) __ai_oldp = (oldp); \
+ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+ instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+ arch_try_cmpxchg64_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \
+})
+
+#define try_cmpxchg64_release(ptr, oldp, ...) \
+({ \
+ typeof(ptr) __ai_ptr = (ptr); \
+ typeof(oldp) __ai_oldp = (oldp); \
+ kcsan_release(); \
+ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+ instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+ arch_try_cmpxchg64_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \
+})
+
+#define try_cmpxchg64_relaxed(ptr, oldp, ...) \
+({ \
+ typeof(ptr) __ai_ptr = (ptr); \
+ typeof(oldp) __ai_oldp = (oldp); \
+ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+ instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+ arch_try_cmpxchg64_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \
+})
+
#define cmpxchg_local(ptr, ...) \
({ \
typeof(ptr) __ai_ptr = (ptr); \
@@ -2045,4 +2083,4 @@ atomic_long_dec_if_positive(atomic_long_t *v)
})

#endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
-// 87c974b93032afd42143613434d1a7788fa598f9
+// 764f741eb77a7ad565dc8d99ce2837d5542e8aee
diff --git a/scripts/atomic/gen-atomic-instrumented.sh b/scripts/atomic/gen-atomic-instrumented.sh
index 68f902731d01..77c06526a574 100755
--- a/scripts/atomic/gen-atomic-instrumented.sh
+++ b/scripts/atomic/gen-atomic-instrumented.sh
@@ -166,7 +166,7 @@ grep '^[a-z]' "$1" | while read name meta args; do
done


-for xchg in "xchg" "cmpxchg" "cmpxchg64" "try_cmpxchg"; do
+for xchg in "xchg" "cmpxchg" "cmpxchg64" "try_cmpxchg" "try_cmpxchg64"; do
for order in "" "_acquire" "_release" "_relaxed"; do
gen_xchg "${xchg}" "${order}" ""
printf "\n"
--
2.35.1



2022-05-10 20:59:12

by Peter Zijlstra

Subject: Re: [PATCH] locking/atomic/x86: Introduce try_cmpxchg64

On Tue, May 10, 2022 at 05:42:17PM +0200, Uros Bizjak wrote:
> This patch adds try_cmpxchg64 to improve code around cmpxchg8b. While
> the resulting code improvements on x86_64 are minor (a compare and a move saved),
> the improvements on x86_32 are quite noticeable. The code improves from:

What user of cmpxchg64 is this?


2022-05-14 00:17:43

by Peter Zijlstra

Subject: Re: [PATCH] locking/atomic/x86: Introduce try_cmpxchg64

On Tue, May 10, 2022 at 05:42:17PM +0200, Uros Bizjak wrote:

For the Changelog I would focus on the 64bit improvement and leave 32bit
as a side-note.

> ---
> arch/x86/include/asm/cmpxchg_32.h | 43 ++++++++++++++++++++++
> arch/x86/include/asm/cmpxchg_64.h | 6 +++
> include/linux/atomic/atomic-instrumented.h | 40 +++++++++++++++++++-
> scripts/atomic/gen-atomic-instrumented.sh | 2 +-
> 4 files changed, 89 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
> index 0a7fe0321613..e874ff7f7529 100644
> --- a/arch/x86/include/asm/cmpxchg_32.h
> +++ b/arch/x86/include/asm/cmpxchg_32.h
> @@ -42,6 +42,9 @@ static inline void set_64bit(volatile u64 *ptr, u64 value)
> #define arch_cmpxchg64_local(ptr, o, n) \
> ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \
> (unsigned long long)(n)))
> +#define arch_try_cmpxchg64(ptr, po, n) \
> + ((__typeof__(*(ptr)))__try_cmpxchg64((ptr), (unsigned long long *)(po), \
> + (unsigned long long)(n)))
> #endif
>
> static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
> @@ -70,6 +73,25 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
> return prev;
> }
>
> +static inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *pold, u64 new)
> +{
> + bool success;
> + u64 prev;
> + asm volatile(LOCK_PREFIX "cmpxchg8b %2"
> + CC_SET(z)
> + : CC_OUT(z) (success),
> + "=A" (prev),
> + "+m" (*ptr)
> + : "b" ((u32)new),
> + "c" ((u32)(new >> 32)),
> + "1" (*pold)
> + : "memory");
> +
> + if (unlikely(!success))
> + *pold = prev;

I would prefer this be more like the existing try_cmpxchg code,
perhaps:

u64 old = *pold;

asm volatile (LOCK_PREFIX "cmpxchg8b %[ptr]"
CC_SET(z)
: CC_OUT(z) (success),
[ptr] "+m" (*ptr)
"+A" (old)
: "b" ((u32)new)
"c" ((u32)(new >> 32))
: "memory");

if (unlikely(!success))
*pold = old;

The existing 32bit cmpxchg code is a 'bit' crusty.

> + return success;
> +}
> +
> #ifndef CONFIG_X86_CMPXCHG64
> /*
> * Building a kernel capable running on 80386 and 80486. It may be necessary
> @@ -108,6 +130,27 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
> : "memory"); \
> __ret; })
>
> +#define arch_try_cmpxchg64(ptr, po, n) \
> +({ \
> + bool success; \
> + __typeof__(*(ptr)) __prev; \
> + __typeof__(ptr) _old = (__typeof__(ptr))(po); \
> + __typeof__(*(ptr)) __old = *_old; \
> + __typeof__(*(ptr)) __new = (n); \
> + alternative_io(LOCK_PREFIX_HERE \
> + "call cmpxchg8b_emu", \
> + "lock; cmpxchg8b (%%esi)" , \
> + X86_FEATURE_CX8, \
> + "=A" (__prev), \
> + "S" ((ptr)), "0" (__old), \
> + "b" ((unsigned int)__new), \
> + "c" ((unsigned int)(__new>>32)) \
> + : "memory"); \
> + success = (__prev == __old); \
> + if (unlikely(!success)) \
> + *_old = __prev; \
> + likely(success); \
> +})

Wouldn't this be better written like the normal fallback wrapper?

static __always_inline bool
arch_try_cmpxchg64(u64 *v, u64 *old, u64 new)
{
u64 r, o = *old;
r = arch_cmpxchg64(v, o, new);
if (unlikely(r != o))
*old = r;
return likely(r == o);
}

Less magical, same exact code.

2022-05-14 02:08:34

by Uros Bizjak

Subject: Re: [PATCH] locking/atomic/x86: Introduce try_cmpxchg64

On Fri, May 13, 2022 at 11:10 AM Peter Zijlstra <[email protected]> wrote:
>
> On Tue, May 10, 2022 at 05:42:17PM +0200, Uros Bizjak wrote:
>
> For the Changelog I would focus on the 64bit improvement and leave 32bit
> as a side-note.

Thanks, I will rephrase the ChangeLog.

>
> > ---
> > arch/x86/include/asm/cmpxchg_32.h | 43 ++++++++++++++++++++++
> > arch/x86/include/asm/cmpxchg_64.h | 6 +++
> > include/linux/atomic/atomic-instrumented.h | 40 +++++++++++++++++++-
> > scripts/atomic/gen-atomic-instrumented.sh | 2 +-
> > 4 files changed, 89 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
> > index 0a7fe0321613..e874ff7f7529 100644
> > --- a/arch/x86/include/asm/cmpxchg_32.h
> > +++ b/arch/x86/include/asm/cmpxchg_32.h
> > @@ -42,6 +42,9 @@ static inline void set_64bit(volatile u64 *ptr, u64 value)
> > #define arch_cmpxchg64_local(ptr, o, n) \
> > ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \
> > (unsigned long long)(n)))
> > +#define arch_try_cmpxchg64(ptr, po, n) \
> > + ((__typeof__(*(ptr)))__try_cmpxchg64((ptr), (unsigned long long *)(po), \
> > + (unsigned long long)(n)))
> > #endif
> >
> > static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
> > @@ -70,6 +73,25 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
> > return prev;
> > }
> >
> > +static inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *pold, u64 new)
> > +{
> > + bool success;
> > + u64 prev;
> > + asm volatile(LOCK_PREFIX "cmpxchg8b %2"
> > + CC_SET(z)
> > + : CC_OUT(z) (success),
> > + "=A" (prev),
> > + "+m" (*ptr)
> > + : "b" ((u32)new),
> > + "c" ((u32)(new >> 32)),
> > + "1" (*pold)
> > + : "memory");
> > +
> > + if (unlikely(!success))
> > + *pold = prev;
>
> I would prefer this be more like the existing try_cmpxchg code,
> perhaps:
>
> u64 old = *pold;
>
> asm volatile (LOCK_PREFIX "cmpxchg8b %[ptr]"
> CC_SET(z)
> : CC_OUT(z) (success),
> [ptr] "+m" (*ptr)
> "+A" (old)
> : "b" ((u32)new)
> "c" ((u32)(new >> 32))
> : "memory");
>
> if (unlikely(!success))
> *pold = old;
>
> The existing 32bit cmpxchg code is a 'bit' crusty.

I was trying to follow the existing __cmpxchg64 as much as possible,
with the intention of a follow-up patch that would modernize
everything in cmpxchg_32.h. I can surely go the other way and submit
modernized new code.

> > + return success;
> > +}
> > +
> > #ifndef CONFIG_X86_CMPXCHG64
> > /*
> > * Building a kernel capable running on 80386 and 80486. It may be necessary
> > @@ -108,6 +130,27 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
> > : "memory"); \
> > __ret; })
> >
> > +#define arch_try_cmpxchg64(ptr, po, n) \
> > +({ \
> > + bool success; \
> > + __typeof__(*(ptr)) __prev; \
> > + __typeof__(ptr) _old = (__typeof__(ptr))(po); \
> > + __typeof__(*(ptr)) __old = *_old; \
> > + __typeof__(*(ptr)) __new = (n); \
> > + alternative_io(LOCK_PREFIX_HERE \
> > + "call cmpxchg8b_emu", \
> > + "lock; cmpxchg8b (%%esi)" , \
> > + X86_FEATURE_CX8, \
> > + "=A" (__prev), \
> > + "S" ((ptr)), "0" (__old), \
> > + "b" ((unsigned int)__new), \
> > + "c" ((unsigned int)(__new>>32)) \
> > + : "memory"); \
> > + success = (__prev == __old); \
> > + if (unlikely(!success)) \
> > + *_old = __prev; \
> > + likely(success); \
> > +})
>
> Wouldn't this be better written like the normal fallback wrapper?
>
> static __always_inline bool
> arch_try_cmpxchg64(u64 *v, u64 *old, u64 new)
> {
> u64 r, o = *old;
> r = arch_cmpxchg64(v, o, new);
> if (unlikely(r != o))
> *old = r;
> return likely(r == o);
> }
>
> Less magical, same exact code.

Also, I tried to follow the existing #defines. I will improve the
code according to your suggestion here.

Thanks,
Uros.

2022-05-14 02:10:10

by Uros Bizjak

Subject: Re: [PATCH] locking/atomic/x86: Introduce try_cmpxchg64

On Fri, May 13, 2022 at 12:20 PM Uros Bizjak <[email protected]> wrote:

> > > +#define arch_try_cmpxchg64(ptr, po, n) \
> > > +({ \
> > > + bool success; \
> > > + __typeof__(*(ptr)) __prev; \
> > > + __typeof__(ptr) _old = (__typeof__(ptr))(po); \
> > > + __typeof__(*(ptr)) __old = *_old; \
> > > + __typeof__(*(ptr)) __new = (n); \
> > > + alternative_io(LOCK_PREFIX_HERE \
> > > + "call cmpxchg8b_emu", \
> > > + "lock; cmpxchg8b (%%esi)" , \
> > > + X86_FEATURE_CX8, \
> > > + "=A" (__prev), \
> > > + "S" ((ptr)), "0" (__old), \
> > > + "b" ((unsigned int)__new), \
> > > + "c" ((unsigned int)(__new>>32)) \
> > > + : "memory"); \
> > > + success = (__prev == __old); \
> > > + if (unlikely(!success)) \
> > > + *_old = __prev; \
> > > + likely(success); \
> > > +})
> >
> > Wouldn't this be better written like the normal fallback wrapper?
> >
> > static __always_inline bool
> > arch_try_cmpxchg64(u64 *v, u64 *old, u64 new)
> > {
> > u64 r, o = *old;
> > r = arch_cmpxchg64(v, o, new);
> > if (unlikely(r != o))
> > *old = r;
> > return likely(r == o);
> > }
> >
> > Less magical, same exact code.
>
> Also, I tried to follow up the existing #defines. Will improve the
> code according to your suggestion here.

In the v2 patch, generic fallbacks were introduced, so that
arch_try_cmpxchg64 can be used when only arch_cmpxchg64 is defined.
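
A minimal sketch of such a fallback, modeled on the existing try_cmpxchg
fallbacks in include/linux/atomic/atomic-arch-fallback.h (the names the
generator actually emits may differ), would be:

	#ifndef arch_try_cmpxchg64
	#define arch_try_cmpxchg64(_ptr, _oldp, _new) \
	({ \
		typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
		___r = arch_cmpxchg64((_ptr), ___o, (_new)); \
		if (unlikely(___r != ___o)) \
			*___op = ___r; \
		likely(___r == ___o); \
	})
	#endif /* arch_try_cmpxchg64 */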

Uros.

2022-05-16 23:52:16

by Maxim Levitsky

Subject: Re: [PATCH] locking/atomic/x86: Introduce try_cmpxchg64

On Mon, 2022-05-16 at 14:08 +0000, Sean Christopherson wrote:
> On Mon, May 16, 2022, Maxim Levitsky wrote:
> > On Wed, 2022-05-11 at 21:54 +0200, Uros Bizjak wrote:
> > > On Wed, May 11, 2022 at 6:04 PM Sean Christopherson <[email protected]> wrote:
> > > > On Wed, May 11, 2022, Uros Bizjak wrote:
> > > > > On Wed, May 11, 2022 at 9:54 AM Peter Zijlstra <[email protected]> wrote:
> > > > > > Still, does 32bit actually support that stuff?
> > > > >
> > > > > Unfortunately, it does:
> > > > >
> > > > > kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
> > > > > vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
> > > > >
> > > > > And when existing cmpxchg64 is substituted with cmpxchg, the
> > > > > compilation dies for 32bits with:
> > > >
> > > > ...
> > > >
> > > > > > Anyway, your patch looks about right, but I find it *really* hard to
> > > > > > care about 32bit code these days.
> > > > >
> > > > > Thanks, this is also my sentiment, but I hope the patch will enable
> > > > > better code and perhaps ease similar situation I have had elsewhere.
> > > >
> > > > IMO, if we merge this it should be solely on the benefits to 64-bit code. Yes,
> > > > KVM still supports 32-bit kernels, but I'm fairly certain the only people that
> > > > run 32-bit KVM are KVM developers. 32-bit KVM has been completely broken for
> > > > multiple releases at least once, maybe twice, and no one ever complained.
> > >
> > > Yes, the idea was to improve cmpxchg64 with the implementation of
> > > try_cmpxchg64 for 64bit targets. However, the issue with 32bit targets
> > > stood in the way, so the effort with 32-bit implementation was mainly
> > > to unblock progression for 64-bit targets.
> >
> > Would that allow tdp mmu to work on 32 bit?
>
> From a purely technical perspective, there's nothing that prevents enabling the
> TDP MMU on 32-bit kernels. The TDP MMU is 64-bit only to simplify the implementation
> and to reduce the maintenance and validation costs.
>

I understand exactly that, so the question is: will this patch help make the tdp mmu work
transparently on 32 bit kernels? I heard that 64 bit cmpxchg was one of the main reasons
that it is 64 bit only.

I am asking because there was some talk of eliminating the direct mode from the legacy non-tdp mmu,
which would simplify its code by a lot, but then it would make 32 bit kernels fall back to the shadowing mmu.

I know that nobody needs 32 bit KVM host support, but it is useful to be sure that nesting still works, and
doesn't crash the host and such.

Best regards,
Maxim Levitsky


2022-05-17 01:36:44

by Sean Christopherson

Subject: Re: [PATCH] locking/atomic/x86: Introduce try_cmpxchg64

On Mon, May 16, 2022, Maxim Levitsky wrote:
> On Mon, 2022-05-16 at 14:08 +0000, Sean Christopherson wrote:
> > On Mon, May 16, 2022, Maxim Levitsky wrote:
> > > On Wed, 2022-05-11 at 21:54 +0200, Uros Bizjak wrote:
> > > > On Wed, May 11, 2022 at 6:04 PM Sean Christopherson <[email protected]> wrote:
> > > > > On Wed, May 11, 2022, Uros Bizjak wrote:
> > > > > > On Wed, May 11, 2022 at 9:54 AM Peter Zijlstra <[email protected]> wrote:
> > > > > > > Still, does 32bit actually support that stuff?
> > > > > >
> > > > > > Unfortunately, it does:
> > > > > >
> > > > > > kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
> > > > > > vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
> > > > > >
> > > > > > And when existing cmpxchg64 is substituted with cmpxchg, the
> > > > > > compilation dies for 32bits with:
> > > > >
> > > > > ...
> > > > >
> > > > > > > Anyway, your patch looks about right, but I find it *really* hard to
> > > > > > > care about 32bit code these days.
> > > > > >
> > > > > > Thanks, this is also my sentiment, but I hope the patch will enable
> > > > > > better code and perhaps ease similar situation I have had elsewhere.
> > > > >
> > > > > IMO, if we merge this it should be solely on the benefits to 64-bit code. Yes,
> > > > > KVM still supports 32-bit kernels, but I'm fairly certain the only people that
> > > > > run 32-bit KVM are KVM developers. 32-bit KVM has been completely broken for
> > > > > multiple releases at least once, maybe twice, and no one ever complained.
> > > >
> > > > Yes, the idea was to improve cmpxchg64 with the implementation of
> > > > try_cmpxchg64 for 64bit targets. However, the issue with 32bit targets
> > > > stood in the way, so the effort with 32-bit implementation was mainly
> > > > to unblock progression for 64-bit targets.
> > >
> > > Would that allow tdp mmu to work on 32 bit?
> >
> > From a purely technical perspective, there's nothing that prevents enabling the
> > TDP MMU on 32-bit kernels. The TDP MMU is 64-bit only to simplify the implementation
> > and to reduce the maintenance and validation costs.
>
> I understand exactly that, so the question, will this patch help make the tdp
> mmu work transparently on 32 bit kernels? I heard that 64 bit cmpxchg was
> one of the main reasons that it is 64 bit only.

I don't think it moves the needle much, e.g. non-atomic 64-bit accesses are still
problematic, and we'd have to update the TDP MMU to deal with PAE paging (thanks
NPT). All those problems are solvable, it's purely a matter of the ongoing costs
to solve them.

> I am asking because there was some talk to eliminate the direct mode from the
> legacy non tdp mmu, which would simplify its code by a lot, but then it will
> make 32 bit kernel fail back to shadowing mmu.

Simplify which code? Between the nonpaging code and direct shadow pages in
indirect MMUs, the vast majority of the "direct" support in the legacy MMU needs
to be kept even if TDP support is dropped. And the really nasty stuff, e.g. PAE
roots, would need to be carried over to the TDP MMU.

2022-05-17 03:10:18

by Maxim Levitsky

Subject: Re: [PATCH] locking/atomic/x86: Introduce try_cmpxchg64

On Mon, 2022-05-16 at 15:14 +0000, Sean Christopherson wrote:
> On Mon, May 16, 2022, Maxim Levitsky wrote:
> > On Mon, 2022-05-16 at 14:08 +0000, Sean Christopherson wrote:
> > > On Mon, May 16, 2022, Maxim Levitsky wrote:
> > > > On Wed, 2022-05-11 at 21:54 +0200, Uros Bizjak wrote:
> > > > > On Wed, May 11, 2022 at 6:04 PM Sean Christopherson <[email protected]> wrote:
> > > > > > On Wed, May 11, 2022, Uros Bizjak wrote:
> > > > > > > On Wed, May 11, 2022 at 9:54 AM Peter Zijlstra <[email protected]> wrote:
> > > > > > > > Still, does 32bit actually support that stuff?
> > > > > > >
> > > > > > > Unfortunately, it does:
> > > > > > >
> > > > > > > kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
> > > > > > > vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
> > > > > > >
> > > > > > > And when existing cmpxchg64 is substituted with cmpxchg, the
> > > > > > > compilation dies for 32bits with:
> > > > > >
> > > > > > ...
> > > > > >
> > > > > > > > Anyway, your patch looks about right, but I find it *really* hard to
> > > > > > > > care about 32bit code these days.
> > > > > > >
> > > > > > > Thanks, this is also my sentiment, but I hope the patch will enable
> > > > > > > better code and perhaps ease similar situation I have had elsewhere.
> > > > > >
> > > > > > IMO, if we merge this it should be solely on the benefits to 64-bit code. Yes,
> > > > > > KVM still supports 32-bit kernels, but I'm fairly certain the only people that
> > > > > > run 32-bit KVM are KVM developers. 32-bit KVM has been completely broken for
> > > > > > multiple releases at least once, maybe twice, and no one ever complained.
> > > > >
> > > > > Yes, the idea was to improve cmpxchg64 with the implementation of
> > > > > try_cmpxchg64 for 64bit targets. However, the issue with 32bit targets
> > > > > stood in the way, so the effort with 32-bit implementation was mainly
> > > > > to unblock progression for 64-bit targets.
> > > >
> > > > Would that allow tdp mmu to work on 32 bit?
> > >
> > > From a purely technical perspective, there's nothing that prevents enabling the
> > > TDP MMU on 32-bit kernels. The TDP MMU is 64-bit only to simplify the implementation
> > > and to reduce the maintenance and validation costs.
> >
> > I understand exactly that, so the question, will this patch help make the tdp
> > mmu work transparently on 32 bit kernels? I heard that 64 bit cmpxchg was
> > one of the main reasons that it is 64 bit only.
>
> I don't think it moves the needled much, e.g. non-atomic 64-bit accesses are still
> problematic, and we'd have to update the TDP MMU to deal with PAE paging (thanks
> NPT). All those problems are solvable, it's purely a matter of the ongoing costs
> to solve them.
>
> > I am asking because there was some talk to eliminate the direct mode from the
> > legacy non tdp mmu, which would simplify its code by a lot, but then it will
> > make 32 bit kernel fail back to shadowing mmu.
>
> Simplify which code? Between the nonpaging code and direct shadow pages in
> indirect MMUs, the vast majority of the "direct" support in the legacy MMU needs
> to be kept even if TDP support is dropped. And the really nasty stuff, e.g. PAE
> roots, would need to be carried over to the TDP MMU.
>

I guess this makes sense. I haven't researched the code well enough to know the exact answer.
I was just curious if this patch makes any difference :)

Thanks!

Best regards,
Maxim Levitsky