2022-12-19 15:47:09

by Peter Zijlstra

Subject: [RFC][PATCH 11/12] slub: Replace cmpxchg_double()


Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
 include/linux/slub_def.h |  12 ++-
 mm/slab.h                |  41 +++++++++++--
 mm/slub.c                | 146 ++++++++++++++++++++++++++++-------------------
 3 files changed, 135 insertions(+), 64 deletions(-)

--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -39,15 +39,21 @@ enum stat_item {
CPU_PARTIAL_FREE, /* Refill cpu partial on free */
CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */
CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */
- NR_SLUB_STAT_ITEMS };
+ NR_SLUB_STAT_ITEMS
+};

/*
* When changing the layout, make sure freelist and tid are still compatible
* with this_cpu_cmpxchg_double() alignment requirements.
*/
struct kmem_cache_cpu {
- void **freelist; /* Pointer to next available object */
- unsigned long tid; /* Globally unique transaction id */
+ union {
+ struct {
+ void **freelist; /* Pointer to next available object */
+ unsigned long tid; /* Globally unique transaction id */
+ };
+ freelist_aba_t freelist_tid;
+ };
struct slab *slab; /* The slab from which we are allocating */
#ifdef CONFIG_SLUB_CPU_PARTIAL
struct slab *partial; /* Partially allocated frozen slabs */
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -5,6 +5,32 @@
* Internal slab definitions
*/

+/*
+ * Freelist pointer and counter to cmpxchg together, avoids the typical ABA
+ * problems with cmpxchg of just a pointer.
+ */
+typedef union {
+ struct {
+ void *freelist;
+ unsigned long counter;
+ };
+#ifdef CONFIG_64BIT
+ u128 full;
+#else
+ u64 full;
+#endif
+} freelist_aba_t;
+
+#ifdef CONFIG_64BIT
+# ifdef system_has_cmpxchg128
+# define system_has_freelist_aba() system_has_cmpxchg128()
+# endif
+#else /* CONFIG_64BIT */
+# ifdef system_has_cmpxchg64
+# define system_has_freelist_aba() system_has_cmpxchg64()
+# endif
+#endif /* CONFIG_64BIT */
+
/* Reuses the bits in struct page */
struct slab {
unsigned long __page_flags;
@@ -34,14 +60,19 @@ struct slab {
};
struct kmem_cache *slab_cache;
/* Double-word boundary */
- void *freelist; /* first free object */
union {
- unsigned long counters;
struct {
- unsigned inuse:16;
- unsigned objects:15;
- unsigned frozen:1;
+ void *freelist; /* first free object */
+ union {
+ unsigned long counters;
+ struct {
+ unsigned inuse:16;
+ unsigned objects:15;
+ unsigned frozen:1;
+ };
+ };
};
+ freelist_aba_t freelist_counter;
};
unsigned int __unused;

--- a/mm/slub.c
+++ b/mm/slub.c
@@ -280,7 +280,13 @@ static inline bool kmem_cache_has_cpu_pa
/* Poison object */
#define __OBJECT_POISON ((slab_flags_t __force)0x80000000U)
/* Use cmpxchg_double */
+
+#if defined(system_has_freelist_aba) && \
+ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U)
+#else
+#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0U)
+#endif

/*
* Tracking user of a slab.
@@ -496,6 +502,47 @@ static __always_inline void slab_unlock(
__bit_spin_unlock(PG_locked, &page->flags);
}

+static inline bool
+__update_freelist_fast(struct slab *slab,
+ void *freelist_old, unsigned long counters_old,
+ void *freelist_new, unsigned long counters_new)
+{
+
+ bool ret = false;
+
+#ifdef system_has_freelist_aba
+ freelist_aba_t old = { .freelist = freelist_old, .counter = counters_old };
+ freelist_aba_t new = { .freelist = freelist_new, .counter = counters_new };
+
+#ifdef CONFIG_64BIT
+ ret = try_cmpxchg128(&slab->freelist_counter.full, &old.full, new.full);
+#else
+ ret = try_cmpxchg64(&slab->freelist_counter.full, &old.full, new.full);
+#endif
+#endif /* system_has_freelist_aba */
+
+ return ret;
+}
+
+static inline bool
+__update_freelist_slow(struct slab *slab,
+ void *freelist_old, unsigned long counters_old,
+ void *freelist_new, unsigned long counters_new)
+{
+ bool ret = false;
+
+ slab_lock(slab);
+ if (slab->freelist == freelist_old &&
+ slab->counters == counters_old) {
+ slab->freelist = freelist_new;
+ slab->counters = counters_new;
+ ret = true;
+ }
+ slab_unlock(slab);
+
+ return ret;
+}
+
/*
* Interrupts must be disabled (for the fallback code to work right), typically
* by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is
@@ -503,33 +550,25 @@ static __always_inline void slab_unlock(
* allocation/ free operation in hardirq context. Therefore nothing can
* interrupt the operation.
*/
-static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
+static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab,
void *freelist_old, unsigned long counters_old,
void *freelist_new, unsigned long counters_new,
const char *n)
{
+ bool ret;
+
if (USE_LOCKLESS_FAST_PATH())
lockdep_assert_irqs_disabled();
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
- defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+
if (s->flags & __CMPXCHG_DOUBLE) {
- if (cmpxchg_double(&slab->freelist, &slab->counters,
- freelist_old, counters_old,
- freelist_new, counters_new))
- return true;
- } else
-#endif
- {
- slab_lock(slab);
- if (slab->freelist == freelist_old &&
- slab->counters == counters_old) {
- slab->freelist = freelist_new;
- slab->counters = counters_new;
- slab_unlock(slab);
- return true;
- }
- slab_unlock(slab);
+ ret = __update_freelist_fast(slab, freelist_old, counters_old,
+ freelist_new, counters_new);
+ } else {
+ ret = __update_freelist_slow(slab, freelist_old, counters_old,
+ freelist_new, counters_new);
}
+ if (likely(ret))
+ return true;

cpu_relax();
stat(s, CMPXCHG_DOUBLE_FAIL);
@@ -541,36 +580,26 @@ static inline bool __cmpxchg_double_slab
return false;
}

-static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
+static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab,
void *freelist_old, unsigned long counters_old,
void *freelist_new, unsigned long counters_new,
const char *n)
{
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
- defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+ bool ret;
+
if (s->flags & __CMPXCHG_DOUBLE) {
- if (cmpxchg_double(&slab->freelist, &slab->counters,
- freelist_old, counters_old,
- freelist_new, counters_new))
- return true;
- } else
-#endif
- {
+ ret = __update_freelist_fast(slab, freelist_old, counters_old,
+ freelist_new, counters_new);
+ } else {
unsigned long flags;

local_irq_save(flags);
- slab_lock(slab);
- if (slab->freelist == freelist_old &&
- slab->counters == counters_old) {
- slab->freelist = freelist_new;
- slab->counters = counters_new;
- slab_unlock(slab);
- local_irq_restore(flags);
- return true;
- }
- slab_unlock(slab);
+ ret = __update_freelist_slow(slab, freelist_old, counters_old,
+ freelist_new, counters_new);
local_irq_restore(flags);
}
+ if (likely(ret))
+ return true;

cpu_relax();
stat(s, CMPXCHG_DOUBLE_FAIL);
@@ -2168,7 +2197,7 @@ static inline void *acquire_slab(struct
VM_BUG_ON(new.frozen);
new.frozen = 1;

- if (!__cmpxchg_double_slab(s, slab,
+ if (!__slab_update_freelist(s, slab,
freelist, counters,
new.freelist, new.counters,
"acquire_slab"))
@@ -2500,7 +2529,7 @@ static void deactivate_slab(struct kmem_
}


- if (!cmpxchg_double_slab(s, slab,
+ if (!slab_update_freelist(s, slab,
old.freelist, old.counters,
new.freelist, new.counters,
"unfreezing slab")) {
@@ -2561,7 +2590,7 @@ static void __unfreeze_partials(struct k

new.frozen = 0;

- } while (!__cmpxchg_double_slab(s, slab,
+ } while (!__slab_update_freelist(s, slab,
old.freelist, old.counters,
new.freelist, new.counters,
"unfreezing slab"));
@@ -3022,7 +3051,7 @@ static inline void *get_freelist(struct
new.inuse = slab->objects;
new.frozen = freelist != NULL;

- } while (!__cmpxchg_double_slab(s, slab,
+ } while (!__slab_update_freelist(s, slab,
freelist, counters,
NULL, new.counters,
"get_freelist"));
@@ -3295,6 +3324,18 @@ static __always_inline void maybe_wipe_o
0, sizeof(void *));
}

+static inline bool
+__update_cpu_freelist_fast(struct kmem_cache *s,
+ void *freelist_old, void *freelist_new,
+ unsigned long tid)
+{
+ freelist_aba_t old = { .freelist = freelist_old, .counter = tid };
+ freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) };
+
+ return this_cpu_cmpxchg(s->cpu_slab->freelist_tid.full,
+ old.full, new.full) == old.full;
+}
+
/*
* Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
* have the fastpath folded into their functions. So no function call
@@ -3379,11 +3420,7 @@ static __always_inline void *slab_alloc_
* against code executing on this cpu *not* from access by
* other cpus.
*/
- if (unlikely(!this_cpu_cmpxchg_double(
- s->cpu_slab->freelist, s->cpu_slab->tid,
- object, tid,
- next_object, next_tid(tid)))) {
-
+ if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
note_cmpxchg_failure("slab_alloc", s, tid);
goto redo;
}
@@ -3517,7 +3554,7 @@ static void __slab_free(struct kmem_cach
}
}

- } while (!cmpxchg_double_slab(s, slab,
+ } while (!slab_update_freelist(s, slab,
prior, counters,
head, new.counters,
"__slab_free"));
@@ -3621,11 +3658,7 @@ static __always_inline void do_slab_free

set_freepointer(s, tail_obj, freelist);

- if (unlikely(!this_cpu_cmpxchg_double(
- s->cpu_slab->freelist, s->cpu_slab->tid,
- freelist, tid,
- head, next_tid(tid)))) {
-
+ if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) {
note_cmpxchg_failure("slab_free", s, tid);
goto redo;
}
@@ -4319,11 +4352,12 @@ static int kmem_cache_open(struct kmem_c
}
}

-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
+#if defined(system_has_freelist_aba) && \
defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
- if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0)
+ if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) {
/* Enable fast mode */
s->flags |= __CMPXCHG_DOUBLE;
+ }
#endif

/*

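For context, a minimal userspace sketch of the pointer+counter scheme that
freelist_aba_t implements (not part of the patch; assumes a 64-bit
little-endian target with GCC/Clang __int128 and the __atomic builtins,
built with -mcx16 or linked against libatomic):

typedef union {
	struct {
		void *freelist;
		unsigned long counter;
	};
	unsigned __int128 full;
} aba_t;

/*
 * Compare pointer and generation counter as one 16-byte unit and bump
 * the counter on every update: a freed-and-reallocated object can come
 * back with the same pointer value, but not with the same counter, so
 * a stale (pointer, counter) pair no longer passes the cmpxchg.
 */
static _Bool update_freelist(aba_t *v, void *old_ptr, unsigned long old_cnt,
			     void *new_ptr)
{
	aba_t old = { .freelist = old_ptr, .counter = old_cnt     };
	aba_t new = { .freelist = new_ptr, .counter = old_cnt + 1 };

	return __atomic_compare_exchange_n(&v->full, &old.full, new.full,
					   0, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
}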


2023-01-03 15:59:47

by Vlastimil Babka

Subject: Re: [RFC][PATCH 11/12] slub: Replace cmpxchg_double()

On 12/19/22 16:35, Peter Zijlstra wrote:
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>

LGTM and a nice cleanup also, thanks!
Acked-by: Vlastimil Babka <[email protected]>


2023-01-03 17:30:18

by Heiko Carstens

Subject: Re: [RFC][PATCH 11/12] slub: Replace cmpxchg_double()

On Mon, Dec 19, 2022 at 04:35:36PM +0100, Peter Zijlstra wrote:
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> include/linux/slub_def.h | 12 ++-
> mm/slab.h | 41 +++++++++++--
> mm/slub.c | 146 ++++++++++++++++++++++++++++-------------------
> 3 files changed, 135 insertions(+), 64 deletions(-)

Does this actually work? Just wondering, since I end up with an instant
list corruption on s390. Might be endianness related, but I can't see
anything obvious at first glance.

2023-01-03 19:13:05

by Linus Torvalds

Subject: Re: [RFC][PATCH 11/12] slub: Replace cmpxchg_double()

On Tue, Jan 3, 2023 at 9:17 AM Heiko Carstens <[email protected]> wrote:
>
> On Mon, Dec 19, 2022 at 04:35:36PM +0100, Peter Zijlstra wrote:
> >
> > Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> > ---
> > include/linux/slub_def.h | 12 ++-
> > mm/slab.h | 41 +++++++++++--
> > mm/slub.c | 146 ++++++++++++++++++++++++++++-------------------
> > 3 files changed, 135 insertions(+), 64 deletions(-)
>
> Does this actually work? Just wondering, since I end up with an instant
> list corruption on s390. Might be endianness related, but I can't see
> anything obvious at first glance.

I don't see anything that looks related to endianness, because while
there is that 128-bit union member, it's always either used in full,
or it's accessed as other union members.

But I *do* note that this patch seems to be the only one that depends
on the new this_cpu_cmpxchg() updates to make it just automatically do
the right thing for a 128-bit value. And I have to admit that all
those games with __pcpu_cast_128() make no sense to me. Why isn't it
just using "u128" everywhere without any odd _Generic() games?

I could also easily see that if the asm constraints are wrong (like
the "cast pointer to (unsigned long *) instead of keeping it pointing
to a 128-bit type" thing discussed earlier), then code like this:

+ freelist_aba_t old = { .freelist = freelist_old, .counter = tid };
+ freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) };
+
+ return this_cpu_cmpxchg(s->cpu_slab->freelist_tid.full,
+ old.full, new.full) == old.full;

would easily make the compiler go "the second word of 'old' is never
used by the asm, so I won't initialize it".

But yeah, that patch is hard to read, so hard to say. Does everything
leading up to it work fine?

Linus
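
To make the failure mode described above concrete, here is a deliberately
buggy, hypothetical variant of such a wrapper (made-up helper, for
illustration only; not the actual percpu code): the 16-byte memory operand
is declared through an (unsigned long *) cast, so the compiler is told the
asm touches only 8 of the 16 bytes and is free to cache, reorder or drop
accesses to the upper half around it:

union u128u {
	unsigned __int128 full;
	struct { unsigned long low, high; };
};

static inline _Bool bad_try_cmpxchg128(union u128u *ptr, union u128u *old,
					union u128u new)
{
	_Bool ok;

	asm volatile("lock cmpxchg16b %[var]"
		     /* BUG: operand declared as 8 bytes, insn accesses 16 */
		     : [var] "+m" (*(unsigned long *)ptr),
		       "=@ccz" (ok),
		       "+a" (old->low), "+d" (old->high)
		     : "b" (new.low), "c" (new.high));
	return ok;
}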

2023-01-04 12:09:46

by Heiko Carstens

Subject: Re: [RFC][PATCH 11/12] slub: Replace cmpxchg_double()

On Tue, Jan 03, 2023 at 11:08:29AM -0800, Linus Torvalds wrote:
> On Tue, Jan 3, 2023 at 9:17 AM Heiko Carstens <[email protected]> wrote:
> >
> > On Mon, Dec 19, 2022 at 04:35:36PM +0100, Peter Zijlstra wrote:
> > >
> > > Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> > > ---
> > > include/linux/slub_def.h | 12 ++-
> > > mm/slab.h | 41 +++++++++++--
> > > mm/slub.c | 146 ++++++++++++++++++++++++++++-------------------
> > > 3 files changed, 135 insertions(+), 64 deletions(-)
> >
> > Does this actually work? Just wondering, since I end up with an instant
> > list corruption on s390. Might be endianness related, but I can't see
> > anything obvious at first glance.
...
> the right thing for a 128-bit value. And I have to admit that all
> those games with __pcpu_cast_128() make no sense to me. Why isn't it
> just using "u128" everywhere without any odd _Generic() games?

That would have been my question as well, but the good thing is that
you pointed me to the percpu patch - initially I didn't expect any
s390-specific code in there, but that is where the bug is.
I'll reply to that patch.

2023-01-09 16:32:37

by Peter Zijlstra

Subject: Re: [RFC][PATCH 11/12] slub: Replace cmpxchg_double()

On Tue, Jan 03, 2023 at 11:08:29AM -0800, Linus Torvalds wrote:

> But I *do* note that this patch seems to be the only one that depends
> on the new this_cpu_cmpxchg() updates to make it just automatically do
> the right thing for a 128-bit value. And I have to admit that all
> those games with __pcpu_cast_128() make no sense to me. Why isn't it
> just using "u128" everywhere without any odd _Generic() games?

I ran into a ton of casting trouble when compiling kernel/fork.c which
uses this_cpu_cmpxchg() on a pointer type and the compiler hates casting
pointers to an integer that is not the exact same size.
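
The shape of the problem (hypothetical snippet, GCC on x86-64): the
this_cpu_*() macros expand to a sizeof() switch containing every size
case, so a pointer operand reaches the 128-bit case in source form even
though that case is never selected:

static void example(void *p)
{
	unsigned long ok = (unsigned long)p;	/* same width: fine */
	unsigned __int128 bad = (unsigned __int128)p;
				/* "cast from pointer to integer of
				   different size" */
	unsigned __int128 fix = (unsigned __int128)(unsigned long)p;
				/* the usual two-step idiom */

	(void)ok; (void)bad; (void)fix;
}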

2023-01-09 22:28:24

by H. Peter Anvin

Subject: Re: [RFC][PATCH 11/12] slub: Replace cmpxchg_double()

On January 9, 2023 2:02:33 PM PST, Linus Torvalds <[email protected]> wrote:
>On Mon, Jan 9, 2023 at 10:29 AM Peter Zijlstra <[email protected]> wrote:
>>
>> I ran into a ton of casting trouble when compiling kernel/fork.c which
>> uses this_cpu_cmpxchg() on a pointer type and the compiler hates casting
>> pointers to an integer that is not the exact same size.
>
>Ahh. Yeah - not because that code needs or wants the 128-bit case, but
>because the macro expands to all sizes in a switch statement, so the
>compiler sees all the cases even if only one is then statically
>picked.
>
>So the silly casts are for all the cases that never matter.
>
>Annoying.
>
>I wonder if the "this_cpu_cmpxchg()" macro could be made to use
>_Generic() to pick out the pointer case first, and then only use
>'sizeof()' for the integer types, so that we don't have this kind of
>"every architecture needs to deal with the nasty situation" code.
>
>Ok, it's not actually the this_cpu_cmpxchg() macro, it's
>__pcpu_size_call_return() and friends, but whatever.
>
>Another alternative is to try to avoid casting to "u64" as long as
>humanly possible, and use only "typeof((*ptr))" everywhere. Then when
>the type actually *is* 128-bit, it all works out fine, because it
>won't be a pointer. That's the approach the uaccess macros tend to
>take, and then they hit the reverse issue on clang, where using the
>"byte register" constraints would cause warnings for non-byte
>accesses, and we had to do
>
> unsigned char x_u8__;
> __get_user_asm(x_u8__, ptr, "b", "=q", label);
> (x) = x_u8__;
>
>because using '(x)' directly would then warn when 'x' wasn't a
>char-sized thing - even if that asm case never actually was _used_ for
>that case, since it was all inside a "switch (sizeof) case 1:"
>statement.
>
> Linus

I wrote a crazy macro for dealing with exactly this at one point, basically producing the "right type" to cast to. It would need to have 128-bit support added to it, but that should be trivial. It is called something like int_type() ... not in front of a computer right now so can't double check.
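
One illustrative way to do what the quoted mail suggests - keep pointers
out of the integer cases entirely - is a compile-time dispatch along these
lines (a sketch using GCC builtins; the macro name is made up and this is
not what any tree actually carries):

/*
 * __builtin_classify_type() evaluates to 5 (pointer_type_class) for any
 * pointer expression; pointers are converted to unsigned long up front,
 * everything else passes through, so the sizeof() switch never sees a
 * pointer-to-int cast it would warn about.
 */
#define __pcpu_strip_ptr(x)					\
	__builtin_choose_expr(__builtin_classify_type(x) == 5,	\
			      (unsigned long)(x), (x))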

2023-01-10 02:12:47

by H. Peter Anvin

Subject: Re: [RFC][PATCH 11/12] slub: Replace cmpxchg_double()

On 1/9/23 14:22, H. Peter Anvin wrote:
>>
>> Another alternative is to try to avoid casting to "u64" as long as
>> humanly possible, and use only "typeof((*ptr))" everywhere. Then when
>> the type actually *is* 128-bit, it all works out fine, because it
>> won't be a pointer. That's the approach the uaccess macros tend to
>> take, and then they hit the reverse issue on clang, where using the
>> "byte register" constraints would cause warnings for non-byte
>> accesses, and we had to do
>>
>> unsigned char x_u8__;
>> __get_user_asm(x_u8__, ptr, "b", "=q", label);
>> (x) = x_u8__;
>>
>> because using '(x)' directly would then warn when 'x' wasn't a
>> char-sized thing - even if that asm case never actually was _used_ for
>> that case, since it was all inside a "switch (sizeof) case 1:"
>> statement.
>>
>> Linus
>

> I wrote a crazy macro for dealing with exactly this at one point,
> basically producing the "right type" to cast to. It would need to
> have 128-bit support added to it, but that should be trivial. It is
> called something like int_type() ... not in front of a computer right
> now so can't double check.
Right, it is called __inttype and is defined in
arch/x86/include/asm/uaccess.h (and, apparently, a few other
architectures; probably should be centralized).

It has been rewritten since my first version using a nice little macro
called __typefits, which would also be worth centralizing.

-hpa
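
For reference, those helpers look roughly like this (paraphrased from
arch/x86/include/asm/uaccess.h of that era, without the 128-bit case;
the tree is authoritative):

/* Evaluates to (unsigned type)0 if x fits in 'type', else to 'not'. */
#define __typefits(x, type, not)				\
	__builtin_choose_expr(sizeof(x) <= sizeof(type),	\
			      (unsigned type)0, not)

/* The smallest standard unsigned integer type that can hold x. */
#define __inttype(x)						\
	__typeof__(__typefits(x, char,				\
		   __typefits(x, short,				\
		   __typefits(x, int,				\
		   __typefits(x, long, 0ULL)))))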

2023-01-10 10:36:16

by Peter Zijlstra

Subject: Re: [RFC][PATCH 11/12] slub: Replace cmpxchg_double()

On Mon, Jan 09, 2023 at 04:02:33PM -0600, Linus Torvalds wrote:
> On Mon, Jan 9, 2023 at 10:29 AM Peter Zijlstra <[email protected]> wrote:
> >
> > I ran into a ton of casting trouble when compiling kernel/fork.c which
> > uses this_cpu_cmpxchg() on a pointer type and the compiler hates casting
> > pointers to an integer that is not the exact same size.
>
> Ahh. Yeah - not because that code needs or wants the 128-bit case, but
> because the macro expands to all sizes in a switch statement, so the
> compiler sees all the cases even if only one is then statically
> picked.
>
> So the silly casts are for all the cases that never matter.
>
> Annoying.

Yes, very.

This seems to compile (and boot). Let me go update the others and push
it out for the robots to have a go.

#define percpu_cmpxchg128_op(size, qual, _var, _oval, _nval)	\
({								\
	union {							\
		typeof(_var) full;				\
		struct {					\
			u64 low, high;				\
		};						\
	} old__, new__;						\
								\
	old__.full = _oval;					\
	new__.full = _nval;					\
								\
	asm qual ("cmpxchg16b " __percpu_arg([var])		\
		  : [var] "+m" (_var),				\
		    "+a" (old__.low),				\
		    "+d" (old__.high)				\
		  : "b" (new__.low),				\
		    "c" (new__.high)				\
		  : "memory");					\
								\
	old__.full;						\
})
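
Because the union overlays typeof(_var) with the low/high register pair,
the caller's own 128-bit type flows through and no cast to u128 is needed.
Presumably it then backs the generic operation along these lines (a
sketch; the final wiring may differ):

#define this_cpu_cmpxchg128(pcp, oval, nval)			\
	percpu_cmpxchg128_op(16, volatile, pcp, oval, nval)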