2024-04-15 15:21:32

by Mathieu Desnoyers

Subject: [PATCH 0/2] sched: rseq mm_cid updates

Hi,

This patch series consists of 2 patches. It is based on v6.9-rc4.

- A fix aiming for v6.9-rc (to be backported to stable kernels):
"sched: Add missing memory barrier in switch_mm_cid"

- An improvement patch aiming for v6.10:
"sched: Move mm_cid code from sched.h to core.c"

Thanks,

Mathieu

Cc: Steven Rostedt <[email protected]>
Cc: Vincent Guittot <[email protected]>
Cc: Juri Lelli <[email protected]>
Cc: Dietmar Eggemann <[email protected]>
Cc: Ben Segall <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Daniel Bristot de Oliveira <[email protected]>
Cc: Valentin Schneider <[email protected]>
Cc: levi.yun <[email protected]>
Cc: Mathieu Desnoyers <[email protected]>
Cc: Catalin Marinas <[email protected]>
Cc: Mark Rutland <[email protected]>
Cc: Will Deacon <[email protected]>
Cc: Aaron Lu <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Arnd Bergmann <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]

Mathieu Desnoyers (2):
sched: Add missing memory barrier in switch_mm_cid
sched: Move mm_cid code from sched.h to core.c

arch/x86/include/asm/barrier.h | 3 +
include/asm-generic/barrier.h | 8 +
kernel/sched/core.c | 277 ++++++++++++++++++++++++++++++---
kernel/sched/sched.h | 237 +---------------------------
4 files changed, 270 insertions(+), 255 deletions(-)

--
2.39.2


2024-04-15 15:22:28

by Mathieu Desnoyers

Subject: [PATCH 2/2] sched: Move mm_cid code from sched.h to core.c

The mm_cid code in sched/sched.h is only used from sched/core.c. Move it
to the compilation unit where it belongs.

While reviewing the mm_cid functions that were already in sched/core.c, I
noticed that a few of them are non-static even though they are only used
from core.c. Make those functions static inline. For the sake of
consistency, mm_cid functions that were only marked "static" are now
marked "static inline". The variables cid_lock and use_cid_lock are only
used from core.c, so mark them static.

Changing the following from non-static to static inline:

- sched_mm_cid_migrate_from
- init_sched_mm_cid
- task_tick_mm_cid

and marking the following static inline as well:

- __sched_mm_cid_migrate_from_fetch_cid
- __sched_mm_cid_migrate_from_try_steal_cid
- sched_mm_cid_migrate_to
- sched_mm_cid_remote_clear
- sched_mm_cid_remote_clear_old
- sched_mm_cid_remote_clear_weight

slightly reduces the size of sched/core.o on x86-64 (in bytes):

text data
before: 192261 58677
after: 191629 58641
-----------------------------
delta: -632 -36
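
As an illustration of why making locally-used functions static (inline)
can shrink .text (the names below are hypothetical, not kernel code): in
a normal non-LTO build, a non-static function keeps an out-of-line,
externally visible copy even when all of its callers inline it; once it
is static and only called from core.c, the compiler is free to fold it
into its callers and drop the standalone copy.

/* Hypothetical sketch, not kernel code. */

/* Non-static: an out-of-line copy with external linkage is always emitted. */
int example_cid_sanitize(int cid)
{
        return cid < 0 ? -1 : cid;
}

/*
 * static inline and only called locally: the compiler may inline it into
 * its callers and emit no standalone symbol for it at all.
 */
static inline int example_cid_sanitize_local(int cid)
{
        return cid < 0 ? -1 : cid;
}

int example_caller(int cid)
{
        /* The helper above is typically folded in here. */
        return example_cid_sanitize_local(cid);
}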

Signed-off-by: Mathieu Desnoyers <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra <[email protected]>
---
kernel/sched/core.c | 277 +++++++++++++++++++++++++++++++++++++++----
kernel/sched/sched.h | 241 -------------------------------------
2 files changed, 257 insertions(+), 261 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7019a40457a6..57b03d874530 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -457,6 +457,22 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }

#endif /* CONFIG_SCHED_CORE */

+#ifdef CONFIG_SCHED_MM_CID
+static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next);
+static inline void sched_mm_cid_migrate_from(struct task_struct *t);
+static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t);
+static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr);
+static inline void init_sched_mm_cid(struct task_struct *t);
+#else
+static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next) { }
+static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
+static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
+static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
+static inline void init_sched_mm_cid(struct task_struct *t) { }
+#endif
+
/*
* Serialization rules:
*
@@ -11551,6 +11567,9 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)

#ifdef CONFIG_SCHED_MM_CID

+#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */
+#define MM_CID_SCAN_DELAY 100 /* 100ms */
+
/*
* @cid_lock: Guarantee forward-progress of cid allocation.
*
@@ -11558,7 +11577,7 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
* is only used when contention is detected by the lock-free allocation so
* forward progress can be guaranteed.
*/
-DEFINE_RAW_SPINLOCK(cid_lock);
+static DEFINE_RAW_SPINLOCK(cid_lock);

/*
* @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock.
@@ -11569,7 +11588,7 @@ DEFINE_RAW_SPINLOCK(cid_lock);
* completes and sets @use_cid_lock back to 0. This guarantees forward progress
* of a cid allocation.
*/
-int use_cid_lock;
+static int use_cid_lock;

/*
* mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid
@@ -11659,15 +11678,233 @@ int use_cid_lock;
* because this would UNSET a cid which is actively used.
*/

-void sched_mm_cid_migrate_from(struct task_struct *t)
+static inline void __mm_cid_put(struct mm_struct *mm, int cid)
+{
+ if (cid < 0)
+ return;
+ cpumask_clear_cpu(cid, mm_cidmask(mm));
+}
+
+/*
+ * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to
+ * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to
+ * be held to transition to other states.
+ *
+ * State transitions synchronized with cmpxchg or try_cmpxchg need to be
+ * consistent across cpus, which prevents use of this_cpu_cmpxchg.
+ */
+static inline void mm_cid_put_lazy(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
+ int cid;
+
+ lockdep_assert_irqs_disabled();
+ cid = __this_cpu_read(pcpu_cid->cid);
+ if (!mm_cid_is_lazy_put(cid) ||
+ !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
+ return;
+ __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+}
+
+static inline int mm_cid_pcpu_unset(struct mm_struct *mm)
+{
+ struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
+ int cid, res;
+
+ lockdep_assert_irqs_disabled();
+ cid = __this_cpu_read(pcpu_cid->cid);
+ for (;;) {
+ if (mm_cid_is_unset(cid))
+ return MM_CID_UNSET;
+ /*
+ * Attempt transition from valid or lazy-put to unset.
+ */
+ res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET);
+ if (res == cid)
+ break;
+ cid = res;
+ }
+ return cid;
+}
+
+static inline void mm_cid_put(struct mm_struct *mm)
+{
+ int cid;
+
+ lockdep_assert_irqs_disabled();
+ cid = mm_cid_pcpu_unset(mm);
+ if (cid == MM_CID_UNSET)
+ return;
+ __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+}
+
+static inline int __mm_cid_try_get(struct mm_struct *mm)
+{
+ struct cpumask *cpumask;
+ int cid;
+
+ cpumask = mm_cidmask(mm);
+ /*
+ * Retry finding first zero bit if the mask is temporarily
+ * filled. This only happens during concurrent remote-clear
+ * which owns a cid without holding a rq lock.
+ */
+ for (;;) {
+ cid = cpumask_first_zero(cpumask);
+ if (cid < nr_cpu_ids)
+ break;
+ cpu_relax();
+ }
+ if (cpumask_test_and_set_cpu(cid, cpumask))
+ return -1;
+ return cid;
+}
+
+/*
+ * Save a snapshot of the current runqueue time of this cpu
+ * with the per-cpu cid value, allowing to estimate how recently it was used.
+ */
+static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm)
+{
+ struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq));
+
+ lockdep_assert_rq_held(rq);
+ WRITE_ONCE(pcpu_cid->time, rq->clock);
+}
+
+static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm)
+{
+ int cid;
+
+ /*
+ * All allocations (even those using the cid_lock) are lock-free. If
+ * use_cid_lock is set, hold the cid_lock to perform cid allocation to
+ * guarantee forward progress.
+ */
+ if (!READ_ONCE(use_cid_lock)) {
+ cid = __mm_cid_try_get(mm);
+ if (cid >= 0)
+ goto end;
+ raw_spin_lock(&cid_lock);
+ } else {
+ raw_spin_lock(&cid_lock);
+ cid = __mm_cid_try_get(mm);
+ if (cid >= 0)
+ goto unlock;
+ }
+
+ /*
+ * cid concurrently allocated. Retry while forcing following
+ * allocations to use the cid_lock to ensure forward progress.
+ */
+ WRITE_ONCE(use_cid_lock, 1);
+ /*
+ * Set use_cid_lock before allocation. Only care about program order
+ * because this is only required for forward progress.
+ */
+ barrier();
+ /*
+ * Retry until it succeeds. It is guaranteed to eventually succeed once
+ * all newcoming allocations observe the use_cid_lock flag set.
+ */
+ do {
+ cid = __mm_cid_try_get(mm);
+ cpu_relax();
+ } while (cid < 0);
+ /*
+ * Allocate before clearing use_cid_lock. Only care about
+ * program order because this is for forward progress.
+ */
+ barrier();
+ WRITE_ONCE(use_cid_lock, 0);
+unlock:
+ raw_spin_unlock(&cid_lock);
+end:
+ mm_cid_snapshot_time(rq, mm);
+ return cid;
+}
+
+static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm)
+{
+ struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
+ struct cpumask *cpumask;
+ int cid;
+
+ lockdep_assert_rq_held(rq);
+ cpumask = mm_cidmask(mm);
+ cid = __this_cpu_read(pcpu_cid->cid);
+ if (mm_cid_is_valid(cid)) {
+ mm_cid_snapshot_time(rq, mm);
+ return cid;
+ }
+ if (mm_cid_is_lazy_put(cid)) {
+ if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
+ __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+ }
+ cid = __mm_cid_get(rq, mm);
+ __this_cpu_write(pcpu_cid->cid, cid);
+ return cid;
+}
+
+static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next)
+{
+ /*
+ * Provide a memory barrier between rq->curr store and load of
+ * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition.
+ *
+ * Should be adapted if context_switch() is modified.
+ */
+ if (!next->mm) { // to kernel
+ /*
+ * user -> kernel transition does not guarantee a barrier, but
+ * we can use the fact that it performs an atomic operation in
+ * mmgrab().
+ */
+ if (prev->mm) // from user
+ smp_mb__after_mmgrab();
+ /*
+ * kernel -> kernel transition does not change rq->curr->mm
+ * state. It stays NULL.
+ */
+ } else { // to user
+ /*
+ * kernel -> user transition does not provide a barrier
+ * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu].
+ * Provide it here.
+ */
+ if (!prev->mm) { // from kernel
+ smp_mb();
+ } else { // from user
+ /*
+ * user -> user transition relies on an implicit
+ * memory barrier in switch_mm() when
+ * current->mm changes. If the architecture
+ * switch_mm() does not have an implicit memory
+ * barrier, it is emitted here. If current->mm
+ * is unchanged, no barrier is needed.
+ */
+ smp_mb__after_switch_mm();
+ }
+ }
+ if (prev->mm_cid_active) {
+ mm_cid_snapshot_time(rq, prev->mm);
+ mm_cid_put_lazy(prev);
+ prev->mm_cid = -1;
+ }
+ if (next->mm_cid_active)
+ next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm);
+}
+
+static inline void sched_mm_cid_migrate_from(struct task_struct *t)
{
t->migrate_from_cpu = task_cpu(t);
}

-static
-int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq,
- struct task_struct *t,
- struct mm_cid *src_pcpu_cid)
+static inline int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq,
+ struct task_struct *t,
+ struct mm_cid *src_pcpu_cid)
{
struct mm_struct *mm = t->mm;
struct task_struct *src_task;
@@ -11703,11 +11940,10 @@ int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq,
return src_cid;
}

-static
-int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
- struct task_struct *t,
- struct mm_cid *src_pcpu_cid,
- int src_cid)
+static inline int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
+ struct task_struct *t,
+ struct mm_cid *src_pcpu_cid,
+ int src_cid)
{
struct task_struct *src_task;
struct mm_struct *mm = t->mm;
@@ -11767,7 +12003,7 @@ int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
* Interrupts are disabled, which keeps the window of cid ownership without the
* source rq lock held small.
*/
-void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
+static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
{
struct mm_cid *src_pcpu_cid, *dst_pcpu_cid;
struct mm_struct *mm = t->mm;
@@ -11820,8 +12056,9 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
}

-static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid,
- int cpu)
+static inline void sched_mm_cid_remote_clear(struct mm_struct *mm,
+ struct mm_cid *pcpu_cid,
+ int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct task_struct *t;
@@ -11876,7 +12113,7 @@ static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_
}
}

-static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
+static inline void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct mm_cid *pcpu_cid;
@@ -11908,8 +12145,8 @@ static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
}

-static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
- int weight)
+static inline void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
+ int weight)
{
struct mm_cid *pcpu_cid;
int cid;
@@ -11965,7 +12202,7 @@ static void task_mm_cid_work(struct callback_head *work)
sched_mm_cid_remote_clear_weight(mm, cpu, weight);
}

-void init_sched_mm_cid(struct task_struct *t)
+static inline void init_sched_mm_cid(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
int mm_users = 0;
@@ -11979,7 +12216,7 @@ void init_sched_mm_cid(struct task_struct *t)
init_task_work(&t->cid_work, task_mm_cid_work);
}

-void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
+static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
{
struct callback_head *work = &curr->cid_work;
unsigned long now = jiffies;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d2895d264196..1b8e3e23ef40 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3237,247 +3237,6 @@ extern int sched_dynamic_mode(const char *str);
extern void sched_dynamic_update(int mode);
#endif

-#ifdef CONFIG_SCHED_MM_CID
-
-#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */
-#define MM_CID_SCAN_DELAY 100 /* 100ms */
-
-extern raw_spinlock_t cid_lock;
-extern int use_cid_lock;
-
-extern void sched_mm_cid_migrate_from(struct task_struct *t);
-extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t);
-extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr);
-extern void init_sched_mm_cid(struct task_struct *t);
-
-static inline void __mm_cid_put(struct mm_struct *mm, int cid)
-{
- if (cid < 0)
- return;
- cpumask_clear_cpu(cid, mm_cidmask(mm));
-}
-
-/*
- * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to
- * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to
- * be held to transition to other states.
- *
- * State transitions synchronized with cmpxchg or try_cmpxchg need to be
- * consistent across cpus, which prevents use of this_cpu_cmpxchg.
- */
-static inline void mm_cid_put_lazy(struct task_struct *t)
-{
- struct mm_struct *mm = t->mm;
- struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
- int cid;
-
- lockdep_assert_irqs_disabled();
- cid = __this_cpu_read(pcpu_cid->cid);
- if (!mm_cid_is_lazy_put(cid) ||
- !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
- return;
- __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
-}
-
-static inline int mm_cid_pcpu_unset(struct mm_struct *mm)
-{
- struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
- int cid, res;
-
- lockdep_assert_irqs_disabled();
- cid = __this_cpu_read(pcpu_cid->cid);
- for (;;) {
- if (mm_cid_is_unset(cid))
- return MM_CID_UNSET;
- /*
- * Attempt transition from valid or lazy-put to unset.
- */
- res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET);
- if (res == cid)
- break;
- cid = res;
- }
- return cid;
-}
-
-static inline void mm_cid_put(struct mm_struct *mm)
-{
- int cid;
-
- lockdep_assert_irqs_disabled();
- cid = mm_cid_pcpu_unset(mm);
- if (cid == MM_CID_UNSET)
- return;
- __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
-}
-
-static inline int __mm_cid_try_get(struct mm_struct *mm)
-{
- struct cpumask *cpumask;
- int cid;
-
- cpumask = mm_cidmask(mm);
- /*
- * Retry finding first zero bit if the mask is temporarily
- * filled. This only happens during concurrent remote-clear
- * which owns a cid without holding a rq lock.
- */
- for (;;) {
- cid = cpumask_first_zero(cpumask);
- if (cid < nr_cpu_ids)
- break;
- cpu_relax();
- }
- if (cpumask_test_and_set_cpu(cid, cpumask))
- return -1;
- return cid;
-}
-
-/*
- * Save a snapshot of the current runqueue time of this cpu
- * with the per-cpu cid value, allowing to estimate how recently it was used.
- */
-static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm)
-{
- struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq));
-
- lockdep_assert_rq_held(rq);
- WRITE_ONCE(pcpu_cid->time, rq->clock);
-}
-
-static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm)
-{
- int cid;
-
- /*
- * All allocations (even those using the cid_lock) are lock-free. If
- * use_cid_lock is set, hold the cid_lock to perform cid allocation to
- * guarantee forward progress.
- */
- if (!READ_ONCE(use_cid_lock)) {
- cid = __mm_cid_try_get(mm);
- if (cid >= 0)
- goto end;
- raw_spin_lock(&cid_lock);
- } else {
- raw_spin_lock(&cid_lock);
- cid = __mm_cid_try_get(mm);
- if (cid >= 0)
- goto unlock;
- }
-
- /*
- * cid concurrently allocated. Retry while forcing following
- * allocations to use the cid_lock to ensure forward progress.
- */
- WRITE_ONCE(use_cid_lock, 1);
- /*
- * Set use_cid_lock before allocation. Only care about program order
- * because this is only required for forward progress.
- */
- barrier();
- /*
- * Retry until it succeeds. It is guaranteed to eventually succeed once
- * all newcoming allocations observe the use_cid_lock flag set.
- */
- do {
- cid = __mm_cid_try_get(mm);
- cpu_relax();
- } while (cid < 0);
- /*
- * Allocate before clearing use_cid_lock. Only care about
- * program order because this is for forward progress.
- */
- barrier();
- WRITE_ONCE(use_cid_lock, 0);
-unlock:
- raw_spin_unlock(&cid_lock);
-end:
- mm_cid_snapshot_time(rq, mm);
- return cid;
-}
-
-static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm)
-{
- struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
- struct cpumask *cpumask;
- int cid;
-
- lockdep_assert_rq_held(rq);
- cpumask = mm_cidmask(mm);
- cid = __this_cpu_read(pcpu_cid->cid);
- if (mm_cid_is_valid(cid)) {
- mm_cid_snapshot_time(rq, mm);
- return cid;
- }
- if (mm_cid_is_lazy_put(cid)) {
- if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
- __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
- }
- cid = __mm_cid_get(rq, mm);
- __this_cpu_write(pcpu_cid->cid, cid);
- return cid;
-}
-
-static inline void switch_mm_cid(struct rq *rq,
- struct task_struct *prev,
- struct task_struct *next)
-{
- /*
- * Provide a memory barrier between rq->curr store and load of
- * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition.
- *
- * Should be adapted if context_switch() is modified.
- */
- if (!next->mm) { // to kernel
- /*
- * user -> kernel transition does not guarantee a barrier, but
- * we can use the fact that it performs an atomic operation in
- * mmgrab().
- */
- if (prev->mm) // from user
- smp_mb__after_mmgrab();
- /*
- * kernel -> kernel transition does not change rq->curr->mm
- * state. It stays NULL.
- */
- } else { // to user
- /*
- * kernel -> user transition does not provide a barrier
- * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu].
- * Provide it here.
- */
- if (!prev->mm) { // from kernel
- smp_mb();
- } else { // from user
- /*
- * user -> user transition relies on an implicit
- * memory barrier in switch_mm() when
- * current->mm changes. If the architecture
- * switch_mm() does not have an implicit memory
- * barrier, it is emitted here. If current->mm
- * is unchanged, no barrier is needed.
- */
- smp_mb__after_switch_mm();
- }
- }
- if (prev->mm_cid_active) {
- mm_cid_snapshot_time(rq, prev->mm);
- mm_cid_put_lazy(prev);
- prev->mm_cid = -1;
- }
- if (next->mm_cid_active)
- next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm);
-}
-
-#else
-static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
-static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
-static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
-static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
-static inline void init_sched_mm_cid(struct task_struct *t) { }
-#endif
-
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);

--
2.39.2


2024-04-15 15:25:09

by Mathieu Desnoyers

Subject: [PATCH 1/2] sched: Add missing memory barrier in switch_mm_cid

Many architectures' switch_mm() (e.g. arm64) do not have an smp_mb()
which the core scheduler code has depended upon since commit:

commit 223baf9d17f25 ("sched: Fix performance regression introduced by mm_cid")

If switch_mm() doesn't call smp_mb(), sched_mm_cid_remote_clear() can
unset an actively used cid when it fails to observe an active task after
it sets lazy_put.

There *is* a memory barrier between storing to rq->curr and _return to
userspace_ (as required by membarrier), but the rseq mm_cid has stricter
requirements: the barrier needs to be issued between the store to rq->curr
and switch_mm_cid(), which happens earlier than:

- spin_unlock(),
- switch_to().

So it's fine when the architecture switch_mm() happens to have that
barrier already, but less so when the architecture only provides the
full barrier in switch_to() or spin_unlock().

It is a bug in the rseq switch_mm_cid() implementation. All architectures
that don't have memory barriers in switch_mm(), but rather have the full
barrier either in finish_lock_switch() or switch_to(), have them too late
for the needs of switch_mm_cid().
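
A condensed sketch of the racing accesses (an illustration of the scenario
above, not the actual code; the detailed explanation is in the existing
comment block above the mm_cid helpers in kernel/sched/core.c):

  scheduler, on the CPU switching to a task that uses @mm:

        rq->curr = next                    /* store in __schedule()            */
        /* <- the barrier this patch guarantees must be present here           */
        switch_mm_cid() -> mm_cid_get()    /* loads/updates mm->pcpu_cid[cpu]  */

  sched_mm_cid_remote_clear(), on another CPU:

        set the lazy-put flag on pcpu_cid->cid with cmpxchg()
        /* full barrier implied by the successful cmpxchg() */
        read rq->curr                      /* observes no task using @mm       */
        -> transitions the cid to unset and frees it, although the incoming
           task on the other CPU is still using it

This is the classic store-buffering pattern: with a full barrier between
the store and the load on both sides, at least one side observes the
other's store, so either the remote clear sees the new rq->curr and backs
off, or mm_cid_get() sees the lazy-put flag and does not keep that cid.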

Introduce a new smp_mb__after_switch_mm(), defined as smp_mb() in the
generic barrier.h header, and use it in switch_mm_cid() for scheduler
transitions where switch_mm() is expected to provide a memory barrier.

Architectures can override smp_mb__after_switch_mm() if their
switch_mm() implementation provides an implicit memory barrier.
Override it with a no-op on x86, which implicitly provides this memory
barrier by writing to CR3.

Link: https://lore.kernel.org/lkml/[email protected]/
Reported-by: levi.yun <[email protected]>
Signed-off-by: Mathieu Desnoyers <[email protected]>
Reviewed-by: Catalin Marinas <[email protected]> # for arm64
Acked-by: Dave Hansen <[email protected]> # for x86
Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid")
Cc: <[email protected]> # 6.4.x
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Vincent Guittot <[email protected]>
Cc: Juri Lelli <[email protected]>
Cc: Dietmar Eggemann <[email protected]>
Cc: Ben Segall <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Daniel Bristot de Oliveira <[email protected]>
Cc: Valentin Schneider <[email protected]>
Cc: levi.yun <[email protected]>
Cc: Mathieu Desnoyers <[email protected]>
Cc: Catalin Marinas <[email protected]>
Cc: Mark Rutland <[email protected]>
Cc: Will Deacon <[email protected]>
Cc: Aaron Lu <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Arnd Bergmann <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
---
arch/x86/include/asm/barrier.h | 3 +++
include/asm-generic/barrier.h | 8 ++++++++
kernel/sched/sched.h | 20 ++++++++++++++------
3 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index fe1e7e3cc844..63bdc6b85219 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -79,6 +79,9 @@ do { \
#define __smp_mb__before_atomic() do { } while (0)
#define __smp_mb__after_atomic() do { } while (0)

+/* Writing to CR3 provides a full memory barrier in switch_mm(). */
+#define smp_mb__after_switch_mm() do { } while (0)
+
#include <asm-generic/barrier.h>

#endif /* _ASM_X86_BARRIER_H */
diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index 0c0695763bea..dc32b96140c1 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -294,5 +294,13 @@ do { \
#define io_stop_wc() do { } while (0)
#endif

+/*
+ * Architectures that guarantee an implicit smp_mb() in switch_mm()
+ * can override smp_mb__after_switch_mm.
+ */
+#ifndef smp_mb__after_switch_mm
+#define smp_mb__after_switch_mm() smp_mb()
+#endif
+
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_GENERIC_BARRIER_H */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d2242679239e..d2895d264196 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -79,6 +79,8 @@
# include <asm/paravirt_api_clock.h>
#endif

+#include <asm/barrier.h>
+
#include "cpupri.h"
#include "cpudeadline.h"

@@ -3445,13 +3447,19 @@ static inline void switch_mm_cid(struct rq *rq,
* between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu].
* Provide it here.
*/
- if (!prev->mm) // from kernel
+ if (!prev->mm) { // from kernel
smp_mb();
- /*
- * user -> user transition guarantees a memory barrier through
- * switch_mm() when current->mm changes. If current->mm is
- * unchanged, no barrier is needed.
- */
+ } else { // from user
+ /*
+ * user -> user transition relies on an implicit
+ * memory barrier in switch_mm() when
+ * current->mm changes. If the architecture
+ * switch_mm() does not have an implicit memory
+ * barrier, it is emitted here. If current->mm
+ * is unchanged, no barrier is needed.
+ */
+ smp_mb__after_switch_mm();
+ }
}
if (prev->mm_cid_active) {
mm_cid_snapshot_time(rq, prev->mm);
--
2.39.2


2024-04-16 12:22:51

by Ingo Molnar

Subject: Re: [PATCH 0/2] sched: rseq mm_cid updates


* Mathieu Desnoyers <[email protected]> wrote:

> Hi,
>
> This patch series consists of 2 patches. It is based on v6.9-rc4.
>
> - A fix aiming for v6.9-rc (to be backported to stable kernels):
> "sched: Add missing memory barrier in switch_mm_cid"
>
> - An improvement patch aiming for v6.10:
> "sched: Move mm_cid code from sched.h to core.c"
>
> Thanks,
>
> Mathieu
>
> Cc: Steven Rostedt <[email protected]>
> Cc: Vincent Guittot <[email protected]>
> Cc: Juri Lelli <[email protected]>
> Cc: Dietmar Eggemann <[email protected]>
> Cc: Ben Segall <[email protected]>
> Cc: Mel Gorman <[email protected]>
> Cc: Daniel Bristot de Oliveira <[email protected]>
> Cc: Valentin Schneider <[email protected]>
> Cc: levi.yun <[email protected]>
> Cc: Mathieu Desnoyers <[email protected]>
> Cc: Catalin Marinas <[email protected]>
> Cc: Mark Rutland <[email protected]>
> Cc: Will Deacon <[email protected]>
> Cc: Aaron Lu <[email protected]>
> Cc: Thomas Gleixner <[email protected]>
> Cc: Borislav Petkov <[email protected]>
> Cc: Dave Hansen <[email protected]>
> Cc: "H. Peter Anvin" <[email protected]>
> Cc: Arnd Bergmann <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
>
> Mathieu Desnoyers (2):
> sched: Add missing memory barrier in switch_mm_cid
> sched: Move mm_cid code from sched.h to core.c

I've applied the first patch to tip:sched/urgent, and will queue up the
code movement patch in the scheduler tree for v6.10, thanks Mathieu!

Ingo

Subject: [tip: sched/urgent] sched: Add missing memory barrier in switch_mm_cid

The following commit has been merged into the sched/urgent branch of tip:

Commit-ID: fe90f3967bdb3e13f133e5f44025e15f943a99c5
Gitweb: https://git.kernel.org/tip/fe90f3967bdb3e13f133e5f44025e15f943a99c5
Author: Mathieu Desnoyers <[email protected]>
AuthorDate: Mon, 15 Apr 2024 11:21:13 -04:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Tue, 16 Apr 2024 13:59:45 +02:00

sched: Add missing memory barrier in switch_mm_cid

Many architectures' switch_mm() (e.g. arm64) do not have an smp_mb()
which the core scheduler code has depended upon since commit:

commit 223baf9d17f25 ("sched: Fix performance regression introduced by mm_cid")

If switch_mm() doesn't call smp_mb(), sched_mm_cid_remote_clear() can
unset an actively used cid when it fails to observe an active task after
it sets lazy_put.

There *is* a memory barrier between storing to rq->curr and _return to
userspace_ (as required by membarrier), but the rseq mm_cid has stricter
requirements: the barrier needs to be issued between the store to rq->curr
and switch_mm_cid(), which happens earlier than:

- spin_unlock(),
- switch_to().

So it's fine when the architecture switch_mm() happens to have that
barrier already, but less so when the architecture only provides the
full barrier in switch_to() or spin_unlock().

It is a bug in the rseq switch_mm_cid() implementation. All architectures
that don't have memory barriers in switch_mm(), but rather have the full
barrier either in finish_lock_switch() or switch_to(), have them too late
for the needs of switch_mm_cid().

Introduce a new smp_mb__after_switch_mm(), defined as smp_mb() in the
generic barrier.h header, and use it in switch_mm_cid() for scheduler
transitions where switch_mm() is expected to provide a memory barrier.

Architectures can override smp_mb__after_switch_mm() if their
switch_mm() implementation provides an implicit memory barrier.
Override it with a no-op on x86, which implicitly provides this memory
barrier by writing to CR3.

Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid")
Reported-by: levi.yun <[email protected]>
Signed-off-by: Mathieu Desnoyers <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Reviewed-by: Catalin Marinas <[email protected]> # for arm64
Acked-by: Dave Hansen <[email protected]> # for x86
Cc: <[email protected]> # 6.4.x
Cc: Linus Torvalds <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
arch/x86/include/asm/barrier.h | 3 +++
include/asm-generic/barrier.h | 8 ++++++++
kernel/sched/sched.h | 20 ++++++++++++++------
3 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index fe1e7e3..63bdc6b 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -79,6 +79,9 @@ do { \
#define __smp_mb__before_atomic() do { } while (0)
#define __smp_mb__after_atomic() do { } while (0)

+/* Writing to CR3 provides a full memory barrier in switch_mm(). */
+#define smp_mb__after_switch_mm() do { } while (0)
+
#include <asm-generic/barrier.h>

#endif /* _ASM_X86_BARRIER_H */
diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index 0c06957..d4f581c 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -294,5 +294,13 @@ do { \
#define io_stop_wc() do { } while (0)
#endif

+/*
+ * Architectures that guarantee an implicit smp_mb() in switch_mm()
+ * can override smp_mb__after_switch_mm.
+ */
+#ifndef smp_mb__after_switch_mm
+# define smp_mb__after_switch_mm() smp_mb()
+#endif
+
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_GENERIC_BARRIER_H */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d224267..ae50f21 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -79,6 +79,8 @@
# include <asm/paravirt_api_clock.h>
#endif

+#include <asm/barrier.h>
+
#include "cpupri.h"
#include "cpudeadline.h"

@@ -3445,13 +3447,19 @@ static inline void switch_mm_cid(struct rq *rq,
* between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu].
* Provide it here.
*/
- if (!prev->mm) // from kernel
+ if (!prev->mm) { // from kernel
smp_mb();
- /*
- * user -> user transition guarantees a memory barrier through
- * switch_mm() when current->mm changes. If current->mm is
- * unchanged, no barrier is needed.
- */
+ } else { // from user
+ /*
+ * user->user transition relies on an implicit
+ * memory barrier in switch_mm() when
+ * current->mm changes. If the architecture
+ * switch_mm() does not have an implicit memory
+ * barrier, it is emitted here. If current->mm
+ * is unchanged, no barrier is needed.
+ */
+ smp_mb__after_switch_mm();
+ }
}
if (prev->mm_cid_active) {
mm_cid_snapshot_time(rq, prev->mm);

2024-04-16 14:36:58

by Mathieu Desnoyers

Subject: Re: [PATCH 0/2] sched: rseq mm_cid updates

On 2024-04-16 08:21, Ingo Molnar wrote:
[...]
>>
>> Mathieu Desnoyers (2):
>> sched: Add missing memory barrier in switch_mm_cid
>> sched: Move mm_cid code from sched.h to core.c
>
> I've applied the first patch to tip:sched/urgent, and will queue up the
> code movement patch in the scheduler tree for v6.10, thanks Mathieu!

Thanks for reviewing and queuing those patches!

Mathieu


--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com