The active load balance has a known issue[1][2] that there is a race
window between waking up the migration thread on the busiest CPU and the
moment it begins to preempt the current running CFS task. This race window may cause
unexpected behavior that the current running CFS task may be preempted
by a RT task first, and then the RT task will be preempted by this
woken migration thread. Per our tracing, the latency caused by this
preemption can be greater than 1ms, which is not a small latency for the
RT tasks.
We'd better set a proper priority to this balance work so that it can
preempt CFS task only. A new per-cpu thread cfs_migration is introduced
for this purpose. The cfs_migration thread has a priority FIFO-1,
which means it can preempt any cfs tasks but can't preempt other FIFO
tasks.
Besides the active load balance work, the numa balance work also applies
to CFS tasks only. So we'd better assign cfs_migration to numa balance
work as well.
[1]. https://lore.kernel.org/lkml/CAKfTPtBygNcVewbb0GQOP5xxO96am3YeTZNP5dK9BxKHJJAL-g@mail.gmail.com/
[2]. https://lore.kernel.org/lkml/[email protected]/
Yafang Shao (4):
stop_machine: Move cpu_stop_done into stop_machine.h
sched/fair: Introduce cfs_migration
sched/fair: Do active load balance in cfs_migration
sched/core: Do numa balance in cfs_migration
include/linux/stop_machine.h | 12 +++
kernel/sched/core.c | 2 +-
kernel/sched/fair.c | 143 ++++++++++++++++++++++++++++++++++-
kernel/sched/sched.h | 2 +
kernel/stop_machine.c | 14 +---
5 files changed, 158 insertions(+), 15 deletions(-)
--
2.17.1
Move struct cpu_stop_done into the stop_machine.h, then it can be reused
by the functions outside of stop_machine.c.
Signed-off-by: Yafang Shao <[email protected]>
Cc: Valentin Schneider <[email protected]>
Cc: Vincent Guittot <[email protected]>
Cc: Dietmar Eggemann <[email protected]>
---
include/linux/stop_machine.h | 12 ++++++++++++
kernel/stop_machine.c | 14 ++------------
2 files changed, 14 insertions(+), 12 deletions(-)
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 46fb3ebdd16e..b1234cb6ab70 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -20,6 +20,15 @@
typedef int (*cpu_stop_fn_t)(void *arg);
#ifdef CONFIG_SMP
+/*
+ * Structure to determine completion condition and record errors. May
+ * be shared by works on different cpus.
+ */
+struct cpu_stop_done {
+ atomic_t nr_todo; /* nr left to execute */
+ int ret; /* collected return value */
+ struct completion completion; /* fired if nr_todo reaches 0 */
+};
struct cpu_stop_work {
struct list_head list; /* cpu_stopper->works */
@@ -29,6 +38,9 @@ struct cpu_stop_work {
struct cpu_stop_done *done;
};
+void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo);
+void cpu_stop_signal_done(struct cpu_stop_done *done);
+
int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg);
int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg);
bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index cbc30271ea4d..cc94eb7d2c5c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -23,16 +23,6 @@
#include <linux/nmi.h>
#include <linux/sched/wake_q.h>
-/*
- * Structure to determine completion condition and record errors. May
- * be shared by works on different cpus.
- */
-struct cpu_stop_done {
- atomic_t nr_todo; /* nr left to execute */
- int ret; /* collected return value */
- struct completion completion; /* fired if nr_todo reaches 0 */
-};
-
/* the actual stopper, one per every possible cpu, enabled on online cpus */
struct cpu_stopper {
struct task_struct *thread;
@@ -67,7 +57,7 @@ void print_stop_info(const char *log_lvl, struct task_struct *task)
static DEFINE_MUTEX(stop_cpus_mutex);
static bool stop_cpus_in_progress;
-static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
+void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
{
memset(done, 0, sizeof(*done));
atomic_set(&done->nr_todo, nr_todo);
@@ -75,7 +65,7 @@ static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
}
/* signal completion unless @done is NULL */
-static void cpu_stop_signal_done(struct cpu_stop_done *done)
+void cpu_stop_signal_done(struct cpu_stop_done *done)
{
if (atomic_dec_and_test(&done->nr_todo))
complete(&done->completion);
--
2.17.1
A new per-cpu kthread named "cfs_migration/N" is introduced to do
cfs specific balance works. It is a FIFO task with priority FIFO-1,
which means it can preempt any cfs tasks but can't preempt other FIFO
tasks. The kthreads as follows,
PID COMMAND
13 [cfs_migration/0]
20 [cfs_migration/1]
25 [cfs_migration/2]
32 [cfs_migration/3]
38 [cfs_migration/4]
...
$ cat /proc/13/sched
...
policy : 1
prio : 98
...
$ cat /proc/13/status
...
Cpus_allowed: 0001
Cpus_allowed_list: 0
...
All the works need to be done will be queued into a singly linked list,
in which the first queued will be first serviced.
Signed-off-by: Yafang Shao <[email protected]>
Cc: Valentin Schneider <[email protected]>
Cc: Vincent Guittot <[email protected]>
Cc: Dietmar Eggemann <[email protected]>
---
kernel/sched/fair.c | 93 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 93 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 87db481e8a56..56b3fa91828b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,6 +20,8 @@
* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*/
+#include <linux/smpboot.h>
+#include <linux/stop_machine.h>
#include "sched.h"
/*
@@ -11915,3 +11917,94 @@ int sched_trace_rq_nr_running(struct rq *rq)
return rq ? rq->nr_running : -1;
}
EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
+
+#ifdef CONFIG_SMP
+struct cfs_migrater {
+ struct task_struct *thread;
+ struct list_head works;
+ raw_spinlock_t lock;
+};
+
+DEFINE_PER_CPU(struct cfs_migrater, cfs_migrater);
+
+static int cfs_migration_should_run(unsigned int cpu)
+{
+ struct cfs_migrater *migrater = &per_cpu(cfs_migrater, cpu);
+ unsigned long flags;
+ int run;
+
+ raw_spin_lock_irqsave(&migrater->lock, flags);
+ run = !list_empty(&migrater->works);
+ raw_spin_unlock_irqrestore(&migrater->lock, flags);
+
+ return run;
+}
+
+static void cfs_migration_setup(unsigned int cpu)
+{
+ /* cfs_migration should have a higher priority than normal tasks,
+ * but a lower priority than other FIFO tasks.
+ */
+ sched_set_fifo_low(current);
+}
+
+static void cfs_migrater_thread(unsigned int cpu)
+{
+ struct cfs_migrater *migrater = &per_cpu(cfs_migrater, cpu);
+ struct cpu_stop_work *work;
+
+repeat:
+ work = NULL;
+ raw_spin_lock_irq(&migrater->lock);
+ if (!list_empty(&migrater->works)) {
+ work = list_first_entry(&migrater->works,
+ struct cpu_stop_work, list);
+ list_del_init(&work->list);
+ }
+ raw_spin_unlock_irq(&migrater->lock);
+
+ if (work) {
+ struct cpu_stop_done *done = work->done;
+ cpu_stop_fn_t fn = work->fn;
+ void *arg = work->arg;
+ int ret;
+
+ preempt_count_inc();
+ ret = fn(arg);
+ if (done) {
+ if (ret)
+ done->ret = ret;
+ cpu_stop_signal_done(done);
+ }
+ preempt_count_dec();
+ goto repeat;
+ }
+}
+
+static struct smp_hotplug_thread cfs_migration_threads = {
+ .store = &cfs_migrater.thread,
+ .setup = cfs_migration_setup,
+ .thread_fn = cfs_migrater_thread,
+ .thread_comm = "cfs_migration/%u",
+ .thread_should_run = cfs_migration_should_run,
+};
+
+static int __init cfs_migration_init(void)
+{
+ struct cfs_migrater *cm = &per_cpu(cfs_migrater, raw_smp_processor_id());
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct cfs_migrater *migrater = &per_cpu(cfs_migrater, cpu);
+
+ raw_spin_lock_init(&migrater->lock);
+ INIT_LIST_HEAD(&migrater->works);
+ }
+
+ WARN_ON_ONCE(smpboot_register_percpu_thread(&cfs_migration_threads));
+ kthread_unpark(cm->thread);
+
+ return 0;
+}
+early_initcall(cfs_migration_init)
+#endif
--
2.17.1
Similar to active load balance, the numa balance work is also applied to
cfs tasks only and it shouldn't preempt other FIFO tasks. We'd better assign
cfs_migration to the numa balance as well.
Signed-off-by: Yafang Shao <[email protected]>
Cc: Valentin Schneider <[email protected]>
Cc: Vincent Guittot <[email protected]>
Cc: Dietmar Eggemann <[email protected]>
---
kernel/sched/core.c | 2 +-
kernel/sched/fair.c | 13 +++++++++++++
kernel/sched/sched.h | 2 ++
3 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9cb81ef8acc8..4a37b06715f4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8724,7 +8724,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
/* TODO: This is not properly updating schedstats */
trace_sched_move_numa(p, curr_cpu, target_cpu);
- return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
+ return wakeup_cfs_migrater(curr_cpu, migration_cpu_stop, &arg);
}
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 932f63baeb82..b7a155e05c98 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11960,6 +11960,19 @@ static void wakeup_cfs_migrater_nowait(unsigned int cpu, cpu_stop_fn_t fn, void
cfs_migration_queue_work(cpu, work_buf);
}
+bool wakeup_cfs_migrater(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
+{
+ struct cpu_stop_done done;
+ struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ };
+
+ cpu_stop_init_done(&done, 1);
+ cfs_migration_queue_work(cpu, &work);
+ cond_resched();
+ wait_for_completion(&done.completion);
+
+ return done.ret;
+}
+
static int cfs_migration_should_run(unsigned int cpu)
{
struct cfs_migrater *migrater = &per_cpu(cfs_migrater, cpu);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a00fc7057d97..7b242c18a6d8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3055,6 +3055,8 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
return true;
}
+
+bool wakeup_cfs_migrater(unsigned int cpu, cpu_stop_fn_t fn, void *arg);
#endif
extern void swake_up_all_locked(struct swait_queue_head *q);
--
2.17.1
The active load balance has a known issue[1][2] that there is a race
window between waking up the migration thread on the busiest CPU and the
moment it begins to preempt the current running CFS task. This race window may cause
unexpected behavior that the current running CFS task may be preempted
by a RT task first, and then the RT task will be preempted by this
woken migration thread. Per our tracing, the latency caused by this
preemption can be greater than 1ms, which is not a small latency for the
RT tasks.
Now as we have introduced cfs_migration, we can assign it to the active
load balance work. Then after we set latency sensitive RT tasks to a higher
priority, it can't be preempted again.
[1]. https://lore.kernel.org/lkml/CAKfTPtBygNcVewbb0GQOP5xxO96am3YeTZNP5dK9BxKHJJAL-g@mail.gmail.com/
[2]. https://lore.kernel.org/lkml/[email protected]/
Signed-off-by: Yafang Shao <[email protected]>
Cc: Valentin Schneider <[email protected]>
Cc: Vincent Guittot <[email protected]>
Cc: Dietmar Eggemann <[email protected]>
---
kernel/sched/fair.c | 37 +++++++++++++++++++++++++++++++++++--
1 file changed, 35 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 56b3fa91828b..932f63baeb82 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9807,6 +9807,8 @@ static int need_active_balance(struct lb_env *env)
}
static int active_load_balance_cpu_stop(void *data);
+static void wakeup_cfs_migrater_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
+ struct cpu_stop_work *work_buf);
static int should_we_balance(struct lb_env *env)
{
@@ -10049,7 +10051,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
raw_spin_rq_unlock_irqrestore(busiest, flags);
if (active_balance) {
- stop_one_cpu_nowait(cpu_of(busiest),
+ wakeup_cfs_migrater_nowait(cpu_of(busiest),
active_load_balance_cpu_stop, busiest,
&busiest->active_balance_work);
}
@@ -10147,7 +10149,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
}
/*
- * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
+ * active_load_balance_cpu_stop is run by the CFS migrater. It pushes
* running tasks off the busiest CPU onto idle CPUs. It requires at
* least 1 task to be running on each physical CPU where possible, and
* avoids physical / logical imbalances.
@@ -11927,6 +11929,37 @@ struct cfs_migrater {
DEFINE_PER_CPU(struct cfs_migrater, cfs_migrater);
+static void __cfs_migration_queue_work(struct cfs_migrater *migrater,
+ struct cpu_stop_work *work,
+ struct wake_q_head *wakeq)
+{
+ list_add_tail(&work->list, &migrater->works);
+ wake_q_add(wakeq, migrater->thread);
+}
+
+/* queue @work to @migrater. if offline, @work is completed immediately */
+static void cfs_migration_queue_work(unsigned int cpu, struct cpu_stop_work *work)
+{
+ struct cfs_migrater *migrater = &per_cpu(cfs_migrater, cpu);
+ DEFINE_WAKE_Q(wakeq);
+ unsigned long flags;
+
+ preempt_disable();
+ raw_spin_lock_irqsave(&migrater->lock, flags);
+ __cfs_migration_queue_work(migrater, work, &wakeq);
+ raw_spin_unlock_irqrestore(&migrater->lock, flags);
+
+ wake_up_q(&wakeq);
+ preempt_enable();
+}
+
+static void wakeup_cfs_migrater_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
+ struct cpu_stop_work *work_buf)
+{
+ *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, };
+ cfs_migration_queue_work(cpu, work_buf);
+}
+
static int cfs_migration_should_run(unsigned int cpu)
{
struct cfs_migrater *migrater = &per_cpu(cfs_migrater, cpu);
--
2.17.1
On 04/11/21 14:57, Yafang Shao wrote:
> A new per-cpu kthread named "cfs_migration/N" is introduced to do
> cfs specific balance works. It is a FIFO task with priority FIFO-1,
> which means it can preempt any cfs tasks but can't preempt other FIFO
> tasks. The kthreads as follows,
>
> PID COMMAND
> 13 [cfs_migration/0]
> 20 [cfs_migration/1]
> 25 [cfs_migration/2]
> 32 [cfs_migration/3]
> 38 [cfs_migration/4]
> ...
>
> $ cat /proc/13/sched
> ...
> policy : 1
> prio : 98
> ...
>
> $ cat /proc/13/status
> ...
> Cpus_allowed: 0001
> Cpus_allowed_list: 0
> ...
>
> All the works need to be done will be queued into a singly linked list,
> in which the first queued will be first serviced.
>
> Signed-off-by: Yafang Shao <[email protected]>
> Cc: Valentin Schneider <[email protected]>
> Cc: Vincent Guittot <[email protected]>
> Cc: Dietmar Eggemann <[email protected]>
> ---
> kernel/sched/fair.c | 93 +++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 93 insertions(+)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 87db481e8a56..56b3fa91828b 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -20,6 +20,8 @@
> * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
> * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
> */
> +#include <linux/smpboot.h>
> +#include <linux/stop_machine.h>
> #include "sched.h"
>
> /*
> @@ -11915,3 +11917,94 @@ int sched_trace_rq_nr_running(struct rq *rq)
> return rq ? rq->nr_running : -1;
> }
> EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
> +
> +#ifdef CONFIG_SMP
> +struct cfs_migrater {
> + struct task_struct *thread;
> + struct list_head works;
> + raw_spinlock_t lock;
Hm so the handler of that work queue will now be a SCHED_FIFO task (which
can block) rather than a CPU stopper (which can't), but AFAICT the
callsites that would enqueue an item can't block, so having this as a
raw_spinlock_t should still make sense.
> +};
> +
> +DEFINE_PER_CPU(struct cfs_migrater, cfs_migrater);
> +
> +static int cfs_migration_should_run(unsigned int cpu)
> +{
> + struct cfs_migrater *migrater = &per_cpu(cfs_migrater, cpu);
> + unsigned long flags;
> + int run;
> +
> + raw_spin_lock_irqsave(&migrater->lock, flags);
> + run = !list_empty(&migrater->works);
> + raw_spin_unlock_irqrestore(&migrater->lock, flags);
> +
> + return run;
> +}
> +
> +static void cfs_migration_setup(unsigned int cpu)
> +{
> + /* cfs_migration should have a higher priority than normal tasks,
> + * but a lower priority than other FIFO tasks.
> + */
> + sched_set_fifo_low(current);
> +}
> +
> +static void cfs_migrater_thread(unsigned int cpu)
> +{
> + struct cfs_migrater *migrater = &per_cpu(cfs_migrater, cpu);
> + struct cpu_stop_work *work;
> +
> +repeat:
> + work = NULL;
> + raw_spin_lock_irq(&migrater->lock);
> + if (!list_empty(&migrater->works)) {
> + work = list_first_entry(&migrater->works,
> + struct cpu_stop_work, list);
> + list_del_init(&work->list);
> + }
> + raw_spin_unlock_irq(&migrater->lock);
> +
> + if (work) {
> + struct cpu_stop_done *done = work->done;
> + cpu_stop_fn_t fn = work->fn;
> + void *arg = work->arg;
> + int ret;
> +
> + preempt_count_inc();
> + ret = fn(arg);
> + if (done) {
> + if (ret)
> + done->ret = ret;
> + cpu_stop_signal_done(done);
> + }
> + preempt_count_dec();
> + goto repeat;
> + }
> +}
You're pretty much copying the CPU stopper setup, but that seems overkill
for the functionality we're after: migrate a CFS task from one CPU to
another. This shouldn't need to be able to run any arbitrary callback
function.
Unfortunately you are tackling both CFS active balancing and NUMA balancing
at the same time, and right now they're plumbed a bit differently which
probably drove you to use something a bit more polymorphic. Ideally we
should be making them use the same paths, but IMO it would be acceptable as
a first step to just cater to CFS active balancing - folks that really care
about their RT tasks can disable CONFIG_NUMA_BALANCING, but there is
nothing to disable CFS active balancing.
Now, I'm thinking the bare information we need is:
- a task to migrate
- a CPU to move it to
And then you can do something like...
trigger_migration(task_struct *p, unsigned int dst_cpu)
{
work = { p, dst_cpu };
get_task_struct(p);
/* queue work + wake migrater + wait for completion */
}
cfs_migrater_thread()
{
/* ... */
p = work->p;
if (task_rq(p) != this_rq())
goto out;
/* migrate task to work->dst_cpu */
out:
complete(<some completion struct>);
put_task_struct(p);
}
We should also probably add something to prevent the migration from
happening after it is no longer relevant. Say if we have something like:
<queue work to migrate p from CPU0 to CPU1>
<FIFO-2 task runs for 42 seconds on CPU0>
<cfs_migration/0 now gets to run>
p could have moved elsewhere while cfs_migration/0 was waiting to run. I'm
thinking source CPU could be a useful information, but that doesn't tell you
if the task moved around in the meantime...
WDYT?
On 04/11/21 14:57, Yafang Shao wrote:
> Similar to active load balance, the numa balance work is also applied to
> cfs tasks only and it should't preempt other FIFO tasks. We'd better assign
> cfs_migration to the numa balance as well.
>
> Signed-off-by: Yafang Shao <[email protected]>
> Cc: Valentin Schneider <[email protected]>
> Cc: Vincent Guittot <[email protected]>
> Cc: Dietmar Eggemann <[email protected]>
> ---
> kernel/sched/core.c | 2 +-
> kernel/sched/fair.c | 13 +++++++++++++
> kernel/sched/sched.h | 2 ++
> 3 files changed, 16 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 9cb81ef8acc8..4a37b06715f4 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -8724,7 +8724,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
> /* TODO: This is not properly updating schedstats */
>
> trace_sched_move_numa(p, curr_cpu, target_cpu);
> - return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
> + return wakeup_cfs_migrater(curr_cpu, migration_cpu_stop, &arg);
So that one I find really icky - migration_cpu_stop() really is meant to be
run from a CPU stopper (cf. cpu_stop suffix). IMO this is the opportunity
to make NUMA balancing reuse the logic for CFS active balance here, but per
previous email I'd say it could be done as a second step.
> }
>
> /*
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 932f63baeb82..b7a155e05c98 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -11960,6 +11960,19 @@ static void wakeup_cfs_migrater_nowait(unsigned int cpu, cpu_stop_fn_t fn, void
> cfs_migration_queue_work(cpu, work_buf);
> }
>
> +bool wakeup_cfs_migrater(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
> +{
> + struct cpu_stop_done done;
> + struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ };
> +
> + cpu_stop_init_done(&done, 1);
> + cfs_migration_queue_work(cpu, &work);
> + cond_resched();
> + wait_for_completion(&done.completion);
> +
> + return done.ret;
> +}
> +
> static int cfs_migration_should_run(unsigned int cpu)
> {
> struct cfs_migrater *migrater = &per_cpu(cfs_migrater, cpu);
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index a00fc7057d97..7b242c18a6d8 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3055,6 +3055,8 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
>
> return true;
> }
> +
> +bool wakeup_cfs_migrater(unsigned int cpu, cpu_stop_fn_t fn, void *arg);
> #endif
>
> extern void swake_up_all_locked(struct swait_queue_head *q);
> --
> 2.17.1
Hi,
On 04/11/21 14:57, Yafang Shao wrote:
> The active load balance has a known issue[1][2] that there is a race
> window between waking up the migration thread on the busiest CPU and it
> begins to preempt the current running CFS task. This race window may cause
> unexpected behavior that the current running CFS task may be preempted
> by a RT task first, and then the RT task will be preempted by this
> waked migration thread. Per our tracing, the latency caused by this
> preemption can be greater than 1ms, which is not a small latency for the
> RT tasks.
>
> We'd better set a proper priority to this balance work so that it can
> preempt CFS task only. A new per-cpu thread cfs_migration is introduced
> for this purpose. The cfs_migration thread has a priority FIFO-1,
> which means it can preempt any cfs tasks but can't preempt other FIFO
> tasks.
>
> Besides the active load balance work, the numa balance work also applies
> to CFS tasks only. So we'd better assign cfs_migraion to numa balance
> work as well.
>
> [1]. https://lore.kernel.org/lkml/CAKfTPtBygNcVewbb0GQOP5xxO96am3YeTZNP5dK9BxKHJJAL-g@mail.gmail.com/
> [2]. https://lore.kernel.org/lkml/[email protected]/
>
So overall I quite like the idea, but am not entirely convinced by the
implementation. See comments in rest of the thread - in any case, thanks
for taking a jab at that!
> Yafang Shao (4):
> stop_machine: Move cpu_stop_done into stop_machine.h
> sched/fair: Introduce cfs_migration
> sched/fair: Do active load balance in cfs_migration
> sched/core: Do numa balance in cfs_migration
>
> include/linux/stop_machine.h | 12 +++
> kernel/sched/core.c | 2 +-
> kernel/sched/fair.c | 143 ++++++++++++++++++++++++++++++++++-
> kernel/sched/sched.h | 2 +
> kernel/stop_machine.c | 14 +---
> 5 files changed, 158 insertions(+), 15 deletions(-)
>
> --
> 2.17.1
On 04/11/21 14:57, Yafang Shao wrote:
> Move struct cpu_stop_done into the stop_machine.h, then it can be resued
> by the functions outside of stop_maichine.c.
>
The nr_todo & ret faff is only required for multi_stop scenarios, which
isn't your use-case. AFAICT you don't need to export this, you just need a
struct completion somewhere (and maybe a ret, but I'm not even sure about
that).
On Sat, Nov 6, 2021 at 1:02 AM Valentin Schneider
<[email protected]> wrote:
>
> On 04/11/21 14:57, Yafang Shao wrote:
> > Similar to active load balance, the numa balance work is also applied to
> > cfs tasks only and it should't preempt other FIFO tasks. We'd better assign
> > cfs_migration to the numa balance as well.
> >
> > Signed-off-by: Yafang Shao <[email protected]>
> > Cc: Valentin Schneider <[email protected]>
> > Cc: Vincent Guittot <[email protected]>
> > Cc: Dietmar Eggemann <[email protected]>
> > ---
> > kernel/sched/core.c | 2 +-
> > kernel/sched/fair.c | 13 +++++++++++++
> > kernel/sched/sched.h | 2 ++
> > 3 files changed, 16 insertions(+), 1 deletion(-)
> >
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 9cb81ef8acc8..4a37b06715f4 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -8724,7 +8724,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
> > /* TODO: This is not properly updating schedstats */
> >
> > trace_sched_move_numa(p, curr_cpu, target_cpu);
> > - return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
> > + return wakeup_cfs_migrater(curr_cpu, migration_cpu_stop, &arg);
>
> So that one I find really icky - migration_cpu_stop() really is meant to be
> run from a CPU stopper (cf. cpu_stop suffix). IMO this is the opportunity
> to make NUMA balancing reuse the logic for CFS active balance here, but per
> previous email I'd say it could be done as a second step.
>
Sure.
> > }
> >
> > /*
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 932f63baeb82..b7a155e05c98 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -11960,6 +11960,19 @@ static void wakeup_cfs_migrater_nowait(unsigned int cpu, cpu_stop_fn_t fn, void
> > cfs_migration_queue_work(cpu, work_buf);
> > }
> >
> > +bool wakeup_cfs_migrater(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
> > +{
> > + struct cpu_stop_done done;
> > + struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ };
> > +
> > + cpu_stop_init_done(&done, 1);
> > + cfs_migration_queue_work(cpu, &work);
> > + cond_resched();
> > + wait_for_completion(&done.completion);
> > +
> > + return done.ret;
> > +}
> > +
> > static int cfs_migration_should_run(unsigned int cpu)
> > {
> > struct cfs_migrater *migrater = &per_cpu(cfs_migrater, cpu);
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index a00fc7057d97..7b242c18a6d8 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -3055,6 +3055,8 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
> >
> > return true;
> > }
> > +
> > +bool wakeup_cfs_migrater(unsigned int cpu, cpu_stop_fn_t fn, void *arg);
> > #endif
> >
> > extern void swake_up_all_locked(struct swait_queue_head *q);
> > --
> > 2.17.1
--
Thanks
Yafang
On Sat, Nov 6, 2021 at 1:01 AM Valentin Schneider
<[email protected]> wrote:
>
> On 04/11/21 14:57, Yafang Shao wrote:
> > A new per-cpu kthread named "cfs_migration/N" is introduced to do
> > cfs specific balance works. It is a FIFO task with priority FIFO-1,
> > which means it can preempt any cfs tasks but can't preempt other FIFO
> > tasks. The kthreads as follows,
> >
> > PID COMMAND
> > 13 [cfs_migration/0]
> > 20 [cfs_migration/1]
> > 25 [cfs_migration/2]
> > 32 [cfs_migration/3]
> > 38 [cfs_migration/4]
> > ...
> >
> > $ cat /proc/13/sched
> > ...
> > policy : 1
> > prio : 98
> > ...
> >
> > $ cat /proc/13/status
> > ...
> > Cpus_allowed: 0001
> > Cpus_allowed_list: 0
> > ...
> >
> > All the works need to be done will be queued into a singly linked list,
> > in which the first queued will be first serviced.
> >
> > Signed-off-by: Yafang Shao <[email protected]>
> > Cc: Valentin Schneider <[email protected]>
> > Cc: Vincent Guittot <[email protected]>
> > Cc: Dietmar Eggemann <[email protected]>
> > ---
> > kernel/sched/fair.c | 93 +++++++++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 93 insertions(+)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 87db481e8a56..56b3fa91828b 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -20,6 +20,8 @@
> > * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
> > * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
> > */
> > +#include <linux/smpboot.h>
> > +#include <linux/stop_machine.h>
> > #include "sched.h"
> >
> > /*
> > @@ -11915,3 +11917,94 @@ int sched_trace_rq_nr_running(struct rq *rq)
> > return rq ? rq->nr_running : -1;
> > }
> > EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
> > +
> > +#ifdef CONFIG_SMP
> > +struct cfs_migrater {
> > + struct task_struct *thread;
> > + struct list_head works;
> > + raw_spinlock_t lock;
>
> Hm so the handler of that work queue will now be a SCHED_FIFO task (which
> can block) rather than a CPU stopper (which can't), but AFAICT the
> callsites that would enqueue an item can't block, so having this as a
> raw_spinlock_t should still make sense.
>
> > +};
> > +
> > +DEFINE_PER_CPU(struct cfs_migrater, cfs_migrater);
> > +
> > +static int cfs_migration_should_run(unsigned int cpu)
> > +{
> > + struct cfs_migrater *migrater = &per_cpu(cfs_migrater, cpu);
> > + unsigned long flags;
> > + int run;
> > +
> > + raw_spin_lock_irqsave(&migrater->lock, flags);
> > + run = !list_empty(&migrater->works);
> > + raw_spin_unlock_irqrestore(&migrater->lock, flags);
> > +
> > + return run;
> > +}
> > +
> > +static void cfs_migration_setup(unsigned int cpu)
> > +{
> > + /* cfs_migration should have a higher priority than normal tasks,
> > + * but a lower priority than other FIFO tasks.
> > + */
> > + sched_set_fifo_low(current);
> > +}
> > +
> > +static void cfs_migrater_thread(unsigned int cpu)
> > +{
> > + struct cfs_migrater *migrater = &per_cpu(cfs_migrater, cpu);
> > + struct cpu_stop_work *work;
> > +
> > +repeat:
> > + work = NULL;
> > + raw_spin_lock_irq(&migrater->lock);
> > + if (!list_empty(&migrater->works)) {
> > + work = list_first_entry(&migrater->works,
> > + struct cpu_stop_work, list);
> > + list_del_init(&work->list);
> > + }
> > + raw_spin_unlock_irq(&migrater->lock);
> > +
> > + if (work) {
> > + struct cpu_stop_done *done = work->done;
> > + cpu_stop_fn_t fn = work->fn;
> > + void *arg = work->arg;
> > + int ret;
> > +
> > + preempt_count_inc();
> > + ret = fn(arg);
> > + if (done) {
> > + if (ret)
> > + done->ret = ret;
> > + cpu_stop_signal_done(done);
> > + }
> > + preempt_count_dec();
> > + goto repeat;
> > + }
> > +}
>
> You're pretty much copying the CPU stopper setup, but that seems overkill
> for the functionality we're after: migrate a CFS task from one CPU to
> another. This shouldn't need to be able to run any arbitrary callback
> function.
>
> Unfortunately you are tackling both CFS active balancing and NUMA balancing
> at the same time, and right now they're plumbed a bit differently which
> probably drove you to use something a bit for polymorphic. Ideally we
> should be making them use the same paths, but IMO it would be acceptable as
> a first step to just cater to CFS active balancing - folks that really care
> about their RT tasks can disable CONFIG_NUMA_BALANCING, but there is
> nothing to disable CFS active balancing.
>
Right. The code will be more simplified if we only care about CFS
active balancing in this patchset.
We have disabled the numa balancing through
/proc/sys/kernel/numa_balancing, so it is not a critical issue now.
>
> Now, I'm thinking the bare information we need is:
>
> - a task to migrate
> - a CPU to move it to
>
> And then you can do something like...
>
> trigger_migration(task_struct *p, unsigned int dst_cpu)
> {
> work = { p, dst_cpu };
> get_task_struct(p);
> /* queue work + wake migrater + wait for completion */
> }
>
> cfs_migrater_thread()
> {
> /* ... */
> p = work->p;
>
> if (task_rq(p) != this_rq())
> goto out;
>
> /* migrate task to work->dst_cpu */
> out:
> complete(<some completion struct>);
> put_task_struct(p);
> }
>
Agreed.
>
> We should also probably add something to prevent the migration from
> happening after it is no longer relevant. Say if we have something like:
>
> <queue work to migrate p from CPU0 to CPU1>
> <FIFO-2 task runs for 42 seconds on CPU0>
> <cfs_migration/0 now gets to run>
>
> p could have moved elsewhere while cfs_migration/0. I'm thinking source CPU
> could be a useful information, but that doesn't tell you if the task moved
> around in the meantime...
>
> WDYT?
Agreed.
It seems we'd better take the patch[1] I sent several weeks back.
[1]. https://lore.kernel.org/lkml/[email protected]/
--
Thanks
Yafang
On Sat, Nov 6, 2021 at 1:01 AM Valentin Schneider
<[email protected]> wrote:
>
> On 04/11/21 14:57, Yafang Shao wrote:
> > Move struct cpu_stop_done into the stop_machine.h, then it can be reused
> > by the functions outside of stop_machine.c.
> >
>
> The nr_todo & ret faff is only required for multi_stop scenarios, which
> isn't your use-case. AFAICT you don't need to export this, you just need a
> struct completion somewhere (and maybe a ret, but I'm not even sure about
> that).
Right. nr_todo is for stopping two or more CPUs. We really don't
need it if we only support stopping one CPU.
I will change it.
--
Thanks
Yafang
Greeting,
FYI, we noticed the following commit (built with gcc-9):
commit: 64228563c20f024e40e4bdaa51eeec99002c489f ("[RFC PATCH 2/4] sched/fair: Introduce cfs_migration")
url: https://github.com/0day-ci/linux/commits/Yafang-Shao/sched-Introduce-cfs_migration/20211104-225939
base: https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git 8ea9183db4ad8afbcb7089a77c23eaf965b0cacd
patch link: https://lore.kernel.org/lkml/[email protected]
in testcase: leaking-addresses
version: leaking-addresses-x86_64-cf2a85e-1_20211103
with following parameters:
ucode: 0x28
on test machine: 8 threads 1 sockets Intel(R) Core(TM) i7-4770 CPU @ 3.40GHz with 16G memory
caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):
+--------------------------------------------------+------------+------------+
| | 812ea7cfb1 | 64228563c2 |
+--------------------------------------------------+------------+------------+
| boot_successes | 10 | 0 |
| boot_failures | 0 | 11 |
| WARNING:at_kernel/kthread.c:#__kthread_bind_mask | 0 | 11 |
| RIP:__kthread_bind_mask | 0 | 11 |
+--------------------------------------------------+------------+------------+
If you fix the issue, kindly add following tag
Reported-by: kernel test robot <[email protected]>
[ 3.072411][ T1] WARNING: CPU: 0 PID: 1 at kernel/kthread.c:465 __kthread_bind_mask (kernel/kthread.c:465 (discriminator 1))
[ 3.073411][ T1] Modules linked in:
[ 3.074411][ T1] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.15.0-rc4-00071-g64228563c20f #1
[ 3.075411][ T1] Hardware name: Dell Inc. OptiPlex 9020/0DNKMN, BIOS A05 12/05/2013
[ 3.076411][ T1] RIP: 0010:__kthread_bind_mask (kernel/kthread.c:465 (discriminator 1))
[ 3.077411][ T1] Code: 89 e2 b7 00 66 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 41 55 41 54 55 48 89 f5 89 d6 53 48 89 fb e8 28 bb 01 00 48 85 c0 75 09 <0f> 0b 5b 5d 41 5c 41 5d c3 4c 8d ab a4 0c 00 00 4c 89 ef e8 4b ed
All code
========
0: 89 e2 mov %esp,%edx
2: b7 00 mov $0x0,%bh
4: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1)
b: 00 00
d: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
12: 41 55 push %r13
14: 41 54 push %r12
16: 55 push %rbp
17: 48 89 f5 mov %rsi,%rbp
1a: 89 d6 mov %edx,%esi
1c: 53 push %rbx
1d: 48 89 fb mov %rdi,%rbx
20: e8 28 bb 01 00 callq 0x1bb4d
25: 48 85 c0 test %rax,%rax
28: 75 09 jne 0x33
2a:* 0f 0b ud2 <-- trapping instruction
2c: 5b pop %rbx
2d: 5d pop %rbp
2e: 41 5c pop %r12
30: 41 5d pop %r13
32: c3 retq
33: 4c 8d ab a4 0c 00 00 lea 0xca4(%rbx),%r13
3a: 4c 89 ef mov %r13,%rdi
3d: e8 .byte 0xe8
3e: 4b ed rex.WXB in (%dx),%eax
Code starting with the faulting instruction
===========================================
0: 0f 0b ud2
2: 5b pop %rbx
3: 5d pop %rbp
4: 41 5c pop %r12
6: 41 5d pop %r13
8: c3 retq
9: 4c 8d ab a4 0c 00 00 lea 0xca4(%rbx),%r13
10: 4c 89 ef mov %r13,%rdi
13: e8 .byte 0xe8
14: 4b ed rex.WXB in (%dx),%eax
[ 3.078411][ T1] RSP: 0000:ffffc9000002fdd8 EFLAGS: 00010246
[ 3.079411][ T1] RAX: 0000000000000000 RBX: ffff888100aed000 RCX: 0000000000000000
[ 3.080411][ T1] RDX: 0000000000000001 RSI: 0000000000000246 RDI: 00000000ffffffff
[ 3.081411][ T1] RBP: ffffffff822101e0 R08: ffffffff8284cec8 R09: ffffffff8284cec8
[ 3.082411][ T1] R10: 0000000000000000 R11: ffff8883fda278f0 R12: 0000000000000008
[ 3.083411][ T1] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[ 3.084411][ T1] FS: 0000000000000000(0000) GS:ffff8883fda00000(0000) knlGS:0000000000000000
[ 3.085411][ T1] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 3.086411][ T1] CR2: ffff88841ea01000 CR3: 000000041da10001 CR4: 00000000001706f0
[ 3.087411][ T1] Call Trace:
[ 3.088413][ T1] kthread_unpark (kernel/kthread.c:478 kernel/kthread.c:570)
[ 3.089411][ T1] cfs_migration_init (kernel/sched/fair.c:12029 (discriminator 3))
[ 3.090412][ T1] ? setup_sched_thermal_decay_shift (kernel/sched/fair.c:12014)
[ 3.091411][ T1] do_one_initcall (init/main.c:1303)
[ 3.092412][ T1] kernel_init_freeable (init/main.c:1419 init/main.c:1603)
[ 3.093412][ T1] ? rest_init (init/main.c:1497)
[ 3.094411][ T1] kernel_init (init/main.c:1507)
[ 3.095411][ T1] ret_from_fork (arch/x86/entry/entry_64.S:301)
[ 3.096413][ T1] ---[ end trace 221e592f8f64f075 ]---
[ 3.097411][ T1] rcu: Hierarchical SRCU implementation.
[ 3.098989][ T5] NMI watchdog: Enabled. Permanently consumes one hw-PMU counter.
[ 3.099473][ T1] smp: Bringing up secondary CPUs ...
[ 3.100482][ T1] x86: Booting SMP configuration:
[ 3.101412][ T1] .... node #0, CPUs: #1
[ 0.534696][ T0] masked ExtINT on CPU#1
[ 3.110490][ T1] #2
[ 0.534696][ T0] masked ExtINT on CPU#2
[ 3.117413][ T1] #3
[ 0.534696][ T0] masked ExtINT on CPU#3
[ 3.124184][ T1] #4
[ 0.534696][ T0] masked ExtINT on CPU#4
[ 3.130933][ T1] MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.
[ 3.131497][ T1] #5
[ 0.534696][ T0] masked ExtINT on CPU#5
[ 3.138269][ T1] #6
[ 0.534696][ T0] masked ExtINT on CPU#6
[ 3.145166][ T1] #7
[ 0.534696][ T0] masked ExtINT on CPU#7
[ 3.152059][ T1] smp: Brought up 1 node, 8 CPUs
[ 3.152412][ T1] smpboot: Max logical packages: 1
[ 3.153411][ T1] smpboot: Total of 8 processors activated (54273.93 BogoMIPS)
[ 3.177958][ T60] node 0 deferred pages initialised in 22ms
[ 3.184695][ T1] devtmpfs: initialized
[ 3.185443][ T1] x86/mm: Memory block size: 128MB
[ 3.187216][ T1] ACPI: PM: Registering ACPI NVS region [mem 0xd1695000-0xd169bfff] (28672 bytes)
[ 3.187412][ T1] ACPI: PM: Registering ACPI NVS region [mem 0xda71c000-0xda7fffff] (933888 bytes)
[ 3.188450][ T1] clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 1911260446275000 ns
[ 3.189413][ T1] futex hash table entries: 2048 (order: 5, 131072 bytes, linear)
[ 3.190454][ T1] pinctrl core: initialized pinctrl subsystem
[ 3.191527][ T1] NET: Registered PF_NETLINK/PF_ROUTE protocol family
[ 3.192555][ T1] audit: initializing netlink subsys (disabled)
[ 3.193423][ T71] audit: type=2000 audit(1636161192.177:1): state=initialized audit_enabled=0 res=1
[ 3.193471][ T1] thermal_sys: Registered thermal governor 'fair_share'
[ 3.194412][ T1] thermal_sys: Registered thermal governor 'bang_bang'
[ 3.195411][ T1] thermal_sys: Registered thermal governor 'step_wise'
[ 3.196411][ T1] thermal_sys: Registered thermal governor 'user_space'
[ 3.197420][ T1] cpuidle: using governor menu
[ 3.199511][ T1] ACPI: bus type PCI registered
[ 3.200412][ T1] acpiphp: ACPI Hot Plug PCI Controller Driver version: 0.5
[ 3.201454][ T1] PCI: MMCONFIG for domain 0000 [bus 00-3f] at [mem 0xf8000000-0xfbffffff] (base 0xf8000000)
[ 3.202412][ T1] PCI: MMCONFIG at [mem 0xf8000000-0xfbffffff] reserved in E820
[ 3.203416][ T1] pmd_set_huge: Cannot satisfy [mem 0xf8000000-0xf8200000] with a huge-page mapping due to MTRR override.
[ 3.204444][ T1] PCI: Using configuration type 1 for base access
[ 3.205522][ T1] core: PMU erratum BJ122, BV98, HSD29 worked around, HT is on
[ 3.207568][ T1] Kprobes globally optimized
[ 3.208425][ T1] HugeTLB registered 1.00 GiB page size, pre-allocated 0 pages
[ 3.209412][ T1] HugeTLB registered 2.00 MiB page size, pre-allocated 0 pages
[ 3.210428][ T1] cryptd: max_cpu_qlen set to 1000
[ 3.211440][ T1] ACPI: Added _OSI(Module Device)
[ 3.212415][ T1] ACPI: Added _OSI(Processor Device)
[ 3.213411][ T1] ACPI: Added _OSI(3.0 _SCP Extensions)
[ 3.214411][ T1] ACPI: Added _OSI(Processor Aggregator Device)
[ 3.215411][ T1] ACPI: Added _OSI(Linux-Dell-Video)
[ 3.216411][ T1] ACPI: Added _OSI(Linux-Lenovo-NV-HDMI-Audio)
[ 3.217411][ T1] ACPI: Added _OSI(Linux-HPI-Hybrid-Graphics)
[ 3.224227][ T1] ACPI: 6 ACPI AML tables successfully acquired and loaded
[ 3.225178][ T1] ACPI: [Firmware Bug]: BIOS _OSI(Linux) query ignored
[ 3.225803][ T1] ACPI: Dynamic OEM Table Load:
[ 3.226414][ T1] ACPI: SSDT 0xFFFF88841EAB5000 0003D3 (v01 PmRef Cpu0Cst 00003001 INTL 20120711)
[ 3.227913][ T1] ACPI: Dynamic OEM Table Load:
[ 3.228414][ T1] ACPI: SSDT 0xFFFF88841EAC4000 0005AA (v01 PmRef ApIst 00003000 INTL 20120711)
[ 3.229916][ T1] ACPI: Dynamic OEM Table Load:
[ 3.230413][ T1] ACPI: SSDT 0xFFFF888100EF8800 000119 (v01 PmRef ApCst 00003000 INTL 20120711)
[ 3.232725][ T1] ACPI: Interpreter enabled
[ 3.233432][ T1] ACPI: PM: (supports S0 S3 S4 S5)
[ 3.234411][ T1] ACPI: Using IOAPIC for interrupt routing
[ 3.235430][ T1] PCI: Using host bridge windows from ACPI; if necessary, use "pci=nocrs" and report a bug
[ 3.236560][ T1] ACPI: Enabled 9 GPEs in block 00 to 3F
[ 3.243366][ T1] ACPI: PCI Root Bridge [PCI0] (domain 0000 [bus 00-3e])
[ 3.243414][ T1] acpi PNP0A08:00: _OSC: OS supports [ExtendedConfig ASPM ClockPM Segments MSI HPX-Type3]
[ 3.244781][ T1] acpi PNP0A08:00: _OSC: OS now controls [PCIeHotplug SHPCHotplug PME AER PCIeCapability LTR]
[ 3.245679][ T1] PCI host bridge to bus 0000:00
[ 3.246412][ T1] pci_bus 0000:00: root bus resource [io 0x0000-0x0cf7 window]
[ 3.247411][ T1] pci_bus 0000:00: root bus resource [io 0x0d00-0xffff window]
[ 3.248411][ T1] pci_bus 0000:00: root bus resource [mem 0x000a0000-0x000bffff window]
[ 3.249411][ T1] pci_bus 0000:00: root bus resource [mem 0x000d4000-0x000d7fff window]
[ 3.250411][ T1] pci_bus 0000:00: root bus resource [mem 0x000d8000-0x000dbfff window]
[ 3.251411][ T1] pci_bus 0000:00: root bus resource [mem 0x000dc000-0x000dffff window]
[ 3.252411][ T1] pci_bus 0000:00: root bus resource [mem 0x000e0000-0x000e3fff window]
[ 3.253411][ T1] pci_bus 0000:00: root bus resource [mem 0x000e4000-0x000e7fff window]
[ 3.254411][ T1] pci_bus 0000:00: root bus resource [mem 0xdf200000-0xfeafffff window]
[ 3.255411][ T1] pci_bus 0000:00: root bus resource [bus 00-3e]
[ 3.256436][ T1] pci 0000:00:00.0: [8086:0c00] type 00 class 0x060000
[ 3.257508][ T1] pci 0000:00:02.0: [8086:0412] type 00 class 0x030000
[ 3.258416][ T1] pci 0000:00:02.0: reg 0x10: [mem 0xf7800000-0xf7bfffff 64bit]
[ 3.259414][ T1] pci 0000:00:02.0: reg 0x18: [mem 0xe0000000-0xefffffff 64bit pref]
To reproduce:
git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
sudo bin/lkp install job.yaml # job file is attached in this email
bin/lkp split-job --compatible job.yaml # generate the yaml file for lkp run
sudo bin/lkp run generated-yaml-file
# if come across any failure that blocks the test,
# please remove ~/.lkp and /lkp dir to run from a clean state.
---
0DAY/LKP+ Test Infrastructure Open Source Technology Center
https://lists.01.org/hyperkitty/list/[email protected] Intel Corporation
Thanks,
Oliver Sang
On Sun, Nov 7, 2021 at 5:38 PM kernel test robot <[email protected]> wrote:
>
>
>
> Greeting,
>
> FYI, we noticed the following commit (built with gcc-9):
>
> commit: 64228563c20f024e40e4bdaa51eeec99002c489f ("[RFC PATCH 2/4] sched/fair: Introduce cfs_migration")
> url: https://github.com/0day-ci/linux/commits/Yafang-Shao/sched-Introduce-cfs_migration/20211104-225939
> base: https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git 8ea9183db4ad8afbcb7089a77c23eaf965b0cacd
> patch link: https://lore.kernel.org/lkml/[email protected]
>
> in testcase: leaking-addresses
> version: leaking-addresses-x86_64-cf2a85e-1_20211103
> with following parameters:
>
> ucode: 0x28
>
>
>
> on test machine: 8 threads 1 sockets Intel(R) Core(TM) i7-4770 CPU @ 3.40GHz with 16G memory
>
> caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):
>
>
> +--------------------------------------------------+------------+------------+
> | | 812ea7cfb1 | 64228563c2 |
> +--------------------------------------------------+------------+------------+
> | boot_successes | 10 | 0 |
> | boot_failures | 0 | 11 |
> | WARNING:at_kernel/kthread.c:#__kthread_bind_mask | 0 | 11 |
> | RIP:__kthread_bind_mask | 0 | 11 |
> +--------------------------------------------------+------------+------------+
>
>
> If you fix the issue, kindly add following tag
> Reported-by: kernel test robot <[email protected]>
>
Thanks for the report!
We should make the change below:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4a906412570e..fa720e840c6d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12037,7 +12037,6 @@ static struct smp_hotplug_thread
cfs_migration_threads = {
static int __init cfs_migration_init(void)
{
- struct cfs_migrater *cm = &per_cpu(cfs_migrater,
raw_smp_processor_id());
unsigned int cpu;
for_each_possible_cpu(cpu) {
@@ -12048,7 +12047,6 @@ static int __init cfs_migration_init(void)
}
WARN_ON_ONCE(smpboot_register_percpu_thread(&cfs_migration_threads));
- kthread_unpark(cm->thread);
return 0;
}
>
> [ 3.072411][ T1] WARNING: CPU: 0 PID: 1 at kernel/kthread.c:465 __kthread_bind_mask (kernel/kthread.c:465 (discriminator 1))
> [ 3.073411][ T1] Modules linked in:
> [ 3.074411][ T1] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.15.0-rc4-00071-g64228563c20f #1
> [ 3.075411][ T1] Hardware name: Dell Inc. OptiPlex 9020/0DNKMN, BIOS A05 12/05/2013
> [ 3.076411][ T1] RIP: 0010:__kthread_bind_mask (kernel/kthread.c:465 (discriminator 1))
> [ 3.077411][ T1] Code: 89 e2 b7 00 66 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 41 55 41 54 55 48 89 f5 89 d6 53 48 89 fb e8 28 bb 01 00 48 85 c0 75 09 <0f> 0b 5b 5d 41 5c 41 5d c3 4c 8d ab a4 0c 00 00 4c 89 ef e8 4b ed
> All code
> ========
> 0: 89 e2 mov %esp,%edx
> 2: b7 00 mov $0x0,%bh
> 4: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1)
> b: 00 00
> d: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
> 12: 41 55 push %r13
> 14: 41 54 push %r12
> 16: 55 push %rbp
> 17: 48 89 f5 mov %rsi,%rbp
> 1a: 89 d6 mov %edx,%esi
> 1c: 53 push %rbx
> 1d: 48 89 fb mov %rdi,%rbx
> 20: e8 28 bb 01 00 callq 0x1bb4d
> 25: 48 85 c0 test %rax,%rax
> 28: 75 09 jne 0x33
> 2a:* 0f 0b ud2 <-- trapping instruction
> 2c: 5b pop %rbx
> 2d: 5d pop %rbp
> 2e: 41 5c pop %r12
> 30: 41 5d pop %r13
> 32: c3 retq
> 33: 4c 8d ab a4 0c 00 00 lea 0xca4(%rbx),%r13
> 3a: 4c 89 ef mov %r13,%rdi
> 3d: e8 .byte 0xe8
> 3e: 4b ed rex.WXB in (%dx),%eax
>
> Code starting with the faulting instruction
> ===========================================
> 0: 0f 0b ud2
> 2: 5b pop %rbx
> 3: 5d pop %rbp
> 4: 41 5c pop %r12
> 6: 41 5d pop %r13
> 8: c3 retq
> 9: 4c 8d ab a4 0c 00 00 lea 0xca4(%rbx),%r13
> 10: 4c 89 ef mov %r13,%rdi
> 13: e8 .byte 0xe8
> 14: 4b ed rex.WXB in (%dx),%eax
> [ 3.078411][ T1] RSP: 0000:ffffc9000002fdd8 EFLAGS: 00010246
> [ 3.079411][ T1] RAX: 0000000000000000 RBX: ffff888100aed000 RCX: 0000000000000000
> [ 3.080411][ T1] RDX: 0000000000000001 RSI: 0000000000000246 RDI: 00000000ffffffff
> [ 3.081411][ T1] RBP: ffffffff822101e0 R08: ffffffff8284cec8 R09: ffffffff8284cec8
> [ 3.082411][ T1] R10: 0000000000000000 R11: ffff8883fda278f0 R12: 0000000000000008
> [ 3.083411][ T1] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
> [ 3.084411][ T1] FS: 0000000000000000(0000) GS:ffff8883fda00000(0000) knlGS:0000000000000000
> [ 3.085411][ T1] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 3.086411][ T1] CR2: ffff88841ea01000 CR3: 000000041da10001 CR4: 00000000001706f0
> [ 3.087411][ T1] Call Trace:
> [ 3.088413][ T1] kthread_unpark (kernel/kthread.c:478 kernel/kthread.c:570)
> [ 3.089411][ T1] cfs_migration_init (kernel/sched/fair.c:12029 (discriminator 3))
> [ 3.090412][ T1] ? setup_sched_thermal_decay_shift (kernel/sched/fair.c:12014)
> [ 3.091411][ T1] do_one_initcall (init/main.c:1303)
> [ 3.092412][ T1] kernel_init_freeable (init/main.c:1419 init/main.c:1603)
> [ 3.093412][ T1] ? rest_init (init/main.c:1497)
> [ 3.094411][ T1] kernel_init (init/main.c:1507)
> [ 3.095411][ T1] ret_from_fork (arch/x86/entry/entry_64.S:301)
> [ 3.096413][ T1] ---[ end trace 221e592f8f64f075 ]---
> [ 3.097411][ T1] rcu: Hierarchical SRCU implementation.
> [ 3.098989][ T5] NMI watchdog: Enabled. Permanently consumes one hw-PMU counter.
> [ 3.099473][ T1] smp: Bringing up secondary CPUs ...
> [ 3.100482][ T1] x86: Booting SMP configuration:
> [ 3.101412][ T1] .... node #0, CPUs: #1
> [ 0.534696][ T0] masked ExtINT on CPU#1
> [ 3.110490][ T1] #2
> [ 0.534696][ T0] masked ExtINT on CPU#2
> [ 3.117413][ T1] #3
> [ 0.534696][ T0] masked ExtINT on CPU#3
> [ 3.124184][ T1] #4
> [ 0.534696][ T0] masked ExtINT on CPU#4
> [ 3.130933][ T1] MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.
> [ 3.131497][ T1] #5
> [ 0.534696][ T0] masked ExtINT on CPU#5
> [ 3.138269][ T1] #6
> [ 0.534696][ T0] masked ExtINT on CPU#6
> [ 3.145166][ T1] #7
> [ 0.534696][ T0] masked ExtINT on CPU#7
> [ 3.152059][ T1] smp: Brought up 1 node, 8 CPUs
> [ 3.152412][ T1] smpboot: Max logical packages: 1
> [ 3.153411][ T1] smpboot: Total of 8 processors activated (54273.93 BogoMIPS)
> [ 3.177958][ T60] node 0 deferred pages initialised in 22ms
> [ 3.184695][ T1] devtmpfs: initialized
> [ 3.185443][ T1] x86/mm: Memory block size: 128MB
> [ 3.187216][ T1] ACPI: PM: Registering ACPI NVS region [mem 0xd1695000-0xd169bfff] (28672 bytes)
> [ 3.187412][ T1] ACPI: PM: Registering ACPI NVS region [mem 0xda71c000-0xda7fffff] (933888 bytes)
> [ 3.188450][ T1] clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 1911260446275000 ns
> [ 3.189413][ T1] futex hash table entries: 2048 (order: 5, 131072 bytes, linear)
> [ 3.190454][ T1] pinctrl core: initialized pinctrl subsystem
> [ 3.191527][ T1] NET: Registered PF_NETLINK/PF_ROUTE protocol family
> [ 3.192555][ T1] audit: initializing netlink subsys (disabled)
> [ 3.193423][ T71] audit: type=2000 audit(1636161192.177:1): state=initialized audit_enabled=0 res=1
> [ 3.193471][ T1] thermal_sys: Registered thermal governor 'fair_share'
> [ 3.194412][ T1] thermal_sys: Registered thermal governor 'bang_bang'
> [ 3.195411][ T1] thermal_sys: Registered thermal governor 'step_wise'
> [ 3.196411][ T1] thermal_sys: Registered thermal governor 'user_space'
> [ 3.197420][ T1] cpuidle: using governor menu
> [ 3.199511][ T1] ACPI: bus type PCI registered
> [ 3.200412][ T1] acpiphp: ACPI Hot Plug PCI Controller Driver version: 0.5
> [ 3.201454][ T1] PCI: MMCONFIG for domain 0000 [bus 00-3f] at [mem 0xf8000000-0xfbffffff] (base 0xf8000000)
> [ 3.202412][ T1] PCI: MMCONFIG at [mem 0xf8000000-0xfbffffff] reserved in E820
> [ 3.203416][ T1] pmd_set_huge: Cannot satisfy [mem 0xf8000000-0xf8200000] with a huge-page mapping due to MTRR override.
> [ 3.204444][ T1] PCI: Using configuration type 1 for base access
> [ 3.205522][ T1] core: PMU erratum BJ122, BV98, HSD29 worked around, HT is on
> [ 3.207568][ T1] Kprobes globally optimized
> [ 3.208425][ T1] HugeTLB registered 1.00 GiB page size, pre-allocated 0 pages
> [ 3.209412][ T1] HugeTLB registered 2.00 MiB page size, pre-allocated 0 pages
> [ 3.210428][ T1] cryptd: max_cpu_qlen set to 1000
> [ 3.211440][ T1] ACPI: Added _OSI(Module Device)
> [ 3.212415][ T1] ACPI: Added _OSI(Processor Device)
> [ 3.213411][ T1] ACPI: Added _OSI(3.0 _SCP Extensions)
> [ 3.214411][ T1] ACPI: Added _OSI(Processor Aggregator Device)
> [ 3.215411][ T1] ACPI: Added _OSI(Linux-Dell-Video)
> [ 3.216411][ T1] ACPI: Added _OSI(Linux-Lenovo-NV-HDMI-Audio)
> [ 3.217411][ T1] ACPI: Added _OSI(Linux-HPI-Hybrid-Graphics)
> [ 3.224227][ T1] ACPI: 6 ACPI AML tables successfully acquired and loaded
> [ 3.225178][ T1] ACPI: [Firmware Bug]: BIOS _OSI(Linux) query ignored
> [ 3.225803][ T1] ACPI: Dynamic OEM Table Load:
> [ 3.226414][ T1] ACPI: SSDT 0xFFFF88841EAB5000 0003D3 (v01 PmRef Cpu0Cst 00003001 INTL 20120711)
> [ 3.227913][ T1] ACPI: Dynamic OEM Table Load:
> [ 3.228414][ T1] ACPI: SSDT 0xFFFF88841EAC4000 0005AA (v01 PmRef ApIst 00003000 INTL 20120711)
> [ 3.229916][ T1] ACPI: Dynamic OEM Table Load:
> [ 3.230413][ T1] ACPI: SSDT 0xFFFF888100EF8800 000119 (v01 PmRef ApCst 00003000 INTL 20120711)
> [ 3.232725][ T1] ACPI: Interpreter enabled
> [ 3.233432][ T1] ACPI: PM: (supports S0 S3 S4 S5)
> [ 3.234411][ T1] ACPI: Using IOAPIC for interrupt routing
> [ 3.235430][ T1] PCI: Using host bridge windows from ACPI; if necessary, use "pci=nocrs" and report a bug
> [ 3.236560][ T1] ACPI: Enabled 9 GPEs in block 00 to 3F
> [ 3.243366][ T1] ACPI: PCI Root Bridge [PCI0] (domain 0000 [bus 00-3e])
> [ 3.243414][ T1] acpi PNP0A08:00: _OSC: OS supports [ExtendedConfig ASPM ClockPM Segments MSI HPX-Type3]
> [ 3.244781][ T1] acpi PNP0A08:00: _OSC: OS now controls [PCIeHotplug SHPCHotplug PME AER PCIeCapability LTR]
> [ 3.245679][ T1] PCI host bridge to bus 0000:00
> [ 3.246412][ T1] pci_bus 0000:00: root bus resource [io 0x0000-0x0cf7 window]
> [ 3.247411][ T1] pci_bus 0000:00: root bus resource [io 0x0d00-0xffff window]
> [ 3.248411][ T1] pci_bus 0000:00: root bus resource [mem 0x000a0000-0x000bffff window]
> [ 3.249411][ T1] pci_bus 0000:00: root bus resource [mem 0x000d4000-0x000d7fff window]
> [ 3.250411][ T1] pci_bus 0000:00: root bus resource [mem 0x000d8000-0x000dbfff window]
> [ 3.251411][ T1] pci_bus 0000:00: root bus resource [mem 0x000dc000-0x000dffff window]
> [ 3.252411][ T1] pci_bus 0000:00: root bus resource [mem 0x000e0000-0x000e3fff window]
> [ 3.253411][ T1] pci_bus 0000:00: root bus resource [mem 0x000e4000-0x000e7fff window]
> [ 3.254411][ T1] pci_bus 0000:00: root bus resource [mem 0xdf200000-0xfeafffff window]
> [ 3.255411][ T1] pci_bus 0000:00: root bus resource [bus 00-3e]
> [ 3.256436][ T1] pci 0000:00:00.0: [8086:0c00] type 00 class 0x060000
> [ 3.257508][ T1] pci 0000:00:02.0: [8086:0412] type 00 class 0x030000
> [ 3.258416][ T1] pci 0000:00:02.0: reg 0x10: [mem 0xf7800000-0xf7bfffff 64bit]
> [ 3.259414][ T1] pci 0000:00:02.0: reg 0x18: [mem 0xe0000000-0xefffffff 64bit pref]
>
>
> To reproduce:
>
> git clone https://github.com/intel/lkp-tests.git
> cd lkp-tests
> sudo bin/lkp install job.yaml # job file is attached in this email
> bin/lkp split-job --compatible job.yaml # generate the yaml file for lkp run
> sudo bin/lkp run generated-yaml-file
>
> # if come across any failure that blocks the test,
> # please remove ~/.lkp and /lkp dir to run from a clean state.
>
>
>
> ---
> 0DAY/LKP+ Test Infrastructure Open Source Technology Center
> https://lists.01.org/hyperkitty/list/[email protected] Intel Corporation
>
> Thanks,
> Oliver Sang
>
--
Thanks
Yafang
On 06/11/21 15:40, Yafang Shao wrote:
> It seems we'd better take the patch[1] I sent several weeks back.
>
> [1]. https://lore.kernel.org/lkml/[email protected]/
>
As Peter stated in that thread, this only reduces the race window and
doesn't eliminate it. The FIFO-1 smpboot idea is still a good one IMO.
> --
> Thanks
> Yafang
On Tue, Nov 9, 2021 at 6:47 PM Valentin Schneider
<[email protected]> wrote:
>
> On 06/11/21 15:40, Yafang Shao wrote:
> > It seems we'd better take the patch[1] I sent several weeks back.
> >
> > [1]. https://lore.kernel.org/lkml/[email protected]/
> >
>
> As Peter stated in that thread, this only reduces the race window and
> doesn't eliminate it. The FIFO-1 smpboot idea is still a good one IMO.
>
Of course.
I will think about how to avoid the migration happening in the smpboot
thread_fn.
--
Thanks
Yafang