2023-08-11 11:57:10

by Valentin Schneider

Subject: [PATCH] sched/rt: Make rt_rq->pushable_tasks updates drive rto_mask

Sebastian noted that the rto_push_work IRQ work can be queued for a CPU
that has an empty pushable_tasks list, which means nothing useful will be
done in the IPI other than queue the work for the next CPU on the rto_mask.

rto_push_irq_work_func() only operates on tasks in the pushable_tasks list,
but the conditions for that irq_work to be queued (and for a CPU to be
added to the rto_mask) rely on rt_rq->rt_nr_migratory instead.

nr_migratory is increased whenever an RT task entity is enqueued and it has
nr_cpus_allowed > 1. Unlike the pushable_tasks list, nr_migratory includes a
rt_rq's current task. This means a rt_rq can have a migratible current, N
non-migratible queued tasks, and be flagged as overloaded / have its CPU
set in the rto_mask, despite having an empty pushable_tasks list.

Make an rt_rq's overload logic be driven by {enqueue,dequeue}_pushable_task().
Since rt_rq->{rt_nr_migratory,rt_nr_total} become unused, remove them.

Note that the case where the current task is pushed away to make way for a
migration-disabled task remains unchanged: the migration-disabled task has
to be in the pushable_tasks list in the first place, which means it has
nr_cpus_allowed > 1.

Link: http://lore.kernel.org/r/[email protected]
Reported-by: Sebastian Andrzej Siewior <[email protected]>
Signed-off-by: Valentin Schneider <[email protected]>
---
This is lightly tested; it looks to be working OK, but I don't have, nor am
I aware of, a test case for RT balancing. I suppose we want something that
asserts we always run the N highest-prio tasks for N CPUs, with a small
margin for migrations?
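
A rough userspace sketch of that kind of assertion (purely illustrative and
untested; the priority values, the 2-second window and the /proc/<pid>/stat
utime check are arbitrary illustrative choices, not an established test):

/*
 * Illustrative sketch only: with nr_cpus + 1 busy SCHED_FIFO spinners,
 * the nr_cpus highest-priority ones should accumulate (nearly) all of
 * the CPU time and the lowest-priority one (nearly) none.  Needs root.
 */
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

static void set_fifo(int prio)
{
	struct sched_param sp = { .sched_priority = prio };

	if (sched_setscheduler(0, SCHED_FIFO, &sp)) {
		perror("sched_setscheduler");
		exit(1);
	}
}

static pid_t spawn_spinner(int prio)
{
	pid_t pid = fork();

	if (pid == 0) {
		set_fifo(prio);
		for (;;)
			;	/* burn CPU */
	}
	return pid;
}

/* utime is field 14 of /proc/<pid>/stat */
static long task_utime(pid_t pid)
{
	char path[64];
	long utime = -1;
	FILE *f;

	snprintf(path, sizeof(path), "/proc/%d/stat", pid);
	f = fopen(path, "r");
	if (f) {
		fscanf(f, "%*d %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %ld",
		       &utime);
		fclose(f);
	}
	return utime;
}

int main(void)
{
	int ncpus = sysconf(_SC_NPROCESSORS_ONLN);
	long tick = sysconf(_SC_CLK_TCK);
	int nr = ncpus + 1, i, ret = 0;
	pid_t pids[nr];

	set_fifo(99);		/* keep the checker itself from being starved */

	/* priorities 50 (lowest) .. 50 + ncpus (highest) */
	for (i = 0; i < nr; i++)
		pids[i] = spawn_spinner(50 + i);

	sleep(2);		/* let the push/pull balancing settle */

	/* only the lowest-prio spinner should have been left waiting */
	if (task_utime(pids[0]) > tick / 2) {
		printf("FAIL: lowest-prio spinner got CPU time\n");
		ret = 1;
	}
	for (i = 1; i < nr; i++) {
		if (task_utime(pids[i]) < tick) {
			printf("FAIL: prio %d spinner starved\n", 50 + i);
			ret = 1;
		}
	}

	for (i = 0; i < nr; i++)
		kill(pids[i], SIGKILL);

	return ret;
}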
---
kernel/sched/debug.c | 3 --
kernel/sched/rt.c | 70 +++++++-------------------------------------
kernel/sched/sched.h | 2 --
3 files changed, 10 insertions(+), 65 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4c3d0d9f3db63..b4f6fb592a123 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -724,9 +724,6 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))

PU(rt_nr_running);
-#ifdef CONFIG_SMP
- PU(rt_nr_migratory);
-#endif
P(rt_throttled);
PN(rt_time);
PN(rt_runtime);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 00e0e50741153..12100f3b6e5f2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -143,7 +143,6 @@ void init_rt_rq(struct rt_rq *rt_rq)
#if defined CONFIG_SMP
rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
rt_rq->highest_prio.next = MAX_RT_PRIO-1;
- rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
#endif /* CONFIG_SMP */
@@ -358,53 +357,6 @@ static inline void rt_clear_overload(struct rq *rq)
cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}

-static void update_rt_migration(struct rt_rq *rt_rq)
-{
- if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
- if (!rt_rq->overloaded) {
- rt_set_overload(rq_of_rt_rq(rt_rq));
- rt_rq->overloaded = 1;
- }
- } else if (rt_rq->overloaded) {
- rt_clear_overload(rq_of_rt_rq(rt_rq));
- rt_rq->overloaded = 0;
- }
-}
-
-static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
- struct task_struct *p;
-
- if (!rt_entity_is_task(rt_se))
- return;
-
- p = rt_task_of(rt_se);
- rt_rq = &rq_of_rt_rq(rt_rq)->rt;
-
- rt_rq->rt_nr_total++;
- if (p->nr_cpus_allowed > 1)
- rt_rq->rt_nr_migratory++;
-
- update_rt_migration(rt_rq);
-}
-
-static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
- struct task_struct *p;
-
- if (!rt_entity_is_task(rt_se))
- return;
-
- p = rt_task_of(rt_se);
- rt_rq = &rq_of_rt_rq(rt_rq)->rt;
-
- rt_rq->rt_nr_total--;
- if (p->nr_cpus_allowed > 1)
- rt_rq->rt_nr_migratory--;
-
- update_rt_migration(rt_rq);
-}
-
static inline int has_pushable_tasks(struct rq *rq)
{
return !plist_head_empty(&rq->rt.pushable_tasks);
@@ -438,6 +390,11 @@ static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
/* Update the highest prio pushable task */
if (p->prio < rq->rt.highest_prio.next)
rq->rt.highest_prio.next = p->prio;
+
+ if (!rq->rt.overloaded) {
+ rt_set_overload(rq);
+ rq->rt.overloaded = 1;
+ }
}

static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -451,6 +408,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
rq->rt.highest_prio.next = p->prio;
} else {
rq->rt.highest_prio.next = MAX_RT_PRIO-1;
+
+ if (rq->rt.overloaded) {
+ rt_clear_overload(rq);
+ rq->rt.overloaded = 0;
+ }
}
}

@@ -464,16 +426,6 @@ static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

-static inline
-void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-}
-
-static inline
-void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-}
-
static inline void rt_queue_push_tasks(struct rq *rq)
{
}
@@ -1281,7 +1233,6 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);

inc_rt_prio(rt_rq, prio);
- inc_rt_migration(rt_se, rt_rq);
inc_rt_group(rt_se, rt_rq);
}

@@ -1294,7 +1245,6 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);

dec_rt_prio(rt_rq, rt_se_prio(rt_se));
- dec_rt_migration(rt_se, rt_rq);
dec_rt_group(rt_se, rt_rq);
}

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9c5035ca3b06d..bea6a9ec8cde0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -675,8 +675,6 @@ struct rt_rq {
} highest_prio;
#endif
#ifdef CONFIG_SMP
- unsigned int rt_nr_migratory;
- unsigned int rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;

--
2.31.1



Subject: Re: [PATCH] sched/rt: Make rt_rq->pushable_tasks updates drive rto_mask

On 2023-09-11 12:54:50 [+0200], Valentin Schneider wrote:
> Ok, back to this :)
>
> On 15/08/23 16:21, Sebastian Andrzej Siewior wrote:
> > What I still observe is:
> > - CPU0 is idle. CPU0 gets a task assigned from CPU1. That task receives
> > a wakeup. CPU0 returns from idle and schedules the task.
> > pull_rt_task() on CPU1, and sometimes on other CPUs, observes this, too.
> > CPU1 sends irq_work to CPU0 while at that time rto_next_cpu() sees
> > has_pushable_tasks() return 0. That bit was cleared earlier (as per
> > tracing).
> >
> > - CPU0 is idle. CPU0 gets a task assigned from CPU1. The task on CPU0 is
> > woken up without an IPI (yay). But then pull_rt_task() decides to
> > send irq_work, and has_pushable_tasks() said it has tasks left, so…
> > Now: rto_push_irq_work_func() runs once on CPU0, does nothing,
> > rto_next_cpu() returns CPU0 again and enqueues itself again on CPU0.
> > Usually after the second or third round the scheduler on CPU0 makes
> > enough progress to remove the task / clear the CPU from the mask.
> >
>
> If CPU0 is selected for the push IPI, then we should have
>
> rd->rto_cpu == CPU0
>
> So per the
>
> cpumask_next(rd->rto_cpu, rd->rto_mask);
>
> in rto_next_cpu(), it shouldn't be able to re-select itself.
>
> Do you have a simple enough reproducer I could use to poke at this?

Not really a reproducer. What I had earlier was a high-priority RT task
(ntpsec at prio 99) and cyclictest below it (prio 90), plus PREEMPT_RT,
which adds a few tasks (due to threaded interrupts).
Then I added trace-printks to observe. Initially I had latency spikes due
to ntpsec, but also a bunch of IRQ-work IPIs which I decided to look at.

> > I understand that there is a race and the CPU is cleared from rto_mask
> > shortly after checking. Therefore I would suggest looking at
> > has_pushable_tasks() before returning a CPU in rto_next_cpu(), as I did,
> > just to avoid the interruption, which does nothing.
> >
> > For the second case the irq_work seems to make no progress. I don't see
> > any trace_events in hardirq; the mask is cleared outside hardirq (idle
> > code). The NEED_RESCHED bit is set for current, therefore it doesn't make
> > sense to send irq_work to reschedule if current already has this on
> > its agenda.
> >
> > So what about something like:
> >
> > diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> > index 00e0e50741153..d963408855e25 100644
> > --- a/kernel/sched/rt.c
> > +++ b/kernel/sched/rt.c
> > @@ -2247,8 +2247,23 @@ static int rto_next_cpu(struct root_domain *rd)
> >
> > rd->rto_cpu = cpu;
> >
> > - if (cpu < nr_cpu_ids)
> > + if (cpu < nr_cpu_ids) {
> > + struct task_struct *t;
> > +
> > + if (!has_pushable_tasks(cpu_rq(cpu)))
> > + continue;
> > +
>
> IIUC that's just to plug the race between the CPU emptying its
> pushable_tasks list and it removing itself from the rto_mask - that looks
> fine to me.
>
> > + rcu_read_lock();
> > + t = rcu_dereference(rq->curr);
> > + /* if (test_preempt_need_resched_cpu(cpu_rq(cpu))) */
> > + if (test_tsk_need_resched(t)) {
>
> We need to make sure this doesn't cause us to lose IPIs we actually need.
>
> We do have a call to put_prev_task_balance() through entering __schedule()
> if the previous task is RT/DL, and balance_rt() can issue a push
> IPI, but AFAICT only if the previous task was the last DL task. So I don't
> think we can do this.

I observed that the CPU / the task on that CPU already had the need-resched
bit set, so a task switch is in progress. Therefore it looks like any
further IPIs are needless: the IRQ-work IPIs just "leave early" via
resched_curr() and don't do anything useful. So they don't contribute
anything, but they do keep the CPU from making progress and performing the
actual context switch.

> > + rcu_read_unlock();
> > + continue;
> > + }
> > + rcu_read_unlock();
> > +
> > return cpu;
> > + }
> >
> > rd->rto_cpu = -1;
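
As a reference for the exchange above, a condensed paraphrase of the
push-IPI chain in question (based on kernel/sched/rt.c, but not verbatim:
the rto_loop restart detection, rd refcounting and locking details are
elided, so treat names and structure as approximate):

/* Condensed paraphrase of the push-IPI chain, not the verbatim code. */
static int rto_next_cpu(struct root_domain *rd)
{
	int cpu;

	/* rto_cpu == -1 makes this act like cpumask_first() */
	cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
	rd->rto_cpu = cpu;

	if (cpu < nr_cpu_ids)
		return cpu;		/* next CPU to IPI */

	rd->rto_cpu = -1;
	/* (restart detection via rto_loop/rto_loop_next elided) */
	return -1;
}

void rto_push_irq_work_func(struct irq_work *work)
{
	struct root_domain *rd =
		container_of(work, struct root_domain, rto_push_work);
	struct rq *rq = this_rq();
	int cpu;

	/* Push whatever is pushable on this CPU... */
	if (has_pushable_tasks(rq)) {
		raw_spin_rq_lock(rq);
		while (push_rt_task(rq, true))
			;
		raw_spin_rq_unlock(rq);
	}

	/* ...then forward the IPI to the next CPU set in rto_mask. */
	cpu = rto_next_cpu(rd);
	if (cpu >= 0)
		irq_work_queue_on(&rd->rto_push_work, cpu);
}

The hunk suggested above adds the has_pushable_tasks() and need-resched
checks inside rto_next_cpu() before it returns a CPU, so hops that would
only "leave early" are skipped.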

Sebastian

Subject: [tip: sched/core] sched/rt: Make rt_rq->pushable_tasks updates drive rto_mask

The following commit has been merged into the sched/core branch of tip:

Commit-ID: 612f769edd06a6e42f7cd72425488e68ddaeef0a
Gitweb: https://git.kernel.org/tip/612f769edd06a6e42f7cd72425488e68ddaeef0a
Author: Valentin Schneider <[email protected]>
AuthorDate: Fri, 11 Aug 2023 12:20:44 +01:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Mon, 25 Sep 2023 10:25:29 +02:00

sched/rt: Make rt_rq->pushable_tasks updates drive rto_mask

Sebastian noted that the rto_push_work IRQ work can be queued for a CPU
that has an empty pushable_tasks list, which means nothing useful will be
done in the IPI other than queue the work for the next CPU on the rto_mask.

rto_push_irq_work_func() only operates on tasks in the pushable_tasks list,
but the conditions for that irq_work to be queued (and for a CPU to be
added to the rto_mask) rely on rt_rq->rt_nr_migratory instead.

nr_migratory is increased whenever an RT task entity is enqueued and it has
nr_cpus_allowed > 1. Unlike the pushable_tasks list, nr_migratory includes a
rt_rq's current task. This means a rt_rq can have a migratible current, N
non-migratible queued tasks, and be flagged as overloaded / have its CPU
set in the rto_mask, despite having an empty pushable_tasks list.

Make an rt_rq's overload logic be driven by {enqueue,dequeue}_pushable_task().
Since rt_rq->{rt_nr_migratory,rt_nr_total} become unused, remove them.

Note that the case where the current task is pushed away to make way for a
migration-disabled task remains unchanged: the migration-disabled task has
to be in the pushable_tasks list in the first place, which means it has
nr_cpus_allowed > 1.

Reported-by: Sebastian Andrzej Siewior <[email protected]>
Signed-off-by: Valentin Schneider <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Tested-by: Sebastian Andrzej Siewior <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
kernel/sched/debug.c | 3 +--
kernel/sched/rt.c | 70 ++++++-------------------------------------
kernel/sched/sched.h | 2 +-
3 files changed, 10 insertions(+), 65 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 5e34a8c..c4253bd 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -724,9 +724,6 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))

PU(rt_nr_running);
-#ifdef CONFIG_SMP
- PU(rt_nr_migratory);
-#endif
P(rt_throttled);
PN(rt_time);
PN(rt_runtime);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 3e442fa..3b627ab 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -143,7 +143,6 @@ void init_rt_rq(struct rt_rq *rt_rq)
#if defined CONFIG_SMP
rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
rt_rq->highest_prio.next = MAX_RT_PRIO-1;
- rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
#endif /* CONFIG_SMP */
@@ -358,53 +357,6 @@ static inline void rt_clear_overload(struct rq *rq)
cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}

-static void update_rt_migration(struct rt_rq *rt_rq)
-{
- if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
- if (!rt_rq->overloaded) {
- rt_set_overload(rq_of_rt_rq(rt_rq));
- rt_rq->overloaded = 1;
- }
- } else if (rt_rq->overloaded) {
- rt_clear_overload(rq_of_rt_rq(rt_rq));
- rt_rq->overloaded = 0;
- }
-}
-
-static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
- struct task_struct *p;
-
- if (!rt_entity_is_task(rt_se))
- return;
-
- p = rt_task_of(rt_se);
- rt_rq = &rq_of_rt_rq(rt_rq)->rt;
-
- rt_rq->rt_nr_total++;
- if (p->nr_cpus_allowed > 1)
- rt_rq->rt_nr_migratory++;
-
- update_rt_migration(rt_rq);
-}
-
-static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
- struct task_struct *p;
-
- if (!rt_entity_is_task(rt_se))
- return;
-
- p = rt_task_of(rt_se);
- rt_rq = &rq_of_rt_rq(rt_rq)->rt;
-
- rt_rq->rt_nr_total--;
- if (p->nr_cpus_allowed > 1)
- rt_rq->rt_nr_migratory--;
-
- update_rt_migration(rt_rq);
-}
-
static inline int has_pushable_tasks(struct rq *rq)
{
return !plist_head_empty(&rq->rt.pushable_tasks);
@@ -438,6 +390,11 @@ static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
/* Update the highest prio pushable task */
if (p->prio < rq->rt.highest_prio.next)
rq->rt.highest_prio.next = p->prio;
+
+ if (!rq->rt.overloaded) {
+ rt_set_overload(rq);
+ rq->rt.overloaded = 1;
+ }
}

static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -451,6 +408,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
rq->rt.highest_prio.next = p->prio;
} else {
rq->rt.highest_prio.next = MAX_RT_PRIO-1;
+
+ if (rq->rt.overloaded) {
+ rt_clear_overload(rq);
+ rq->rt.overloaded = 0;
+ }
}
}

@@ -464,16 +426,6 @@ static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

-static inline
-void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-}
-
-static inline
-void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-}
-
static inline void rt_queue_push_tasks(struct rq *rq)
{
}
@@ -1281,7 +1233,6 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);

inc_rt_prio(rt_rq, prio);
- inc_rt_migration(rt_se, rt_rq);
inc_rt_group(rt_se, rt_rq);
}

@@ -1294,7 +1245,6 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);

dec_rt_prio(rt_rq, rt_se_prio(rt_se));
- dec_rt_migration(rt_se, rt_rq);
dec_rt_group(rt_se, rt_rq);
}

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 96f8ab7..41d760d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -663,8 +663,6 @@ struct rt_rq {
} highest_prio;
#endif
#ifdef CONFIG_SMP
- unsigned int rt_nr_migratory;
- unsigned int rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;

2023-09-25 10:22:34

by Peter Zijlstra

Subject: Re: [tip: sched/core] sched/rt: Make rt_rq->pushable_tasks updates drive rto_mask

On Mon, Sep 25, 2023 at 08:55:10AM -0000, tip-bot2 for Valentin Schneider wrote:
> The following commit has been merged into the sched/core branch of tip:
>
> Commit-ID: 612f769edd06a6e42f7cd72425488e68ddaeef0a
> Gitweb: https://git.kernel.org/tip/612f769edd06a6e42f7cd72425488e68ddaeef0a
> Author: Valentin Schneider <[email protected]>
> AuthorDate: Fri, 11 Aug 2023 12:20:44 +01:00
> Committer: Ingo Molnar <[email protected]>
> CommitterDate: Mon, 25 Sep 2023 10:25:29 +02:00
>
> sched/rt: Make rt_rq->pushable_tasks updates drive rto_mask
>
> Sebastian noted that the rto_push_work IRQ work can be queued for a CPU
> that has an empty pushable_tasks list, which means nothing useful will be
> done in the IPI other than queue the work for the next CPU on the rto_mask.
>
> rto_push_irq_work_func() only operates on tasks in the pushable_tasks list,
> but the conditions for that irq_work to be queued (and for a CPU to be
> added to the rto_mask) rely on rq_rt->nr_migratory instead.
>
> nr_migratory is increased whenever an RT task entity is enqueued and it has
> nr_cpus_allowed > 1. Unlike the pushable_tasks list, nr_migratory includes a
> rt_rq's current task. This means a rt_rq can have a migratible current, N
> non-migratible queued tasks, and be flagged as overloaded / have its CPU
> set in the rto_mask, despite having an empty pushable_tasks list.
>
> Make an rt_rq's overload logic be driven by {enqueue,dequeue}_pushable_task().
> Since rt_rq->{rt_nr_migratory,rt_nr_total} become unused, remove them.
>
> Note that the case where the current task is pushed away to make way for a
> migration-disabled task remains unchanged: the migration-disabled task has
> to be in the pushable_tasks list in the first place, which means it has
> nr_cpus_allowed > 1.
>
> Reported-by: Sebastian Andrzej Siewior <[email protected]>
> Signed-off-by: Valentin Schneider <[email protected]>
> Signed-off-by: Ingo Molnar <[email protected]>
> Tested-by: Sebastian Andrzej Siewior <[email protected]>
> Link: https://lore.kernel.org/r/[email protected]
> ---
> kernel/sched/debug.c | 3 +--
> kernel/sched/rt.c | 70 ++++++-------------------------------------
> kernel/sched/sched.h | 2 +-
> 3 files changed, 10 insertions(+), 65 deletions(-)
>

> @@ -358,53 +357,6 @@ static inline void rt_clear_overload(struct rq *rq)
> cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
> }
>
> -static void update_rt_migration(struct rt_rq *rt_rq)
> -{
> - if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
> - if (!rt_rq->overloaded) {
> - rt_set_overload(rq_of_rt_rq(rt_rq));
> - rt_rq->overloaded = 1;
> - }
> - } else if (rt_rq->overloaded) {
> - rt_clear_overload(rq_of_rt_rq(rt_rq));
> - rt_rq->overloaded = 0;
> - }
> -}
> -
> -static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
> -{
> - struct task_struct *p;
> -
> - if (!rt_entity_is_task(rt_se))
> - return;
> -
> - p = rt_task_of(rt_se);
> - rt_rq = &rq_of_rt_rq(rt_rq)->rt;
> -
> - rt_rq->rt_nr_total++;
> - if (p->nr_cpus_allowed > 1)
> - rt_rq->rt_nr_migratory++;
> -
> - update_rt_migration(rt_rq);
> -}
> -
> -static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
> -{
> - struct task_struct *p;
> -
> - if (!rt_entity_is_task(rt_se))
> - return;
> -
> - p = rt_task_of(rt_se);
> - rt_rq = &rq_of_rt_rq(rt_rq)->rt;
> -
> - rt_rq->rt_nr_total--;
> - if (p->nr_cpus_allowed > 1)
> - rt_rq->rt_nr_migratory--;
> -
> - update_rt_migration(rt_rq);
> -}

sched/deadline.c has something very similar, does that need updating
too?

2023-09-25 13:10:59

by Valentin Schneider

Subject: Re: [PATCH] sched/rt: Make rt_rq->pushable_tasks updates drive rto_mask

On 25/09/23 10:27, Ingo Molnar wrote:
> * Sebastian Andrzej Siewior <[email protected]> wrote:
>
>> On 2023-08-11 12:20:44 [+0100], Valentin Schneider wrote:
>> > Sebastian noted that the rto_push_work IRQ work can be queued for a CPU
>> > that has an empty pushable_tasks list, which means nothing useful will be
>> > done in the IPI other than queue the work for the next CPU on the rto_mask.
>> >
>> > rto_push_irq_work_func() only operates on tasks in the pushable_tasks list,
>> > but the conditions for that irq_work to be queued (and for a CPU to be
>> > added to the rto_mask) rely on rq_rt->nr_migratory instead.
>> >
>> > nr_migratory is increased whenever an RT task entity is enqueued and it has
>> > nr_cpus_allowed > 1. Unlike the pushable_tasks list, nr_migratory includes a
>> > rt_rq's current task. This means a rt_rq can have a migratible current, N
>> > non-migratible queued tasks, and be flagged as overloaded / have its CPU
>> > set in the rto_mask, despite having an empty pushable_tasks list.
>> >
>> > Make an rt_rq's overload logic be driven by {enqueue,dequeue}_pushable_task().
>> > Since rt_rq->{rt_nr_migratory,rt_nr_total} become unused, remove them.
>> >
>> > Note that the case where the current task is pushed away to make way for a
>> > migration-disabled task remains unchanged: the migration-disabled task has
>> > to be in the pushable_tasks list in the first place, which means it has
>> > nr_cpus_allowed > 1.
>> >
>> > Link: http://lore.kernel.org/r/[email protected]
>> > Reported-by: Sebastian Andrzej Siewior <[email protected]>
>> > Signed-off-by: Valentin Schneider <[email protected]>
>> > ---
>> > This is lightly tested, this looks to be working OK but I don't have nor am
>> > I aware of a test case for RT balancing, I suppose we want something that
>> > asserts we always run the N highest prio tasks for N CPUs, with a small
>> > margin for migrations?
>>
>> I don't see the storm of IPIs I saw before. So as far that goes:
>> Tested-by: Sebastian Andrzej Siewior <[email protected]>
>
> I've applied Valentin's initial fix to tip:sched/core, for an eventual
> v6.7 merge, as it addresses the IPI storm bug. Let me know if merging
> this is not desirable for some reason.
>
>> What I still observe is:
>> - CPU0 is idle. CPU0 gets a task assigned from CPU1. That task receives
>> a wakeup. CPU0 returns from idle and schedules the task.
>> pull_rt_task() on CPU1, and sometimes on other CPUs, observes this, too.
>> CPU1 sends irq_work to CPU0 while at that time rto_next_cpu() sees
>> has_pushable_tasks() return 0. That bit was cleared earlier (as per
>> tracing).
>>
>> - CPU0 is idle. CPU0 gets a task assigned from CPU1. The task on CPU0 is
>> woken up without an IPI (yay). But then pull_rt_task() decides to
>> send irq_work, and has_pushable_tasks() said it has tasks left, so…
>> Now: rto_push_irq_work_func() runs once on CPU0, does nothing,
>> rto_next_cpu() returns CPU0 again and enqueues itself again on CPU0.
>> Usually after the second or third round the scheduler on CPU0 makes
>> enough progress to remove the task / clear the CPU from the mask.
>
> Just curious, any progress on solving this?
>

On my side not really; I need to stop getting distracted and probably get
this to reproduce on a system so I can understand it by tracing.

> Thanks,
>
> Ingo

2023-09-25 13:59:03

by Ingo Molnar

Subject: Re: [PATCH] sched/rt: Make rt_rq->pushable_tasks updates drive rto_mask


* Sebastian Andrzej Siewior <[email protected]> wrote:

> On 2023-08-11 12:20:44 [+0100], Valentin Schneider wrote:
> > Sebastian noted that the rto_push_work IRQ work can be queued for a CPU
> > that has an empty pushable_tasks list, which means nothing useful will be
> > done in the IPI other than queue the work for the next CPU on the rto_mask.
> >
> > rto_push_irq_work_func() only operates on tasks in the pushable_tasks list,
> > but the conditions for that irq_work to be queued (and for a CPU to be
> > added to the rto_mask) rely on rq_rt->nr_migratory instead.
> >
> > nr_migratory is increased whenever an RT task entity is enqueued and it has
> > nr_cpus_allowed > 1. Unlike the pushable_tasks list, nr_migratory includes a
> > rt_rq's current task. This means a rt_rq can have a migratible current, N
> > non-migratible queued tasks, and be flagged as overloaded / have its CPU
> > set in the rto_mask, despite having an empty pushable_tasks list.
> >
> > Make an rt_rq's overload logic be driven by {enqueue,dequeue}_pushable_task().
> > Since rt_rq->{rt_nr_migratory,rt_nr_total} become unused, remove them.
> >
> > Note that the case where the current task is pushed away to make way for a
> > migration-disabled task remains unchanged: the migration-disabled task has
> > to be in the pushable_tasks list in the first place, which means it has
> > nr_cpus_allowed > 1.
> >
> > Link: http://lore.kernel.org/r/[email protected]
> > Reported-by: Sebastian Andrzej Siewior <[email protected]>
> > Signed-off-by: Valentin Schneider <[email protected]>
> > ---
> > This is lightly tested, this looks to be working OK but I don't have nor am
> > I aware of a test case for RT balancing, I suppose we want something that
> > asserts we always run the N highest prio tasks for N CPUs, with a small
> > margin for migrations?
>
> I don't see the storm of IPIs I saw before. So as far that goes:
> Tested-by: Sebastian Andrzej Siewior <[email protected]>

I've applied Valentin's initial fix to tip:sched/core, for an eventual
v6.7 merge, as it addresses the IPI storm bug. Let me know if merging
this is not desirable for some reason.

> What I still observe is:
> - CPU0 is idle. CPU0 gets a task assigned from CPU1. That task receives
> a wakeup. CPU0 returns from idle and schedules the task.
> pull_rt_task() on CPU1, and sometimes on other CPUs, observes this, too.
> CPU1 sends irq_work to CPU0 while at that time rto_next_cpu() sees
> has_pushable_tasks() return 0. That bit was cleared earlier (as per
> tracing).
>
> - CPU0 is idle. CPU0 gets a task assigned from CPU1. The task on CPU0 is
> woken up without an IPI (yay). But then pull_rt_task() decides to
> send irq_work, and has_pushable_tasks() said it has tasks left, so…
> Now: rto_push_irq_work_func() runs once on CPU0, does nothing,
> rto_next_cpu() returns CPU0 again and enqueues itself again on CPU0.
> Usually after the second or third round the scheduler on CPU0 makes
> enough progress to remove the task / clear the CPU from the mask.

Just curious, any progress on solving this?

Thanks,

Ingo

2023-09-25 17:25:32

by Valentin Schneider

Subject: Re: [tip: sched/core] sched/rt: Make rt_rq->pushable_tasks updates drive rto_mask

On 25/09/23 12:11, Peter Zijlstra wrote:
> On Mon, Sep 25, 2023 at 08:55:10AM -0000, tip-bot2 for Valentin Schneider wrote:
>> The following commit has been merged into the sched/core branch of tip:
>>
>> Commit-ID: 612f769edd06a6e42f7cd72425488e68ddaeef0a
>> Gitweb: https://git.kernel.org/tip/612f769edd06a6e42f7cd72425488e68ddaeef0a
>> Author: Valentin Schneider <[email protected]>
>> AuthorDate: Fri, 11 Aug 2023 12:20:44 +01:00
>> Committer: Ingo Molnar <[email protected]>
>> CommitterDate: Mon, 25 Sep 2023 10:25:29 +02:00
>>
>> sched/rt: Make rt_rq->pushable_tasks updates drive rto_mask
>>
>> Sebastian noted that the rto_push_work IRQ work can be queued for a CPU
>> that has an empty pushable_tasks list, which means nothing useful will be
>> done in the IPI other than queue the work for the next CPU on the rto_mask.
>>
>> rto_push_irq_work_func() only operates on tasks in the pushable_tasks list,
>> but the conditions for that irq_work to be queued (and for a CPU to be
>> added to the rto_mask) rely on rq_rt->nr_migratory instead.
>>
>> nr_migratory is increased whenever an RT task entity is enqueued and it has
>> nr_cpus_allowed > 1. Unlike the pushable_tasks list, nr_migratory includes a
>> rt_rq's current task. This means a rt_rq can have a migratible current, N
>> non-migratible queued tasks, and be flagged as overloaded / have its CPU
>> set in the rto_mask, despite having an empty pushable_tasks list.
>>
>> Make an rt_rq's overload logic be driven by {enqueue,dequeue}_pushable_task().
>> Since rt_rq->{rt_nr_migratory,rt_nr_total} become unused, remove them.
>>
>> Note that the case where the current task is pushed away to make way for a
>> migration-disabled task remains unchanged: the migration-disabled task has
>> to be in the pushable_tasks list in the first place, which means it has
>> nr_cpus_allowed > 1.
>>
>> Reported-by: Sebastian Andrzej Siewior <[email protected]>
>> Signed-off-by: Valentin Schneider <[email protected]>
>> Signed-off-by: Ingo Molnar <[email protected]>
>> Tested-by: Sebastian Andrzej Siewior <[email protected]>
>> Link: https://lore.kernel.org/r/[email protected]
>> ---
>> kernel/sched/debug.c | 3 +--
>> kernel/sched/rt.c | 70 ++++++-------------------------------------
>> kernel/sched/sched.h | 2 +-
>> 3 files changed, 10 insertions(+), 65 deletions(-)
>>
>
>> @@ -358,53 +357,6 @@ static inline void rt_clear_overload(struct rq *rq)
>> cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
>> }
>>
>> -static void update_rt_migration(struct rt_rq *rt_rq)
>> -{
>> - if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
>> - if (!rt_rq->overloaded) {
>> - rt_set_overload(rq_of_rt_rq(rt_rq));
>> - rt_rq->overloaded = 1;
>> - }
>> - } else if (rt_rq->overloaded) {
>> - rt_clear_overload(rq_of_rt_rq(rt_rq));
>> - rt_rq->overloaded = 0;
>> - }
>> -}
>> -
>> -static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
>> -{
>> - struct task_struct *p;
>> -
>> - if (!rt_entity_is_task(rt_se))
>> - return;
>> -
>> - p = rt_task_of(rt_se);
>> - rt_rq = &rq_of_rt_rq(rt_rq)->rt;
>> -
>> - rt_rq->rt_nr_total++;
>> - if (p->nr_cpus_allowed > 1)
>> - rt_rq->rt_nr_migratory++;
>> -
>> - update_rt_migration(rt_rq);
>> -}
>> -
>> -static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
>> -{
>> - struct task_struct *p;
>> -
>> - if (!rt_entity_is_task(rt_se))
>> - return;
>> -
>> - p = rt_task_of(rt_se);
>> - rt_rq = &rq_of_rt_rq(rt_rq)->rt;
>> -
>> - rt_rq->rt_nr_total--;
>> - if (p->nr_cpus_allowed > 1)
>> - rt_rq->rt_nr_migratory--;
>> -
>> - update_rt_migration(rt_rq);
>> -}
>
> sched/deadline.c has something very similar, does that need updating
> too?

Hm I think so yes:
- push_dl_task() is an obvious noop if the pushable tree is empty
- pull_dl_task() can be kicked if !rq->dl.overloaded, which, similarly to rt,
is driven by nr_migratory but could be boiled down to having pushable
tasks (due to the nr_cpus_allowed > 1 constraint); see the sketch below.

Lemme poke at it.
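
A rough sketch of what the analogous deadline.c change might look like,
simply mirroring the rt.c hunks above (illustrative only; the real change
may well differ in detail):

/* Sketch: drive dl_rq->overloaded from the pushable_dl_tasks updates,
 * as the rt.c patch above does for rt_rq->overloaded. */
static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
{
	/* ... existing rb-tree insertion of p into pushable_dl_tasks ... */

	if (!rq->dl.overloaded) {
		dl_set_overload(rq);
		rq->dl.overloaded = 1;
	}
}

static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
{
	/* ... existing rb-tree removal of p from pushable_dl_tasks ... */

	if (!has_pushable_dl_tasks(rq) && rq->dl.overloaded) {
		dl_clear_overload(rq);
		rq->dl.overloaded = 0;
	}
}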

2023-09-29 04:51:40

by Ingo Molnar

Subject: Re: [tip: sched/core] sched/rt: Make rt_rq->pushable_tasks updates drive rto_mask


* Valentin Schneider <[email protected]> wrote:

> > sched/deadline.c has something very similar, does that need updating
> > too?
>
> Hm I think so yes:
> - push_dl_task() is an obvious noop if the pushable tree is empty
> - pull_dl_task() can be kicked if !rq->dl.overloaded, which similarly to rt
> is driven by nr_migratory but could be boiled down to having pushable
> tasks (due to the nr_cpus_allowed > 1 constraint).
>
> Lemme poke at it.

For reference, the matching DL change now lives in tip:sched/core as:

177f608a4c82 ("sched/deadline: Make dl_rq->pushable_dl_tasks update drive dl_rq->overloaded")

Thanks,

Ingo