From: Peter Zijlstra <[email protected]>
Because sched_class::pick_next_task() also implies
sched_class::set_next_task() (and possibly put_prev_task() and
newidle_balance()), it is not state invariant. This makes it unsuitable
for remote task selection.
Tested-by: Julien Desfossez <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Vineeth Remanan Pillai <[email protected]>
Signed-off-by: Julien Desfossez <[email protected]>
Signed-off-by: Joel Fernandes (Google) <[email protected]>
---
 kernel/sched/deadline.c  | 16 ++++++++++++++--
 kernel/sched/fair.c      | 32 +++++++++++++++++++++++++++++++-
 kernel/sched/idle.c      |  8 ++++++++
 kernel/sched/rt.c        | 14 ++++++++++++--
 kernel/sched/sched.h     |  3 +++
 kernel/sched/stop_task.c | 13 +++++++++++--
 6 files changed, 79 insertions(+), 7 deletions(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 814ec49502b1..0271a7848ab3 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1848,7 +1848,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
 	return rb_entry(left, struct sched_dl_entity, rb_node);
 }
 
-static struct task_struct *pick_next_task_dl(struct rq *rq)
+static struct task_struct *pick_task_dl(struct rq *rq)
 {
 	struct sched_dl_entity *dl_se;
 	struct dl_rq *dl_rq = &rq->dl;
@@ -1860,7 +1860,18 @@ static struct task_struct *pick_next_task_dl(struct rq *rq)
 	dl_se = pick_next_dl_entity(rq, dl_rq);
 	BUG_ON(!dl_se);
 	p = dl_task_of(dl_se);
-	set_next_task_dl(rq, p, true);
+
+	return p;
+}
+
+static struct task_struct *pick_next_task_dl(struct rq *rq)
+{
+	struct task_struct *p;
+
+	p = pick_task_dl(rq);
+	if (p)
+		set_next_task_dl(rq, p, true);
+
 	return p;
 }
 
@@ -2517,6 +2528,7 @@ const struct sched_class dl_sched_class
 
 #ifdef CONFIG_SMP
 	.balance		= balance_dl,
+	.pick_task		= pick_task_dl,
 	.select_task_rq		= select_task_rq_dl,
 	.migrate_task_rq	= migrate_task_rq_dl,
 	.set_cpus_allowed	= set_cpus_allowed_dl,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dbd9368a959d..bd6aed63f5e3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4450,7 +4450,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 	 * Avoid running the skip buddy, if running something else can
 	 * be done without getting too unfair.
 	 */
-	if (cfs_rq->skip == se) {
+	if (cfs_rq->skip && cfs_rq->skip == se) {
 		struct sched_entity *second;
 
 		if (se == curr) {
@@ -6976,6 +6976,35 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	set_last_buddy(se);
 }
 
+#ifdef CONFIG_SMP
+static struct task_struct *pick_task_fair(struct rq *rq)
+{
+	struct cfs_rq *cfs_rq = &rq->cfs;
+	struct sched_entity *se;
+
+	if (!cfs_rq->nr_running)
+		return NULL;
+
+	do {
+		struct sched_entity *curr = cfs_rq->curr;
+
+		se = pick_next_entity(cfs_rq, NULL);
+
+		if (curr) {
+			if (se && curr->on_rq)
+				update_curr(cfs_rq);
+
+			if (!se || entity_before(curr, se))
+				se = curr;
+		}
+
+		cfs_rq = group_cfs_rq(se);
+	} while (cfs_rq);
+
+	return task_of(se);
+}
+#endif
+
 struct task_struct *
 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
@@ -11173,6 +11202,7 @@ const struct sched_class fair_sched_class
 
 #ifdef CONFIG_SMP
 	.balance		= balance_fair,
+	.pick_task		= pick_task_fair,
 	.select_task_rq		= select_task_rq_fair,
 	.migrate_task_rq	= migrate_task_rq_fair,
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8ce6e80352cf..ce7552c6bc65 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -405,6 +405,13 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir
 	schedstat_inc(rq->sched_goidle);
 }
 
+#ifdef CONFIG_SMP
+static struct task_struct *pick_task_idle(struct rq *rq)
+{
+	return rq->idle;
+}
+#endif
+
 struct task_struct *pick_next_task_idle(struct rq *rq)
 {
 	struct task_struct *next = rq->idle;
@@ -472,6 +479,7 @@ const struct sched_class idle_sched_class
 
 #ifdef CONFIG_SMP
 	.balance		= balance_idle,
+	.pick_task		= pick_task_idle,
 	.select_task_rq		= select_task_rq_idle,
 	.set_cpus_allowed	= set_cpus_allowed_common,
 #endif
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e57fca05b660..a5851c775270 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1624,7 +1624,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
 	return rt_task_of(rt_se);
 }
 
-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *pick_task_rt(struct rq *rq)
 {
 	struct task_struct *p;
 
@@ -1632,7 +1632,16 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
 		return NULL;
 
 	p = _pick_next_task_rt(rq);
-	set_next_task_rt(rq, p, true);
+
+	return p;
+}
+
+static struct task_struct *pick_next_task_rt(struct rq *rq)
+{
+	struct task_struct *p = pick_task_rt(rq);
+	if (p)
+		set_next_task_rt(rq, p, true);
+
 	return p;
 }
 
@@ -2443,6 +2452,7 @@ const struct sched_class rt_sched_class
 
 #ifdef CONFIG_SMP
 	.balance		= balance_rt,
+	.pick_task		= pick_task_rt,
 	.select_task_rq		= select_task_rq_rt,
 	.set_cpus_allowed	= set_cpus_allowed_common,
 	.rq_online		= rq_online_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 587ebabebaff..54bfac702805 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1800,6 +1800,9 @@ struct sched_class {
 
 #ifdef CONFIG_SMP
 	int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
+
+	struct task_struct * (*pick_task)(struct rq *rq);
+
 	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
 	void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 394bc8126a1e..8f92915dd95e 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -34,15 +34,23 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool fir
 	stop->se.exec_start = rq_clock_task(rq);
 }
 
-static struct task_struct *pick_next_task_stop(struct rq *rq)
+static struct task_struct *pick_task_stop(struct rq *rq)
 {
 	if (!sched_stop_runnable(rq))
 		return NULL;
 
-	set_next_task_stop(rq, rq->stop, true);
 	return rq->stop;
 }
 
+static struct task_struct *pick_next_task_stop(struct rq *rq)
+{
+	struct task_struct *p = pick_task_stop(rq);
+	if (p)
+		set_next_task_stop(rq, p, true);
+
+	return p;
+}
+
 static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
 {
@@ -124,6 +132,7 @@ const struct sched_class stop_sched_class
 
 #ifdef CONFIG_SMP
 	.balance		= balance_stop,
+	.pick_task		= pick_task_stop,
 	.select_task_rq		= select_task_rq_stop,
 	.set_cpus_allowed	= set_cpus_allowed_common,
 #endif
--
2.29.0.rc1.297.gfa9743e501-goog
On 2020/10/20 9:43, Joel Fernandes (Google) wrote:
> [...]
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index dbd9368a959d..bd6aed63f5e3 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4450,7 +4450,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
>  	 * Avoid running the skip buddy, if running something else can
>  	 * be done without getting too unfair.
>  	 */
> -	if (cfs_rq->skip == se) {
> +	if (cfs_rq->skip && cfs_rq->skip == se) {
>  		struct sched_entity *second;
> 
>  		if (se == curr) {
> @@ -6976,6 +6976,35 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
>  	set_last_buddy(se);
>  }
> 
> +#ifdef CONFIG_SMP
> +static struct task_struct *pick_task_fair(struct rq *rq)
> +{
> +	struct cfs_rq *cfs_rq = &rq->cfs;
> +	struct sched_entity *se;
> +
> +	if (!cfs_rq->nr_running)
> +		return NULL;
> +
> +	do {
> +		struct sched_entity *curr = cfs_rq->curr;
> +
> +		se = pick_next_entity(cfs_rq, NULL);
> +
> +		if (curr) {
> +			if (se && curr->on_rq)
> +				update_curr(cfs_rq);
> +
> +			if (!se || entity_before(curr, se))
> +				se = curr;
> +		}
> +
> +		cfs_rq = group_cfs_rq(se);
> +	} while (cfs_rq);
> +
> +	return task_of(se);
> +}
> +#endif
One of my machines hangs when I run uperf, with only one message:
[  719.034962] BUG: kernel NULL pointer dereference, address: 0000000000000050
I then reproduced the problem on another of my machines (no serial console);
here is the stack, copied out by hand.
Call Trace:
pick_next_entity+0xb0/0x160
pick_task_fair+0x4b/0x90
__schedule+0x59b/0x12f0
schedule_idle+0x1e/0x40
do_idle+0x193/0x2d0
cpu_startup_entry+0x19/0x20
start_secondary+0x110/0x150
secondary_startup_64_no_verify+0xa6/0xab
> [...]
On Thu, Oct 22, 2020 at 12:59 AM Li, Aubrey <[email protected]> wrote:
> [...]
> One of my machines hangs when I run uperf, with only one message:
> [  719.034962] BUG: kernel NULL pointer dereference, address: 0000000000000050
>
> I then reproduced the problem on another of my machines (no serial console);
> here is the stack, copied out by hand.
>
> Call Trace:
> pick_next_entity+0xb0/0x160
> pick_task_fair+0x4b/0x90
> __schedule+0x59b/0x12f0
> schedule_idle+0x1e/0x40
> do_idle+0x193/0x2d0
> cpu_startup_entry+0x19/0x20
> start_secondary+0x110/0x150
> secondary_startup_64_no_verify+0xa6/0xab
Interesting. Wondering if we screwed something up in the rebase.
Questions:
1. Does the issue happen if you apply patches only up until this one,
or only with the entire series?
2. Do you see the issue in v7? Not much, if anything, has changed in this
part of the code from v7 to v8, but it could be something in the newer
kernel.
We tested this series heavily after the rebase, so it is indeed strange to
see this so late.
- Joel
> [...]
On 2020/10/22 23:25, Joel Fernandes wrote:
> [...]
> Interesting. Wondering if we screwed something up in the rebase.
>
> Questions:
> 1. Does the issue happen if you apply patches only up until this one,
> or only with the entire series?
I applied the entire series, and just picked the related patch to report the
issue against.
> 2. Do you see the issue in v7? Not much, if anything, has changed in this
> part of the code from v7 to v8, but it could be something in the newer
> kernel.
>
IIRC, I can run uperf successfully on v7.
I'm on tip/master 2d3e8c9424c9 (origin/master) "Merge branch 'linus'."
Please let me know if this is a problem, or if you have a repo I can pull
for testing.
> We tested this series heavily after the rebase, so it is indeed strange to
> see this so late.
Cc Hongyu - maybe we can run the test cases we have on hand before the next release.
Thanks,
-Aubrey
On Fri, Oct 23, 2020 at 01:25:38PM +0800, Li, Aubrey wrote:
> [...]
> >
> > Interesting. Wondering if we screwed something up in the rebase.
> >
> > Questions:
> > 1. Does the issue happen if you apply patches only up until this one,
> > or only with the entire series?
>
> I applied the entire series, and just picked the related patch to report the
> issue against.
Ok.
> > 2. Do you see the issue in v7? Not much, if anything, has changed in this
> > part of the code from v7 to v8, but it could be something in the newer
> > kernel.
> >
>
> IIRC, I can run uperf successfully on v7.
> I'm on tip/master 2d3e8c9424c9 (origin/master) "Merge branch 'linus'."
> Please let me know if this is a problem, or if you have a repo I can pull
> for testing.
Here is a repo with the v8 series on top of the v5.9 release:
https://git.kernel.org/pub/scm/linux/kernel/git/jfern/linux.git/log/?h=coresched-v5.9
thanks,
- Joel
On 2020/10/24 5:47, Joel Fernandes wrote:
> [...]
> Here is a repo with the v8 series on top of the v5.9 release:
> https://git.kernel.org/pub/scm/linux/kernel/git/jfern/linux.git/log/?h=coresched-v5.9
I didn't see the NULL pointer dereference BUG with this repo; I will post performance
data later.
Thanks,
-Aubrey
On 10/24/20 7:10 AM, Vineeth Pillai wrote:
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 93a3b874077d..4cae5ac48b60 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4428,12 +4428,14 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
>  			se = second;
>  	}
> 
> -	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
> +	if (left && cfs_rq->next &&
> +			wakeup_preempt_entity(cfs_rq->next, left) < 1) {
>  		/*
>  		 * Someone really wants this to run. If it's not unfair, run it.
>  		 */
>  		se = cfs_rq->next;
> -	} else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
> +	} else if (left && cfs_rq->last &&
> +			wakeup_preempt_entity(cfs_rq->last, left) < 1) {
>  		/*
>  		 * Prefer last buddy, try to return the CPU to a preempted task.
>
> The reason for left being NULL needs to be investigated. This has been
> there since v1 and we have not yet gotten to it. I shall try to debug it
> later this week.
Thinking more about it and looking at the crash, I think that
'left == NULL' can happen in pick_next_entity() with core scheduling.
If a cfs_rq has only one task and that task is running, then it will have
been dequeued, and 'left = __pick_first_entity()' will be NULL as the
cfs_rq's rbtree will be empty. This would not happen outside of coresched,
because we never call pick_task() before put_prev_task(), which
will enqueue the task back.
With core scheduling, a cpu can call pick_task() for its sibling while
the sibling is still running the active task and put_prev_task() has
not yet been called. This can result in 'left == NULL'. So I think the
above fix is appropriate when core scheduling is active. It could be
cleaned up a bit though.
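For reference, here is my attempt at a commented walkthrough of the failing
sequence, as I read the v5.9 fair.c with the v8 series applied (a sketch for
illustration only, not a patch):

	/*
	 * CPU1 picks remotely for CPU2's rq, whose only CFS task T is
	 * still current:
	 *
	 *   pick_task_fair(rq)
	 *     -> pick_next_entity(cfs_rq, NULL)	/* curr passed as NULL */
	 *
	 * Inside pick_next_entity():
	 */
	struct sched_entity *left = __pick_first_entity(cfs_rq);

	/*
	 * T was taken off the rbtree by set_next_entity() when it started
	 * running, and put_prev_entity() has not run because the pick is
	 * remote, so the rbtree is empty and left == NULL.
	 */
	if (!left || (curr && entity_before(curr, left)))
		left = curr;	/* curr == NULL here, so left stays NULL */

	se = left;		/* se == NULL */

	/*
	 * Without the 'left &&' checks from the fix above, a set next/last
	 * buddy then leads to wakeup_preempt_entity(cfs_rq->next, left),
	 * which dereferences the NULL 'left': the reported oops.
	 */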
Thanks,
Vineeth
Hi Aubrey,
On 10/23/20 10:48 PM, Li, Aubrey wrote:
> [...]
> I didn't see the NULL pointer dereference BUG with this repo; I will post
> performance data later.
There has been a change in tip to pick_next_entity() which caused the
removal of a coresched-related change that fixed this crash. Could
you please try this patch on top of the posted v8 and see if it
works?
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 93a3b874077d..4cae5ac48b60 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4428,12 +4428,14 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 			se = second;
 	}
 
-	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
+	if (left && cfs_rq->next &&
+			wakeup_preempt_entity(cfs_rq->next, left) < 1) {
 		/*
 		 * Someone really wants this to run. If it's not unfair, run it.
 		 */
 		se = cfs_rq->next;
-	} else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
+	} else if (left && cfs_rq->last &&
+			wakeup_preempt_entity(cfs_rq->last, left) < 1) {
 		/*
 		 * Prefer last buddy, try to return the CPU to a preempted task.
The reason for left being NULL needs to be investigated. This has been
there since v1 and we have not yet gotten to it. I shall try to debug it
later this week.
Kindly test this patch and let us know if it fixes the crash.
Thanks,
Vineeth
On 2020/10/24 20:27, Vineeth Pillai wrote:
> [...]
> Thinking more about it and looking at the crash, I think that
> 'left == NULL' can happen in pick_next_entity() with core scheduling.
> If a cfs_rq has only one task and that task is running, then it will have
> been dequeued, and 'left = __pick_first_entity()' will be NULL as the
> cfs_rq's rbtree will be empty. This would not happen outside of coresched,
> because we never call pick_task() before put_prev_task(), which
> will enqueue the task back.
>
> With core scheduling, a cpu can call pick_task() for its sibling while
> the sibling is still running the active task and put_prev_task() has
> not yet been called. This can result in 'left == NULL'. So I think the
> above fix is appropriate when core scheduling is active. It could be
> cleaned up a bit though.
This patch works, thanks Vineeth for the quick fix!
On Sat, Oct 24, 2020 at 08:27:16AM -0400, Vineeth Pillai wrote:
> [...]
> Thinking more about it and looking at the crash, I think that
> 'left == NULL' can happen in pick_next_entity() with core scheduling.
> If a cfs_rq has only one task and that task is running, then it will have
> been dequeued, and 'left = __pick_first_entity()' will be NULL as the
> cfs_rq will be empty. This would not happen outside of coresched,
> because we never call pick_task() before put_prev_task(), which
> will enqueue the task back.
>
> With core scheduling, a cpu can call pick_task() for its sibling while
> the sibling is still running the active task and put_prev_task() has
> not yet been called. This can result in 'left == NULL'.
Quite correct. Hurmph.. the reason we do this is because... we do the
update_curr() the wrong way around. And I can't seem to remember why we
do that (it was in my original patches).
Something like so seems the obvious thing to do, but I can't seem to
remember why we're not doing it :-(
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6950,15 +6950,10 @@ static struct task_struct *pick_task_fai
 	do {
 		struct sched_entity *curr = cfs_rq->curr;
 
-		se = pick_next_entity(cfs_rq, NULL);
+		if (curr && curr->on_rq)
+			update_curr(cfs_rq);
 
-		if (curr) {
-			if (se && curr->on_rq)
-				update_curr(cfs_rq);
-
-			if (!se || entity_before(curr, se))
-				se = curr;
-		}
+		se = pick_next_entity(cfs_rq, curr);
 
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
On 2020/10/26 17:01, Peter Zijlstra wrote:
> [...]
> Quite correct. Hurmph.. the reason we do this is because... we do the
> update_curr() the wrong way around. And I can't seem to remember why we
> do that (it was in my original patches).
>
> Something like so seems the obvious thing to do, but I can't seem to
> remember why we're not doing it :-(
> [...]
This patch works too for my benchmark, thanks Peter!
On Sat, Oct 24, 2020 at 08:27:16AM -0400, Vineeth Pillai wrote:
> [...]
> Thinking more about it and looking at the crash, I think that
> 'left == NULL' can happen in pick_next_entity() with core scheduling.
> If a cfs_rq has only one task and that task is running, then it will have
> been dequeued, and 'left = __pick_first_entity()' will be NULL as the
> cfs_rq will be empty. This would not happen outside of coresched,
> because we never call pick_task() before put_prev_task(), which
> will enqueue the task back.
>
> With core scheduling, a cpu can call pick_task() for its sibling while
> the sibling is still running the active task and put_prev_task() has
> not yet been called. This can result in 'left == NULL'. So I think the
> above fix is appropriate when core scheduling is active. It could be
> cleaned up a bit though.
Thanks a lot Vineeth!
Just to add: pick_next_entity() returning NULL still lets pick_task_fair()
behave correctly, because pick_task_fair() will return 'curr' to the
core-wide picking code, not NULL. The problem is that CFS's pick_next_entity()
itself cannot handle a NULL 'left' and crashes; as Vineeth found, the below
hunk got lost in the rebase:
@@ -4464,13 +4464,13 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 	/*
 	 * Prefer last buddy, try to return the CPU to a preempted task.
 	 */
-	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
+	if (left && cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
 		se = cfs_rq->last;
 
 	/*
 	 * Someone really wants this to run. If it's not unfair, run it.
 	 */
-	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+	if (left && cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
 		se = cfs_rq->next;
 
 	clear_buddies(cfs_rq, se);
Peter's fix in the other email looks good and I will include that for testing
before the next posting.
thanks,
- Joel
On Tue, Oct 27, 2020 at 10:19:11AM -0400, Joel Fernandes wrote:
> On Mon, Oct 26, 2020 at 10:01:31AM +0100, Peter Zijlstra wrote:
> > [...]
> > Quite correct. Hurmph.. the reason we do this is because... we do the
> > update_curr() the wrong way around. And I can't seem to remember why we
> > do that (it was in my original patches).
> >
> > Something like so seems the obvious thing to do, but I can't seem to
> > remember why we're not doing it :-(
>
> The code below is just a refactor and not a functional change though, right?
>
> i.e., we already end up with se = curr when pick_next_entity() returns NULL.
>
> But the advantage of your refactor is that it doesn't crash the kernel.
>
> So your change appears safe to me, unless I missed something.
I included it as the patch appended below for testing; hopefully the commit
message is appropriate.
On a related note, this pattern is very similar to pick_next_task_fair()'s
!simple case. Over there it does check_cfs_rq_runtime() to throttle the
cfs_rq. Should we also be doing that in pick_task_fair()?
This bit:
	/*
	 * This call to check_cfs_rq_runtime() will do the
	 * throttle and dequeue its entity in the parent(s).
	 * Therefore the nr_running test will indeed
	 * be correct.
	 */
	if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
		cfs_rq = &rq->cfs;

		if (!cfs_rq->nr_running)
			goto idle;

		goto simple;
	}
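For discussion, a rough sketch of what folding that throttling check into
pick_task_fair() might look like, on top of Peter's refactor. The
restart-from-root structure is my assumption and is untested, not part of
the posted series:

static struct task_struct *pick_task_fair(struct rq *rq)
{
	struct sched_entity *se;
	struct cfs_rq *cfs_rq;

again:
	cfs_rq = &rq->cfs;
	if (!cfs_rq->nr_running)
		return NULL;

	do {
		struct sched_entity *curr = cfs_rq->curr;

		/* A remote pick will not have done put_prev_entity() yet. */
		if (curr) {
			if (curr->on_rq)
				update_curr(cfs_rq);
			else
				curr = NULL;

			/*
			 * check_cfs_rq_runtime() may throttle this cfs_rq
			 * and dequeue entities in the parent(s), so restart
			 * the walk from the root.
			 */
			if (unlikely(check_cfs_rq_runtime(cfs_rq)))
				goto again;
		}

		se = pick_next_entity(cfs_rq, curr);
		cfs_rq = group_cfs_rq(se);
	} while (cfs_rq);

	return task_of(se);
}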
---8<-----------------------
From: Peter Zijlstra <[email protected]>
Subject: [PATCH] sched/fair: Fix pick_task_fair crashes due to empty rbtree
pick_next_entity() is passed curr == NULL during core-scheduling. Due to
this, if the rbtree is empty, the 'left' variable is set to NULL within
the function. This can cause crashes within the function.
This is not an issue if put_prev_task() is invoked on the currently
running task before calling pick_next_entity(). However, in core
scheduling, it is possible that a sibling CPU picks for another RQ in
the core, via pick_task_fair(). This remote sibling would not get any
opportunities to do a put_prev_task().
Fix it by refactoring pick_task_fair() such that pick_next_entity() is
called with the cfs_rq->curr. This will prevent pick_next_entity() from
crashing if its rbtree is empty.
Suggested-by: Vineeth Remanan Pillai <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Signed-off-by: Joel Fernandes (Google) <[email protected]>
---
kernel/sched/fair.c | 11 +++--------
1 file changed, 3 insertions(+), 8 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 93a3b874077d..591859016263 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6975,15 +6975,10 @@ static struct task_struct *pick_task_fair(struct rq *rq)
 	do {
 		struct sched_entity *curr = cfs_rq->curr;
 
-		se = pick_next_entity(cfs_rq, NULL);
-
-		if (curr) {
-			if (se && curr->on_rq)
-				update_curr(cfs_rq);
+		if (curr && curr->on_rq)
+			update_curr(cfs_rq);
 
-			if (!se || entity_before(curr, se))
-				se = curr;
-		}
+		se = pick_next_entity(cfs_rq, curr);
 
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
--
2.29.0.rc2.309.g374f81d7ae-goog
On Mon, Oct 26, 2020 at 10:01:31AM +0100, Peter Zijlstra wrote:
> On Sat, Oct 24, 2020 at 08:27:16AM -0400, Vineeth Pillai wrote:
> > [...]
> Quite correct. Hurmph.. the reason we do this is because... we do the
> update_curr() the wrong way around. And I can't seem to remember why we
> do that (it was in my original patches).
>
> Something like so seems the obvious thing to do, but I can't seem to
> remember why we're not doing it :-(
The code below is just a refactor and not a functional change though, right?
i.e., we already end up with se = curr when pick_next_entity() returns NULL.
But the advantage of your refactor is that it doesn't crash the kernel.
So your change appears safe to me, unless I missed something.
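To spell out my reading of the two variants for the empty-rbtree, remote-pick
case (a control-flow sketch only, not tested):

	/*
	 * rbtree empty, cfs_rq->curr == T (still running), remote pick:
	 *
	 * v8 code:  se = pick_next_entity(cfs_rq, NULL);
	 *             left = __pick_first_entity(cfs_rq) == NULL
	 *             left stays NULL (the curr argument is NULL)
	 *             buddy checks call wakeup_preempt_entity(..., left)
	 *               -> NULL dereference (the reported crash)
	 *             had it survived, the caller would fall back to
	 *             se = curr == T
	 *
	 * refactor: se = pick_next_entity(cfs_rq, T);
	 *             left = __pick_first_entity(cfs_rq) == NULL
	 *             left = curr == T via the '!left' fallback
	 *             buddy checks compare against T, no NULL dereference
	 *             -> returns T: the same pick, without the crash
	 */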
thanks,
- Joel
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6950,15 +6950,10 @@ static struct task_struct *pick_task_fai
>  	do {
>  		struct sched_entity *curr = cfs_rq->curr;
> 
> -		se = pick_next_entity(cfs_rq, NULL);
> +		if (curr && curr->on_rq)
> +			update_curr(cfs_rq);
> 
> -		if (curr) {
> -			if (se && curr->on_rq)
> -				update_curr(cfs_rq);
> -
> -			if (!se || entity_before(curr, se))
> -				se = curr;
> -		}
> +		se = pick_next_entity(cfs_rq, curr);
> 
>  		cfs_rq = group_cfs_rq(se);
>  	} while (cfs_rq);