2020-11-17 23:27:27

by Joel Fernandes

Subject: [PATCH -tip 02/32] sched: Introduce sched_class::pick_task()

From: Peter Zijlstra <[email protected]>

Because sched_class::pick_next_task() also implies
sched_class::set_next_task() (and possibly put_prev_task() and
newidle_balance) it is not state invariant. This makes it unsuitable
for remote task selection.
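
For context on "remote task selection": the idea is that the new hook can be
called on a runqueue (including another CPU's) purely to ask what it would run
next, without committing to that choice. A rough sketch of how such a caller
might look (the helper name and exact shape here are illustrative, not taken
from this patch):

	/*
	 * Illustrative only: ask @rq which task it would run next, without
	 * changing any of its state. Unlike pick_next_task(), ->pick_task()
	 * must not call set_next_task(), put_prev_task() or newidle_balance(),
	 * so it can be used on a remote runqueue (with the proper locks held).
	 */
	static struct task_struct *pick_task(struct rq *rq)
	{
		const struct sched_class *class;
		struct task_struct *p;

		for_each_class(class) {
			p = class->pick_task(rq);
			if (p)
				return p;
		}

		BUG(); /* The idle class should always have a runnable task. */
	}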

Tested-by: Julien Desfossez <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Vineeth Remanan Pillai <[email protected]>
Signed-off-by: Julien Desfossez <[email protected]>
Signed-off-by: Joel Fernandes (Google) <[email protected]>
---
kernel/sched/deadline.c | 16 ++++++++++++++--
kernel/sched/fair.c | 32 +++++++++++++++++++++++++++++++-
kernel/sched/idle.c | 8 ++++++++
kernel/sched/rt.c | 15 +++++++++++++--
kernel/sched/sched.h | 3 +++
kernel/sched/stop_task.c | 14 ++++++++++++--
6 files changed, 81 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0f2ea0a3664c..abfc8b505d0d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1867,7 +1867,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
return rb_entry(left, struct sched_dl_entity, rb_node);
}

-static struct task_struct *pick_next_task_dl(struct rq *rq)
+static struct task_struct *pick_task_dl(struct rq *rq)
{
struct sched_dl_entity *dl_se;
struct dl_rq *dl_rq = &rq->dl;
@@ -1879,7 +1879,18 @@ static struct task_struct *pick_next_task_dl(struct rq *rq)
dl_se = pick_next_dl_entity(rq, dl_rq);
BUG_ON(!dl_se);
p = dl_task_of(dl_se);
- set_next_task_dl(rq, p, true);
+
+ return p;
+}
+
+static struct task_struct *pick_next_task_dl(struct rq *rq)
+{
+ struct task_struct *p;
+
+ p = pick_task_dl(rq);
+ if (p)
+ set_next_task_dl(rq, p, true);
+
return p;
}

@@ -2551,6 +2562,7 @@ DEFINE_SCHED_CLASS(dl) = {

#ifdef CONFIG_SMP
.balance = balance_dl,
+ .pick_task = pick_task_dl,
.select_task_rq = select_task_rq_dl,
.migrate_task_rq = migrate_task_rq_dl,
.set_cpus_allowed = set_cpus_allowed_dl,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 52ddfec7cea6..12cf068eeec8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4459,7 +4459,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
* Avoid running the skip buddy, if running something else can
* be done without getting too unfair.
*/
- if (cfs_rq->skip == se) {
+ if (cfs_rq->skip && cfs_rq->skip == se) {
struct sched_entity *second;

if (se == curr) {
@@ -7017,6 +7017,35 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
set_last_buddy(se);
}

+#ifdef CONFIG_SMP
+static struct task_struct *pick_task_fair(struct rq *rq)
+{
+ struct cfs_rq *cfs_rq = &rq->cfs;
+ struct sched_entity *se;
+
+ if (!cfs_rq->nr_running)
+ return NULL;
+
+ do {
+ struct sched_entity *curr = cfs_rq->curr;
+
+ se = pick_next_entity(cfs_rq, NULL);
+
+ if (curr) {
+ if (se && curr->on_rq)
+ update_curr(cfs_rq);
+
+ if (!se || entity_before(curr, se))
+ se = curr;
+ }
+
+ cfs_rq = group_cfs_rq(se);
+ } while (cfs_rq);
+
+ return task_of(se);
+}
+#endif
+
struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
@@ -11219,6 +11248,7 @@ DEFINE_SCHED_CLASS(fair) = {

#ifdef CONFIG_SMP
.balance = balance_fair,
+ .pick_task = pick_task_fair,
.select_task_rq = select_task_rq_fair,
.migrate_task_rq = migrate_task_rq_fair,

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 50e128b899c4..33864193a2f9 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -406,6 +406,13 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir
schedstat_inc(rq->sched_goidle);
}

+#ifdef CONFIG_SMP
+static struct task_struct *pick_task_idle(struct rq *rq)
+{
+ return rq->idle;
+}
+#endif
+
struct task_struct *pick_next_task_idle(struct rq *rq)
{
struct task_struct *next = rq->idle;
@@ -473,6 +480,7 @@ DEFINE_SCHED_CLASS(idle) = {

#ifdef CONFIG_SMP
.balance = balance_idle,
+ .pick_task = pick_task_idle,
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a6f9d132c24f..a0e245b0c4bd 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1626,7 +1626,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
return rt_task_of(rt_se);
}

-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *pick_task_rt(struct rq *rq)
{
struct task_struct *p;

@@ -1634,7 +1634,17 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
return NULL;

p = _pick_next_task_rt(rq);
- set_next_task_rt(rq, p, true);
+
+ return p;
+}
+
+static struct task_struct *pick_next_task_rt(struct rq *rq)
+{
+ struct task_struct *p = pick_task_rt(rq);
+
+ if (p)
+ set_next_task_rt(rq, p, true);
+
return p;
}

@@ -2483,6 +2493,7 @@ DEFINE_SCHED_CLASS(rt) = {

#ifdef CONFIG_SMP
.balance = balance_rt,
+ .pick_task = pick_task_rt,
.select_task_rq = select_task_rq_rt,
.set_cpus_allowed = set_cpus_allowed_common,
.rq_online = rq_online_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f794c9337047..5a0dd2b312aa 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1839,6 +1839,9 @@ struct sched_class {
#ifdef CONFIG_SMP
int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
+
+ struct task_struct * (*pick_task)(struct rq *rq);
+
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);

void (*task_woken)(struct rq *this_rq, struct task_struct *task);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 55f39125c0e1..f988ebe3febb 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -34,15 +34,24 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool fir
stop->se.exec_start = rq_clock_task(rq);
}

-static struct task_struct *pick_next_task_stop(struct rq *rq)
+static struct task_struct *pick_task_stop(struct rq *rq)
{
if (!sched_stop_runnable(rq))
return NULL;

- set_next_task_stop(rq, rq->stop, true);
return rq->stop;
}

+static struct task_struct *pick_next_task_stop(struct rq *rq)
+{
+ struct task_struct *p = pick_task_stop(rq);
+
+ if (p)
+ set_next_task_stop(rq, p, true);
+
+ return p;
+}
+
static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
@@ -123,6 +132,7 @@ DEFINE_SCHED_CLASS(stop) = {

#ifdef CONFIG_SMP
.balance = balance_stop,
+ .pick_task = pick_task_stop,
.select_task_rq = select_task_rq_stop,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
--
2.29.2.299.gdc1121823c-goog


2020-11-19 23:59:36

by Balbir Singh

Subject: Re: [PATCH -tip 02/32] sched: Introduce sched_class::pick_task()

On 18/11/20 10:19 am, Joel Fernandes (Google) wrote:
> From: Peter Zijlstra <[email protected]>
>
> Because sched_class::pick_next_task() also implies
> sched_class::set_next_task() (and possibly put_prev_task() and
> newidle_balance) it is not state invariant. This makes it unsuitable
> for remote task selection.
>

The change makes sense; a small suggestion below.

> Tested-by: Julien Desfossez <[email protected]>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> Signed-off-by: Vineeth Remanan Pillai <[email protected]>
> Signed-off-by: Julien Desfossez <[email protected]>
> Signed-off-by: Joel Fernandes (Google) <[email protected]>
> ---
> kernel/sched/deadline.c | 16 ++++++++++++++--
> kernel/sched/fair.c | 32 +++++++++++++++++++++++++++++++-
> kernel/sched/idle.c | 8 ++++++++
> kernel/sched/rt.c | 15 +++++++++++++--
> kernel/sched/sched.h | 3 +++
> kernel/sched/stop_task.c | 14 ++++++++++++--
> 6 files changed, 81 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 0f2ea0a3664c..abfc8b505d0d 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -1867,7 +1867,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
> return rb_entry(left, struct sched_dl_entity, rb_node);
> }
>
> -static struct task_struct *pick_next_task_dl(struct rq *rq)
> +static struct task_struct *pick_task_dl(struct rq *rq)
> {
> struct sched_dl_entity *dl_se;
> struct dl_rq *dl_rq = &rq->dl;
> @@ -1879,7 +1879,18 @@ static struct task_struct *pick_next_task_dl(struct rq *rq)
> dl_se = pick_next_dl_entity(rq, dl_rq);
> BUG_ON(!dl_se);
> p = dl_task_of(dl_se);
> - set_next_task_dl(rq, p, true);
> +
> + return p;
> +}
> +
> +static struct task_struct *pick_next_task_dl(struct rq *rq)
> +{
> + struct task_struct *p;
> +
> + p = pick_task_dl(rq);
> + if (p)
> + set_next_task_dl(rq, p, true);
> +
> return p;
> }
>
> @@ -2551,6 +2562,7 @@ DEFINE_SCHED_CLASS(dl) = {
>
> #ifdef CONFIG_SMP
> .balance = balance_dl,
> + .pick_task = pick_task_dl,
> .select_task_rq = select_task_rq_dl,
> .migrate_task_rq = migrate_task_rq_dl,
> .set_cpus_allowed = set_cpus_allowed_dl,
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 52ddfec7cea6..12cf068eeec8 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4459,7 +4459,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
> * Avoid running the skip buddy, if running something else can
> * be done without getting too unfair.
> */
> - if (cfs_rq->skip == se) {
> + if (cfs_rq->skip && cfs_rq->skip == se) {
> struct sched_entity *second;
>
> if (se == curr) {
> @@ -7017,6 +7017,35 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
> set_last_buddy(se);
> }
>
> +#ifdef CONFIG_SMP
> +static struct task_struct *pick_task_fair(struct rq *rq)
> +{
> + struct cfs_rq *cfs_rq = &rq->cfs;
> + struct sched_entity *se;
> +
> + if (!cfs_rq->nr_running)
> + return NULL;
> +
> + do {
> + struct sched_entity *curr = cfs_rq->curr;
> +
> + se = pick_next_entity(cfs_rq, NULL);
> +
> + if (curr) {
> + if (se && curr->on_rq)
> + update_curr(cfs_rq);
> +
> + if (!se || entity_before(curr, se))
> + se = curr;
> + }

Do we want to optimize this a bit?

if (curr) {
	if (!se || entity_before(curr, se))
		se = curr;

	if ((se != curr) && curr->on_rq)
		update_curr(cfs_rq);
}

Balbir

2020-11-20 17:03:18

by Joel Fernandes

Subject: Re: [PATCH -tip 02/32] sched: Introduce sched_class::pick_task()

On Fri, Nov 20, 2020 at 10:56:09AM +1100, Singh, Balbir wrote:
[..]
> > +#ifdef CONFIG_SMP
> > +static struct task_struct *pick_task_fair(struct rq *rq)
> > +{
> > + struct cfs_rq *cfs_rq = &rq->cfs;
> > + struct sched_entity *se;
> > +
> > + if (!cfs_rq->nr_running)
> > + return NULL;
> > +
> > + do {
> > + struct sched_entity *curr = cfs_rq->curr;
> > +
> > + se = pick_next_entity(cfs_rq, NULL);
> > +
> > + if (curr) {
> > + if (se && curr->on_rq)
> > + update_curr(cfs_rq);
> > +
> > + if (!se || entity_before(curr, se))
> > + se = curr;
> > + }
>
> > Do we want to optimize this a bit?
>
> > if (curr) {
> > 	if (!se || entity_before(curr, se))
> > 		se = curr;
> >
> > 	if ((se != curr) && curr->on_rq)
> > 		update_curr(cfs_rq);
> > }

Do you see a difference in codegen? What's the optimization?

Also in later patches this code changes, so it should not matter:
See: 3e0838fa3c51 ("sched/fair: Fix pick_task_fair crashes due to empty rbtree")

thanks,

- Joel

2020-11-25 16:32:45

by Vincent Guittot

Subject: Re: [PATCH -tip 02/32] sched: Introduce sched_class::pick_task()

On Wed, 18 Nov 2020 at 00:20, Joel Fernandes (Google)
<[email protected]> wrote:
>
> From: Peter Zijlstra <[email protected]>
>
> Because sched_class::pick_next_task() also implies
> sched_class::set_next_task() (and possibly put_prev_task() and
> newidle_balance) it is not state invariant. This makes it unsuitable
> for remote task selection.
>
> Tested-by: Julien Desfossez <[email protected]>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> Signed-off-by: Vineeth Remanan Pillai <[email protected]>
> Signed-off-by: Julien Desfossez <[email protected]>
> Signed-off-by: Joel Fernandes (Google) <[email protected]>
> ---
> kernel/sched/deadline.c | 16 ++++++++++++++--
> kernel/sched/fair.c | 32 +++++++++++++++++++++++++++++++-
> kernel/sched/idle.c | 8 ++++++++
> kernel/sched/rt.c | 15 +++++++++++++--
> kernel/sched/sched.h | 3 +++
> kernel/sched/stop_task.c | 14 ++++++++++++--
> 6 files changed, 81 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 0f2ea0a3664c..abfc8b505d0d 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -1867,7 +1867,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
> return rb_entry(left, struct sched_dl_entity, rb_node);
> }
>
> -static struct task_struct *pick_next_task_dl(struct rq *rq)
> +static struct task_struct *pick_task_dl(struct rq *rq)
> {
> struct sched_dl_entity *dl_se;
> struct dl_rq *dl_rq = &rq->dl;
> @@ -1879,7 +1879,18 @@ static struct task_struct *pick_next_task_dl(struct rq *rq)
> dl_se = pick_next_dl_entity(rq, dl_rq);
> BUG_ON(!dl_se);
> p = dl_task_of(dl_se);
> - set_next_task_dl(rq, p, true);
> +
> + return p;
> +}
> +
> +static struct task_struct *pick_next_task_dl(struct rq *rq)
> +{
> + struct task_struct *p;
> +
> + p = pick_task_dl(rq);
> + if (p)
> + set_next_task_dl(rq, p, true);
> +
> return p;
> }
>
> @@ -2551,6 +2562,7 @@ DEFINE_SCHED_CLASS(dl) = {
>
> #ifdef CONFIG_SMP
> .balance = balance_dl,
> + .pick_task = pick_task_dl,
> .select_task_rq = select_task_rq_dl,
> .migrate_task_rq = migrate_task_rq_dl,
> .set_cpus_allowed = set_cpus_allowed_dl,
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 52ddfec7cea6..12cf068eeec8 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4459,7 +4459,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
> * Avoid running the skip buddy, if running something else can
> * be done without getting too unfair.
> */
> - if (cfs_rq->skip == se) {
> + if (cfs_rq->skip && cfs_rq->skip == se) {
> struct sched_entity *second;
>
> if (se == curr) {
> @@ -7017,6 +7017,35 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
> set_last_buddy(se);
> }
>
> +#ifdef CONFIG_SMP
> +static struct task_struct *pick_task_fair(struct rq *rq)
> +{
> + struct cfs_rq *cfs_rq = &rq->cfs;
> + struct sched_entity *se;
> +
> + if (!cfs_rq->nr_running)
> + return NULL;
> +
> + do {
> + struct sched_entity *curr = cfs_rq->curr;
> +
> + se = pick_next_entity(cfs_rq, NULL);

Calling pick_next_entity clears buddies. This is fine without
coresched because the se will be the next one. But calling
pick_task_fair doesn't mean that the se will be used.
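
To make the side effect concrete: at this point in the series the tail of
pick_next_entity() looked roughly like the abridged sketch below (the
comparison with curr and the skip/next/last buddy selection are elided), so
even a purely speculative pick would drop the buddy hints on that cfs_rq:

	static struct sched_entity *
	pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
	{
		struct sched_entity *left = __pick_first_entity(cfs_rq);
		struct sched_entity *se;

		/* ... curr comparison and skip/next/last buddy handling elided ... */
		se = left;

		/*
		 * This writes to cfs_rq (it clears the next/last/skip hints);
		 * fine when the returned se is guaranteed to run next, not
		 * fine for a remote pick that core scheduling may discard.
		 */
		clear_buddies(cfs_rq, se);

		return se;
	}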

> +
> + if (curr) {
> + if (se && curr->on_rq)
> + update_curr(cfs_rq);
> +

Shouldn't you check if cfs_rq is throttled?

> + if (!se || entity_before(curr, se))
> + se = curr;
> + }
> +
> + cfs_rq = group_cfs_rq(se);
> + } while (cfs_rq);
> +
> + return task_of(se);
> +}
> +#endif
> +
> struct task_struct *
> pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> {
> @@ -11219,6 +11248,7 @@ DEFINE_SCHED_CLASS(fair) = {
>
> #ifdef CONFIG_SMP
> .balance = balance_fair,
> + .pick_task = pick_task_fair,
> .select_task_rq = select_task_rq_fair,
> .migrate_task_rq = migrate_task_rq_fair,
>
> diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
> index 50e128b899c4..33864193a2f9 100644
> --- a/kernel/sched/idle.c
> +++ b/kernel/sched/idle.c
> @@ -406,6 +406,13 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir
> schedstat_inc(rq->sched_goidle);
> }
>
> +#ifdef CONFIG_SMP
> +static struct task_struct *pick_task_idle(struct rq *rq)
> +{
> + return rq->idle;
> +}
> +#endif
> +
> struct task_struct *pick_next_task_idle(struct rq *rq)
> {
> struct task_struct *next = rq->idle;
> @@ -473,6 +480,7 @@ DEFINE_SCHED_CLASS(idle) = {
>
> #ifdef CONFIG_SMP
> .balance = balance_idle,
> + .pick_task = pick_task_idle,
> .select_task_rq = select_task_rq_idle,
> .set_cpus_allowed = set_cpus_allowed_common,
> #endif
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index a6f9d132c24f..a0e245b0c4bd 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -1626,7 +1626,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
> return rt_task_of(rt_se);
> }
>
> -static struct task_struct *pick_next_task_rt(struct rq *rq)
> +static struct task_struct *pick_task_rt(struct rq *rq)
> {
> struct task_struct *p;
>
> @@ -1634,7 +1634,17 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
> return NULL;
>
> p = _pick_next_task_rt(rq);
> - set_next_task_rt(rq, p, true);
> +
> + return p;
> +}
> +
> +static struct task_struct *pick_next_task_rt(struct rq *rq)
> +{
> + struct task_struct *p = pick_task_rt(rq);
> +
> + if (p)
> + set_next_task_rt(rq, p, true);
> +
> return p;
> }
>
> @@ -2483,6 +2493,7 @@ DEFINE_SCHED_CLASS(rt) = {
>
> #ifdef CONFIG_SMP
> .balance = balance_rt,
> + .pick_task = pick_task_rt,
> .select_task_rq = select_task_rq_rt,
> .set_cpus_allowed = set_cpus_allowed_common,
> .rq_online = rq_online_rt,
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index f794c9337047..5a0dd2b312aa 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1839,6 +1839,9 @@ struct sched_class {
> #ifdef CONFIG_SMP
> int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
> int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
> +
> + struct task_struct * (*pick_task)(struct rq *rq);
> +
> void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
>
> void (*task_woken)(struct rq *this_rq, struct task_struct *task);
> diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
> index 55f39125c0e1..f988ebe3febb 100644
> --- a/kernel/sched/stop_task.c
> +++ b/kernel/sched/stop_task.c
> @@ -34,15 +34,24 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool fir
> stop->se.exec_start = rq_clock_task(rq);
> }
>
> -static struct task_struct *pick_next_task_stop(struct rq *rq)
> +static struct task_struct *pick_task_stop(struct rq *rq)
> {
> if (!sched_stop_runnable(rq))
> return NULL;
>
> - set_next_task_stop(rq, rq->stop, true);
> return rq->stop;
> }
>
> +static struct task_struct *pick_next_task_stop(struct rq *rq)
> +{
> + struct task_struct *p = pick_task_stop(rq);
> +
> + if (p)
> + set_next_task_stop(rq, p, true);
> +
> + return p;
> +}
> +
> static void
> enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
> {
> @@ -123,6 +132,7 @@ DEFINE_SCHED_CLASS(stop) = {
>
> #ifdef CONFIG_SMP
> .balance = balance_stop,
> + .pick_task = pick_task_stop,
> .select_task_rq = select_task_rq_stop,
> .set_cpus_allowed = set_cpus_allowed_common,
> #endif
> --
> 2.29.2.299.gdc1121823c-goog
>

2020-11-26 11:09:25

by Peter Zijlstra

Subject: Re: [PATCH -tip 02/32] sched: Introduce sched_class::pick_task()

On Wed, Nov 25, 2020 at 05:28:36PM +0100, Vincent Guittot wrote:
> On Wed, 18 Nov 2020 at 00:20, Joel Fernandes (Google)

> > +#ifdef CONFIG_SMP
> > +static struct task_struct *pick_task_fair(struct rq *rq)
> > +{
> > + struct cfs_rq *cfs_rq = &rq->cfs;
> > + struct sched_entity *se;
> > +
> > + if (!cfs_rq->nr_running)
> > + return NULL;
> > +
> > + do {
> > + struct sched_entity *curr = cfs_rq->curr;
> > +
> > + se = pick_next_entity(cfs_rq, NULL);
>
> Calling pick_next_entity clears buddies. This is fine without
> coresched because the se will be the next one. But calling
> pick_task_fair doesn't mean that the se will be used.

Urgh, nice one :/

> > +
> > + if (curr) {
> > + if (se && curr->on_rq)
> > + update_curr(cfs_rq);
> > +
>
> Shouldn't you check if cfs_rq is throttled?

Hmm,... I suppose we do.

> > + if (!se || entity_before(curr, se))
> > + se = curr;
> > + }
> > +
> > + cfs_rq = group_cfs_rq(se);
> > + } while (cfs_rq);
> > +
> > + return task_of(se);
> > +}
> > +#endif

Something like so then?

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4354,6 +4354,8 @@ check_preempt_tick(struct cfs_rq *cfs_rq
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ clear_buddies(cfs_rq, se);
+
/* 'current' is not kept within the tree. */
if (se->on_rq) {
/*
@@ -4440,8 +4442,6 @@ pick_next_entity(struct cfs_rq *cfs_rq,
se = cfs_rq->last;
}

- clear_buddies(cfs_rq, se);
-
return se;
}

@@ -6982,20 +6982,29 @@ static void check_preempt_wakeup(struct
#ifdef CONFIG_SMP
static struct task_struct *pick_task_fair(struct rq *rq)
{
- struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
-
+ struct cfs_rq *cfs_rq;
+
+again:
+ cfs_rq = &rq->cfs;
if (!cfs_rq->nr_running)
return NULL;

do {
struct sched_entity *curr = cfs_rq->curr;

- if (curr && curr->on_rq)
- update_curr(cfs_rq);
+ /* When we pick for a remote RQ, we'll not have done put_prev_entity() */
+ if (curr) {
+ if (curr->on_rq)
+ update_curr(cfs_rq);
+ else
+ curr = NULL;

- se = pick_next_entity(cfs_rq, curr);
+ if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+ goto again;
+ }

+ se = pick_next_entity(cfs_rq, curr);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);

2020-11-26 18:56:48

by Balbir Singh

Subject: Re: [PATCH -tip 02/32] sched: Introduce sched_class::pick_task()

On Fri, Nov 20, 2020 at 11:58:54AM -0500, Joel Fernandes wrote:
> On Fri, Nov 20, 2020 at 10:56:09AM +1100, Singh, Balbir wrote:
> [..]
> > > +#ifdef CONFIG_SMP
> > > +static struct task_struct *pick_task_fair(struct rq *rq)
> > > +{
> > > + struct cfs_rq *cfs_rq = &rq->cfs;
> > > + struct sched_entity *se;
> > > +
> > > + if (!cfs_rq->nr_running)
> > > + return NULL;
> > > +
> > > + do {
> > > + struct sched_entity *curr = cfs_rq->curr;
> > > +
> > > + se = pick_next_entity(cfs_rq, NULL);
> > > +
> > > + if (curr) {
> > > + if (se && curr->on_rq)
> > > + update_curr(cfs_rq);
> > > +
> > > + if (!se || entity_before(curr, se))
> > > + se = curr;
> > > + }
> >
> > Do we want to optimize this a bit?
> >
> > if (curr) {
> > 	if (!se || entity_before(curr, se))
> > 		se = curr;
> >
> > 	if ((se != curr) && curr->on_rq)
> > 		update_curr(cfs_rq);
> > }
>
> Do you see a difference in codegen? What's the optimization?
>
> Also in later patches this code changes, so it should not matter:
> See: 3e0838fa3c51 ("sched/fair: Fix pick_task_fair crashes due to empty rbtree")
>

I did see the next patch, but the idea is that we don't need to
update_curr() if the picked sched entity is the same as current (se ==
curr). What are the side-effects of not updating curr when se == curr?

Balbir

2020-11-27 06:29:49

by Vincent Guittot

Subject: Re: [PATCH -tip 02/32] sched: Introduce sched_class::pick_task()

On Thu, 26 Nov 2020 at 10:07, Peter Zijlstra <[email protected]> wrote:
>
> On Wed, Nov 25, 2020 at 05:28:36PM +0100, Vincent Guittot wrote:
> > On Wed, 18 Nov 2020 at 00:20, Joel Fernandes (Google)
>
> > > +#ifdef CONFIG_SMP
> > > +static struct task_struct *pick_task_fair(struct rq *rq)
> > > +{
> > > + struct cfs_rq *cfs_rq = &rq->cfs;
> > > + struct sched_entity *se;
> > > +
> > > + if (!cfs_rq->nr_running)
> > > + return NULL;
> > > +
> > > + do {
> > > + struct sched_entity *curr = cfs_rq->curr;
> > > +
> > > + se = pick_next_entity(cfs_rq, NULL);
> >
> > Calling pick_next_entity clears buddies. This is fine without
> > coresched because the se will be the next one. But calling
> > > pick_task_fair doesn't mean that the se will be used.
>
> Urgh, nice one :/
>
> > > +
> > > + if (curr) {
> > > + if (se && curr->on_rq)
> > > + update_curr(cfs_rq);
> > > +
> >
> > Shouldn't you check if cfs_rq is throttled?
>
> Hmm,... I suppose we do.
>
> > > + if (!se || entity_before(curr, se))
> > > + se = curr;
> > > + }
> > > +
> > > + cfs_rq = group_cfs_rq(se);
> > > + } while (cfs_rq);
> > > +
> > > + return task_of(se);
> > > +}
> > > +#endif
>
> Something like so then?

yes. it seems ok

>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4354,6 +4354,8 @@ check_preempt_tick(struct cfs_rq *cfs_rq
> static void
> set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> + clear_buddies(cfs_rq, se);
> +
> /* 'current' is not kept within the tree. */
> if (se->on_rq) {
> /*
> @@ -4440,8 +4442,6 @@ pick_next_entity(struct cfs_rq *cfs_rq,
> se = cfs_rq->last;
> }
>
> - clear_buddies(cfs_rq, se);
> -
> return se;
> }
>
> @@ -6982,20 +6982,29 @@ static void check_preempt_wakeup(struct
> #ifdef CONFIG_SMP
> static struct task_struct *pick_task_fair(struct rq *rq)
> {
> - struct cfs_rq *cfs_rq = &rq->cfs;
> struct sched_entity *se;
> -
> + struct cfs_rq *cfs_rq;
> +
> +again:
> + cfs_rq = &rq->cfs;
> if (!cfs_rq->nr_running)
> return NULL;
>
> do {
> struct sched_entity *curr = cfs_rq->curr;
>
> - if (curr && curr->on_rq)
> - update_curr(cfs_rq);
> + /* When we pick for a remote RQ, we'll not have done put_prev_entity() */
> + if (curr) {
> + if (curr->on_rq)
> + update_curr(cfs_rq);
> + else
> + curr = NULL;
>
> - se = pick_next_entity(cfs_rq, curr);
> + if (unlikely(check_cfs_rq_runtime(cfs_rq)))
> + goto again;
> + }
>
> + se = pick_next_entity(cfs_rq, curr);
> cfs_rq = group_cfs_rq(se);
> } while (cfs_rq);
>

2020-11-27 08:25:17

by Peter Zijlstra

Subject: Re: [PATCH -tip 02/32] sched: Introduce sched_class::pick_task()

On Thu, Nov 26, 2020 at 11:17:48AM +0100, Vincent Guittot wrote:

> > Something like so then?
>
> yes. it seems ok
>
> >
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c

> > @@ -6982,20 +6982,29 @@ static void check_preempt_wakeup(struct
> > #ifdef CONFIG_SMP
> > static struct task_struct *pick_task_fair(struct rq *rq)
> > {
> > struct sched_entity *se;
> > + struct cfs_rq *cfs_rq;
> > +
> > +again:
> > + cfs_rq = &rq->cfs;
> > if (!cfs_rq->nr_running)
> > return NULL;
> >
> > do {
> > struct sched_entity *curr = cfs_rq->curr;
> >
> > + /* When we pick for a remote RQ, we'll not have done put_prev_entity() */
> > + if (curr) {
> > + if (curr->on_rq)
> > + update_curr(cfs_rq);
> > + else
> > + curr = NULL;
> >
> > + if (unlikely(check_cfs_rq_runtime(cfs_rq)))
> > + goto again;

Head-ache though; pick_task() was supposed to be stateless, but now
we're modifying a remote runqueue... I suppose it still works, because
irrespective of which task we end up picking (even idle), we'll schedule
the remote CPU, which would've resulted in the same (and possibly
triggered a reschedule if we'd not done it here).

There's a wrinkle though: other than in schedule(), where we dequeue()
and keep running with the current task while we release rq->lock, this
has preemption enabled as well.

This means that if we do this, the remote CPU could preempt, but the
task is then no longer on the runqueue.

I _think_ it all still works, but yuck!

> > + }
> >
> > + se = pick_next_entity(cfs_rq, curr);
> > cfs_rq = group_cfs_rq(se);
> > } while (cfs_rq);
> >