2023-11-15 11:34:22

by Cruz Zhao

[permalink] [raw]
Subject: [PATCH 0/4] sched/core: fix cfs_prio_less

The update of the vruntime snapshot can cause unfair scheduling, especially
when tasks enqueue/dequeue frequently.

Consider the following case:
- Task A1 and A2 share a cookie, and task B has another cookie.
- A1 is a short task, waking up frequently but running only briefly every time.
- A2 and B are long tasks.
- A1 and B run on ht0 and A2 runs on ht1.

ht0 ht1 fi_before fi update
switch to A1 switch to A2 0 0 1
A1 sleeps
switch to B A2 force idle 0 1 1
A1 wakes up
switch to A1 switch to A1 1 0 1
A1 sleeps
switch to B A2 force idle 0 1 1

In this case, cfs_rq->min_vruntime_fi will be updated on every schedule, and
the prio of B and A2 will be pulled to the same level, no matter how long A2
and B have run before, which is not fair enough. In an extreme case, we
observed that the latency of a task grew to several minutes due to this
reason, when it should have been about 100ms.

To fix this problem, a possible approach is to maintain another vruntime
relative to the core, called core_vruntime, and we compare the priority
of sched entities (ses) using core_vruntime directly, instead of the vruntime
snapshot. To achieve this goal, we need to introduce cfs_rq->core, similarly
to rq->core, and record core_min_vruntime in cfs_rq->core.

Cruz Zhao (4):
sched/core: Introduce core_id
sched: Introduce cfs_rq->core
sched: introduce core_vruntime and core_min_vruntime
fix vruntime snapshot

include/linux/sched.h | 3 ++
kernel/sched/core.c | 37 +++++++---------
kernel/sched/fair.c | 98 ++++++++++++++++++++++++++-----------------
kernel/sched/sched.h | 5 ++-
4 files changed, 81 insertions(+), 62 deletions(-)

--
2.39.3


2023-11-15 11:34:28

by Cruz Zhao

[permalink] [raw]
Subject: [PATCH 1/4] sched/core: introduce core_id to struct rq

Introduce core_id to struct rq, indicating the cpu id of the core, which
is used for getting the cpu id of rq->core quickly.

Signed-off-by: Cruz Zhao <[email protected]>
---
kernel/sched/core.c | 16 ++++++++++++----
kernel/sched/sched.h | 1 +
2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a708d225c28e..7a685fae73c4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6400,7 +6400,7 @@ static void sched_core_cpu_starting(unsigned int cpu)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
- int t;
+ int t, core_id;

guard(core_lock)(&cpu);

@@ -6417,6 +6417,7 @@ static void sched_core_cpu_starting(unsigned int cpu)
rq = cpu_rq(t);
if (rq->core == rq) {
core_rq = rq;
+ core_id = t;
break;
}
}
@@ -6428,8 +6429,10 @@ static void sched_core_cpu_starting(unsigned int cpu)
for_each_cpu(t, smt_mask) {
rq = cpu_rq(t);

- if (t == cpu)
+ if (t == cpu) {
rq->core = core_rq;
+ rq->core_id = core_id;
+ }

WARN_ON_ONCE(rq->core != core_rq);
}
@@ -6439,7 +6442,7 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
- int t;
+ int t, core_id;

guard(core_lock)(&cpu);

@@ -6458,6 +6461,7 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
if (t == cpu)
continue;
core_rq = cpu_rq(t);
+ core_id = t;
break;
}

@@ -6483,6 +6487,7 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
for_each_cpu(t, smt_mask) {
rq = cpu_rq(t);
rq->core = core_rq;
+ rq->core_id = core_id;
}
}

@@ -6490,8 +6495,10 @@ static inline void sched_core_cpu_dying(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);

- if (rq->core != rq)
+ if (rq->core != rq) {
rq->core = rq;
+ rq->core_id = cpu;
+ }
}

#else /* !CONFIG_SCHED_CORE */
@@ -10008,6 +10015,7 @@ void __init sched_init(void)

#ifdef CONFIG_SCHED_CORE
rq->core = rq;
+ rq->core_id = i;
rq->core_pick = NULL;
rq->core_enabled = 0;
rq->core_tree = RB_ROOT;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2e5a95486a42..1b62165fc840 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1121,6 +1121,7 @@ struct rq {
#ifdef CONFIG_SCHED_CORE
/* per rq */
struct rq *core;
+ unsigned int core_id;
struct task_struct *core_pick;
unsigned int core_enabled;
unsigned int core_sched_seq;
--
2.39.3

2023-11-15 11:34:44

by Cruz Zhao

[permalink] [raw]
Subject: [PATCH 2/4] sched/core: introduce core to struct cfs_rq

Introduce core to struct cfs_rq, indicating the corresponding cfs_rq of
rq->core.

Signed-off-by: Cruz Zhao <[email protected]>
---
kernel/sched/core.c | 4 ++++
kernel/sched/fair.c | 11 +++++++++++
kernel/sched/sched.h | 1 +
3 files changed, 16 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7a685fae73c4..647a12af9172 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6432,6 +6432,7 @@ static void sched_core_cpu_starting(unsigned int cpu)
if (t == cpu) {
rq->core = core_rq;
rq->core_id = core_id;
+ rq->cfs.core = &core_rq->cfs;
}

WARN_ON_ONCE(rq->core != core_rq);
@@ -6488,6 +6489,7 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
rq = cpu_rq(t);
rq->core = core_rq;
rq->core_id = core_id;
+ rq->cfs.core = &core_rq->cfs;
}
}

@@ -6498,6 +6500,7 @@ static inline void sched_core_cpu_dying(unsigned int cpu)
if (rq->core != rq) {
rq->core = rq;
rq->core_id = cpu;
+ rq->cfs.core = &rq->cfs;
}
}

@@ -10016,6 +10019,7 @@ void __init sched_init(void)
#ifdef CONFIG_SCHED_CORE
rq->core = rq;
rq->core_id = i;
+ rq->cfs.core = &rq->cfs;
rq->core_pick = NULL;
rq->core_enabled = 0;
rq->core_tree = RB_ROOT;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2048138ce54b..61cbaa3cc385 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12420,6 +12420,16 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
return delta > 0;
}

+void sched_core_init_cfs_rq(struct task_group *tg, struct cfs_rq *cfs_rq)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct rq *rq = rq_of(cfs_rq);
+ int core_id = rq->core_id;
+
+ cfs_rq->core = tg->cfs_rq[core_id];
+#endif
+}
+
static int task_is_throttled_fair(struct task_struct *p, int cpu)
{
struct cfs_rq *cfs_rq;
@@ -12715,6 +12725,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)

init_cfs_rq(cfs_rq);
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
+ sched_core_init_cfs_rq(tg, cfs_rq);
init_entity_runnable_average(se);
}

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1b62165fc840..62fca54223a1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -547,6 +547,7 @@ struct cfs_rq {
#ifdef CONFIG_SCHED_CORE
unsigned int forceidle_seq;
u64 min_vruntime_fi;
+ struct cfs_rq *core;
#endif

#ifndef CONFIG_64BIT
--
2.39.3

2023-11-15 11:35:00

by Cruz Zhao

[permalink] [raw]
Subject: [PATCH 3/4] sched/fair: introduce core_vruntime and core_min_vruntime

To compare the priority of sched_entity from different cpus of a core,
we introduce core_vruntime to struct sched_entity and core_min_vruntime
to struct cfs_rq.

cfs_rq->core->core_min_vruntime records the min vruntime of the cfs_rqs
of the same task_group among the core, and se->core_vruntime is the
vruntime relative to se->cfs_rq->core->core_min_vruntime.

Signed-off-by: Cruz Zhao <[email protected]>
---
include/linux/sched.h | 3 +++
kernel/sched/fair.c | 52 ++++++++++++++++++++++++++++++++++++++-----
kernel/sched/sched.h | 1 +
3 files changed, 51 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 292c31697248..df481a8ebc07 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -562,6 +562,9 @@ struct sched_entity {
u64 sum_exec_runtime;
u64 prev_sum_exec_runtime;
u64 vruntime;
+#ifdef CONFIG_SCHED_CORE
+ u64 core_vruntime;
+#endif
s64 vlag;
u64 slice;

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 61cbaa3cc385..60b2fd437474 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -750,30 +750,58 @@ static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
return min_vruntime;
}

+#ifdef CONFIG_SCHED_CORE
+static u64 __update_core_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
+{
+ u64 min_vruntime = cfs_rq->core_min_vruntime;
+ s64 delta = (s64)(vruntime - min_vruntime);
+
+ return delta > 0 ? vruntime : min_vruntime;
+}
+#endif
+
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *se = __pick_first_entity(cfs_rq);
struct sched_entity *curr = cfs_rq->curr;

u64 vruntime = cfs_rq->min_vruntime;
+#ifdef CONFIG_SCHED_CORE
+ u64 core_vruntime = cfs_rq->core->min_vruntime;
+#endif

if (curr) {
- if (curr->on_rq)
+ if (curr->on_rq) {
vruntime = curr->vruntime;
- else
+#ifdef CONFIG_SCHED_CORE
+ core_vruntime = curr->core_vruntime;
+#endif
+ } else {
curr = NULL;
+ }
}

if (se) {
- if (!curr)
+ if (!curr) {
vruntime = se->vruntime;
- else
+#ifdef CONFIG_SCHED_CORE
+ core_vruntime = se->core_vruntime;
+#endif
+ } else {
vruntime = min_vruntime(vruntime, se->vruntime);
+#ifdef CONFIG_SCHED_CORE
+ core_vruntime = min_vruntime(core_vruntime, se->core_vruntime);
+#endif
+ }
}

/* ensure we never gain time by being placed backwards. */
u64_u32_store(cfs_rq->min_vruntime,
__update_min_vruntime(cfs_rq, vruntime));
+#ifdef CONFIG_SCHED_CORE
+ u64_u32_store(cfs_rq->core->core_min_vruntime,
+ __update_core_min_vruntime(cfs_rq->core, vruntime));
+#endif
}

static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
@@ -1137,6 +1165,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
struct sched_entity *curr = cfs_rq->curr;
u64 now = rq_clock_task(rq_of(cfs_rq));
u64 delta_exec;
+ u64 delta_exec_fair;

if (unlikely(!curr))
return;
@@ -1158,7 +1187,11 @@ static void update_curr(struct cfs_rq *cfs_rq)
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq->exec_clock, delta_exec);

- curr->vruntime += calc_delta_fair(delta_exec, curr);
+ delta_exec_fair = calc_delta_fair(delta_exec, curr);
+ curr->vruntime += delta_exec_fair;
+#ifdef CONFIG_SCHED_CORE
+ curr->core_vruntime += delta_exec_fair;
+#endif
update_deadline(cfs_rq, curr);
update_min_vruntime(cfs_rq);

@@ -5009,6 +5042,9 @@ static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
u64 vslice, vruntime = avg_vruntime(cfs_rq);
+#ifdef CONFIG_SCHED_CORE
+ u64 core_vruntime = cfs_rq->core->core_min_vruntime + vruntime - cfs_rq->min_vruntime;
+#endif
s64 lag = 0;

se->slice = sysctl_sched_base_slice;
@@ -5091,6 +5127,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
}

se->vruntime = vruntime - lag;
+#ifdef CONFIG_SCHED_CORE
+ se->core_vruntime = core_vruntime - lag;
+#endif

/*
* When joining the competition; the exisiting tasks will be,
@@ -12655,6 +12694,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20)));
+#ifdef CONFIG_SCHED_CORE
+ u64_u32_store(cfs_rq->core_min_vruntime, (u64)(-(1LL << 20)));
+#endif
#ifdef CONFIG_SMP
raw_spin_lock_init(&cfs_rq->removed.lock);
#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 62fca54223a1..f9d3701481f1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -545,6 +545,7 @@ struct cfs_rq {
u64 exec_clock;
u64 min_vruntime;
#ifdef CONFIG_SCHED_CORE
+ u64 core_min_vruntime;
unsigned int forceidle_seq;
u64 min_vruntime_fi;
struct cfs_rq *core;
--
2.39.3

2023-11-15 12:20:58

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 3/4] sched/fair: introduce core_vruntime and core_min_vruntime

On Wed, Nov 15, 2023 at 07:33:40PM +0800, Cruz Zhao wrote:
> To compare the priority of sched_entity from different cpus of a core,
> we introduce core_vruntime to struct sched_entity and core_min_vruntime
> to struct cfs_rq.
>
> cfs_rq->core->core_min_vruntime records the min vruntime of the cfs_rqs
> of the same task_group among the core, and se->core_vruntime is the
> vruntime relative to se->cfs_rq->core->core_min_vruntime.

But that makes absolutely no sense. vruntime of different RQs can
advance at wildly different rates. Not to mention there's this random
offset between them.

No, this cannot be.

2023-11-15 13:43:04

by Cruz Zhao

[permalink] [raw]
Subject: Re: [PATCH 3/4] sched/fair: introduce core_vruntime and core_min_vruntime



在 2023/11/15 下午8:20, Peter Zijlstra 写道:
> On Wed, Nov 15, 2023 at 07:33:40PM +0800, Cruz Zhao wrote:
>> To compare the priority of sched_entity from different cpus of a core,
>> we introduce core_vruntime to struct sched_entity and core_min_vruntime
>> to struct cfs_rq.
>>
>> cfs_rq->core->core_min_vruntime records the min vruntime of the cfs_rqs
>> of the same task_group among the core, and se->core_vruntime is the
>> vruntime relative to se->cfs_rq->core->core_min_vruntime.
>
> But that makes absolutely no sense. vruntime of different RQs can
> advance at wildly different rates. Not to mention there's this random
> offset between them.
>
> No, this cannot be.

Force idle vruntime snapshot does the same thing, comparing
sea->vruntime - cfs_rqa->min_vruntime_fi with seb->vruntime -
cfs_rqb->min_vruntime_fi, while sea and seb may have wildly different rates.

Actually, cfs_rq->core->core_min_vruntime does the same thing as
cfs_rq->min_vruntime_fi, providing a baseline, but
cfs_rq->core->core_min_vruntime is more accurate.

I've tried to implement a fair enough mechanism of core_vruntime, but
it's too complex because of the weight, and it costs a lot. So this is a
compromise solution.

BTW, is there any other solutions to solve this problem?

Best,
Cruz Zhao

2023-11-15 15:23:29

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 3/4] sched/fair: introduce core_vruntime and core_min_vruntime

On Wed, Nov 15, 2023 at 09:42:13PM +0800, cruzzhao wrote:
>
>
> 在 2023/11/15 下午8:20, Peter Zijlstra 写道:
> > On Wed, Nov 15, 2023 at 07:33:40PM +0800, Cruz Zhao wrote:
> >> To compare the priority of sched_entity from different cpus of a core,
> >> we introduce core_vruntime to struct sched_entity and core_min_vruntime
> >> to struct cfs_rq.
> >>
> >> cfs_rq->core->core_min_vruntime records the min vruntime of the cfs_rqs
> >> of the same task_group among the core, and se->core_vruntime is the
> >> vruntime relative to se->cfs_rq->core->core_min_vruntime.
> >
> > But that makes absolutely no sense. vruntime of different RQs can
> > advance at wildly different rates. Not to mention there's this random
> > offset between them.
> >
> > No, this cannot be.
>
> Force idle vruntime snapshot does the same thing, comparing
> sea->vruntime - cfs_rqa->min_vruntime_fi with seb->vruntime -
> cfs_rqb->min_vruntime_fi, while sea and seb may have wildly different rates.

But that subtracts a from a and b from b, it doesn't mix a and b.

Note that se->vruntime - cfs_rq->min_vruntime is a very poor
approximation of lag. We have actual lag now.

Note that:

(sea - seb) + (min_fib - min_fia) =
(sea - min_fia) + (min_fib - seb) =
(sea - min_fia) - (seb - min_fib) =
'lag'a - 'lag'b

It doesn't mix absolute a and b terms anywhere.

> Actually, cfs_rq->core->core_min_vruntime does the same thing as
> cfs_rq->min_vruntime_fi, providing a baseline, but
> cfs_rq->core->core_min_vruntime is more accurate.

min(cfs_rqa, cfs_rqb) is nonsense. And I can't see how min_vruntime_fi
would do anything like that.

> I've tried to implement a fair enough mechanism of core_vruntime, but
> it's too complex because of the weight, and it costs a lot. So this is a
> compromise solution.

'this' is complete nonsense and not motivated by any math.

> BTW, is there any other solutions to solve this problem?

Well, this is where it all started:

https://lkml.kernel.org/r/20200506143506.GH5298%40hirez.programming.kicks-ass.net

The above lag thing is detailed in a follow up:

https://lkml.kernel.org/r/20200515103844.GG2978%40hirez.programming.kicks-ass.net

Anyway, I think the first of those links has the start of the
multi-queue formalism, see the S_k+l bits. Work that out and see where
it ends.

I did go a bit further, but I've forgotten everything since, it's been
years.

Anyway, nothing like this goes in without a fairly solid bit of math in
the changelog to justify it.

Also, I think Joel complained about something like this at some point,
and he wanted to update the core tree more often, because IIRc his
observation was that things got stale or something.

2023-11-15 20:10:04

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH 2/4] sched/core: introduce core to struct cfs_rq

Hi Cruz,

kernel test robot noticed the following build warnings:

[auto build test WARNING on linus/master]
[also build test WARNING on v6.7-rc1 next-20231115]
[cannot apply to tip/sched/core]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Cruz-Zhao/sched-core-introduce-core_id-to-struct-rq/20231115-193559
base: linus/master
patch link: https://lore.kernel.org/r/20231115113341.13261-3-CruzZhao%40linux.alibaba.com
patch subject: [PATCH 2/4] sched/core: introduce core to struct cfs_rq
config: x86_64-buildonly-randconfig-003-20231116 (https://download.01.org/0day-ci/archive/20231116/[email protected]/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231116/[email protected]/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/

All warnings (new ones prefixed by >>):

>> kernel/sched/fair.c:12423:6: warning: no previous prototype for 'sched_core_init_cfs_rq' [-Wmissing-prototypes]
12423 | void sched_core_init_cfs_rq(struct task_group *tg, struct cfs_rq *cfs_rq)
| ^~~~~~~~~~~~~~~~~~~~~~


vim +/sched_core_init_cfs_rq +12423 kernel/sched/fair.c

12422
12423 void sched_core_init_cfs_rq(struct task_group *tg, struct cfs_rq *cfs_rq)
12424 {
12425 #ifdef CONFIG_FAIR_GROUP_SCHED
12426 struct rq *rq = rq_of(cfs_rq);
12427 int core_id = rq->core_id;
12428
12429 cfs_rq->core = tg->cfs_rq[core_id];
12430 #endif
12431 }
12432

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

2023-11-15 20:20:51

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH 2/4] sched/core: introduce core to struct cfs_rq

Hi Cruz,

kernel test robot noticed the following build errors:

[auto build test ERROR on linus/master]
[also build test ERROR on v6.7-rc1 next-20231115]
[cannot apply to tip/sched/core]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Cruz-Zhao/sched-core-introduce-core_id-to-struct-rq/20231115-193559
base: linus/master
patch link: https://lore.kernel.org/r/20231115113341.13261-3-CruzZhao%40linux.alibaba.com
patch subject: [PATCH 2/4] sched/core: introduce core to struct cfs_rq
config: x86_64-buildonly-randconfig-004-20231116 (https://download.01.org/0day-ci/archive/20231116/[email protected]/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231116/[email protected]/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/

All errors (new ones prefixed by >>):

kernel/sched/fair.c: In function 'alloc_fair_sched_group':
>> kernel/sched/fair.c:12728:17: error: implicit declaration of function 'sched_core_init_cfs_rq'; did you mean 'sched_core_idle_cpu'? [-Werror=implicit-function-declaration]
12728 | sched_core_init_cfs_rq(tg, cfs_rq);
| ^~~~~~~~~~~~~~~~~~~~~~
| sched_core_idle_cpu
cc1: some warnings being treated as errors


vim +12728 kernel/sched/fair.c

12697
12698 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
12699 {
12700 struct sched_entity *se;
12701 struct cfs_rq *cfs_rq;
12702 int i;
12703
12704 tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
12705 if (!tg->cfs_rq)
12706 goto err;
12707 tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
12708 if (!tg->se)
12709 goto err;
12710
12711 tg->shares = NICE_0_LOAD;
12712
12713 init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent));
12714
12715 for_each_possible_cpu(i) {
12716 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
12717 GFP_KERNEL, cpu_to_node(i));
12718 if (!cfs_rq)
12719 goto err;
12720
12721 se = kzalloc_node(sizeof(struct sched_entity_stats),
12722 GFP_KERNEL, cpu_to_node(i));
12723 if (!se)
12724 goto err_free_rq;
12725
12726 init_cfs_rq(cfs_rq);
12727 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
12728 sched_core_init_cfs_rq(tg, cfs_rq);
12729 init_entity_runnable_average(se);
12730 }
12731
12732 return 1;
12733
12734 err_free_rq:
12735 kfree(cfs_rq);
12736 err:
12737 return 0;
12738 }
12739

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

2023-11-15 21:00:57

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH 3/4] sched/fair: introduce core_vruntime and core_min_vruntime

Hi Cruz,

kernel test robot noticed the following build errors:

[auto build test ERROR on linus/master]
[also build test ERROR on v6.7-rc1 next-20231115]
[cannot apply to tip/sched/core]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Cruz-Zhao/sched-core-introduce-core_id-to-struct-rq/20231115-193559
base: linus/master
patch link: https://lore.kernel.org/r/20231115113341.13261-4-CruzZhao%40linux.alibaba.com
patch subject: [PATCH 3/4] sched/fair: introduce core_vruntime and core_min_vruntime
config: i386-buildonly-randconfig-005-20231116 (https://download.01.org/0day-ci/archive/20231116/[email protected]/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231116/[email protected]/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/

All errors (new ones prefixed by >>):

In file included from kernel/sched/fair.c:54:
kernel/sched/fair.c: In function 'update_min_vruntime':
>> kernel/sched/fair.c:802:37: error: 'struct cfs_rq' has no member named 'core_min_vruntime_copy'; did you mean 'core_min_vruntime'?
802 | u64_u32_store(cfs_rq->core->core_min_vruntime,
| ^~~~~~~~~~~~~~~~~
kernel/sched/sched.h:528:9: note: in definition of macro 'u64_u32_store_copy'
528 | copy = __val; \
| ^~~~
kernel/sched/fair.c:802:9: note: in expansion of macro 'u64_u32_store'
802 | u64_u32_store(cfs_rq->core->core_min_vruntime,
| ^~~~~~~~~~~~~
kernel/sched/fair.c: At top level:
kernel/sched/fair.c:12462:6: warning: no previous prototype for 'sched_core_init_cfs_rq' [-Wmissing-prototypes]
12462 | void sched_core_init_cfs_rq(struct task_group *tg, struct cfs_rq *cfs_rq)
| ^~~~~~~~~~~~~~~~~~~~~~
kernel/sched/fair.c: In function 'init_cfs_rq':
kernel/sched/fair.c:12698:31: error: 'struct cfs_rq' has no member named 'core_min_vruntime_copy'; did you mean 'core_min_vruntime'?
12698 | u64_u32_store(cfs_rq->core_min_vruntime, (u64)(-(1LL << 20)));
| ^~~~~~~~~~~~~~~~~
kernel/sched/sched.h:528:9: note: in definition of macro 'u64_u32_store_copy'
528 | copy = __val; \
| ^~~~
kernel/sched/fair.c:12698:9: note: in expansion of macro 'u64_u32_store'
12698 | u64_u32_store(cfs_rq->core_min_vruntime, (u64)(-(1LL << 20)));
| ^~~~~~~~~~~~~
kernel/sched/fair.c: At top level:
kernel/sched/fair.c:12985:6: warning: no previous prototype for 'free_fair_sched_group' [-Wmissing-prototypes]
12985 | void free_fair_sched_group(struct task_group *tg) { }
| ^~~~~~~~~~~~~~~~~~~~~
kernel/sched/fair.c:12987:5: warning: no previous prototype for 'alloc_fair_sched_group' [-Wmissing-prototypes]
12987 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
| ^~~~~~~~~~~~~~~~~~~~~~
kernel/sched/fair.c:12992:6: warning: no previous prototype for 'online_fair_sched_group' [-Wmissing-prototypes]
12992 | void online_fair_sched_group(struct task_group *tg) { }
| ^~~~~~~~~~~~~~~~~~~~~~~
kernel/sched/fair.c:12994:6: warning: no previous prototype for 'unregister_fair_sched_group' [-Wmissing-prototypes]
12994 | void unregister_fair_sched_group(struct task_group *tg) { }
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~


vim +802 kernel/sched/fair.c

772
773 if (curr) {
774 if (curr->on_rq) {
775 vruntime = curr->vruntime;
776 #ifdef CONFIG_SCHED_CORE
777 core_vruntime = curr->core_vruntime;
778 #endif
779 } else {
780 curr = NULL;
781 }
782 }
783
784 if (se) {
785 if (!curr) {
786 vruntime = se->vruntime;
787 #ifdef CONFIG_SCHED_CORE
788 core_vruntime = se->core_vruntime;
789 #endif
790 } else {
791 vruntime = min_vruntime(vruntime, se->vruntime);
792 #ifdef CONFIG_SCHED_CORE
793 core_vruntime = min_vruntime(core_vruntime, se->core_vruntime);
794 #endif
795 }
796 }
797
798 /* ensure we never gain time by being placed backwards. */
799 u64_u32_store(cfs_rq->min_vruntime,
800 __update_min_vruntime(cfs_rq, vruntime));
801 #ifdef CONFIG_SCHED_CORE
> 802 u64_u32_store(cfs_rq->core->core_min_vruntime,
803 __update_core_min_vruntime(cfs_rq->core, vruntime));
804 #endif
805 }
806

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

2023-11-16 06:38:56

by Cruz Zhao

[permalink] [raw]
Subject: Re: [PATCH 3/4] sched/fair: introduce core_vruntime and core_min_vruntime



在 2023/11/15 下午11:22, Peter Zijlstra 写道:
> On Wed, Nov 15, 2023 at 09:42:13PM +0800, cruzzhao wrote:
>>
>>
>> 在 2023/11/15 下午8:20, Peter Zijlstra 写道:
>>> On Wed, Nov 15, 2023 at 07:33:40PM +0800, Cruz Zhao wrote:
>>>> To compare the priority of sched_entity from different cpus of a core,
>>>> we introduce core_vruntime to struct sched_entity and core_min_vruntime
>>>> to struct cfs_rq.
>>>>
>>>> cfs_rq->core->core_min_vruntime records the min vruntime of the cfs_rqs
>>>> of the same task_group among the core, and se->core_vruntime is the
>>>> vruntime relative to se->cfs_rq->core->core_min_vruntime.
>>>
>>> But that makes absolutely no sense. vruntime of different RQs can
>>> advance at wildly different rates. Not to mention there's this random
>>> offset between them.
>>>
>>> No, this cannot be.
>>
>> Force idle vruntime snapshot does the same thing, comparing
>> sea->vruntime - cfs_rqa->min_vruntime_fi with seb->vruntime -
>> cfs_rqb->min_vruntime_fi, while sea and seb may have wildly different rates.
>
> But that subtracts a from a and b from b, it doesn't mix a and b.
>
> Note that se->vruntime - cfs_rq->min_vruntime is a very poor
> approximation of lag. We have actual lag now.
>
> Note that:
>
> (sea - seb) + (min_fib - min_fia) =
> (sea - min_fia) + (min_fib - seb) =
> (sea - min_fia) - (seb - min_fib) =
> 'lag'a - 'lag'b
>
> It doesn't mix absolute a and b terms anywhere.
>
>> Actually, cfs_rq->core->core_min_vruntime does the same thing as
>> cfs_rq->min_vruntime_fi, providing a baseline, but
>> cfs_rq->core->core_min_vruntime is more accurate.
>
> min(cfs_rqa, cfs_rqb) is nonsense. And I can't see how min_vruntime_fi
> would do anything like that.
>
>> I've tried to implement a fair enough mechanism of core_vruntime, but
>> it's too complex because of the weight, and it costs a lot. So this is a
>> compromise solution.
>
> 'this' is complete nonsense and not motivated by any math.
>
>> BTW, is there any other solutions to solve this problem?
>
> Well, this is where it all started:
>
> https://lkml.kernel.org/r/20200506143506.GH5298%40hirez.programming.kicks-ass.net
>
> The above lag thing is detailed in a follow up:
>
> https://lkml.kernel.org/r/20200515103844.GG2978%40hirez.programming.kicks-ass.net

Many thanks, I'll study the discussion about this.

>
> Anyway, I think the first of those links has the start of the
> multi-queue formalism, see the S_k+l bits. Work that out and see where
> it ends.
>
> I did go a bit further, but I've forgotten everything since, it's been
> years.
>
> Anyway, nothing like this goes in without a fairly solid bit of math in
> the changelog to justify it.
>
> Also, I think Joel complained about something like this at some point,
> and he wanted to update the core tree more often, because IIRc his
> observation was that things got stale or something.

Many thanks for reviewing. I'll think about this more comprehensively.

Best,
Cruz Zhao

2023-11-17 02:48:50

by Cruz Zhao

[permalink] [raw]
Subject: Re: [PATCH 3/4] sched/fair: introduce core_vruntime and core_min_vruntime



在 2023/11/15 下午11:22, Peter Zijlstra 写道:
> On Wed, Nov 15, 2023 at 09:42:13PM +0800, cruzzhao wrote:
>>
>>
>> 在 2023/11/15 下午8:20, Peter Zijlstra 写道:
>>> On Wed, Nov 15, 2023 at 07:33:40PM +0800, Cruz Zhao wrote:
>>>> To compare the priority of sched_entity from different cpus of a core,
>>>> we introduce core_vruntime to struct sched_entity and core_min_vruntime
>>>> to struct cfs_rq.
>>>>
>>>> cfs_rq->core->core_min_vruntime records the min vruntime of the cfs_rqs
>>>> of the same task_group among the core, and se->core_vruntime is the
>>>> vruntime relative to se->cfs_rq->core->core_min_vruntime.
>>>
>>> But that makes absolutely no sense. vruntime of different RQs can
>>> advance at wildly different rates. Not to mention there's this random
>>> offset between them.
>>>
>>> No, this cannot be.
>>
>> Force idle vruntime snapshot does the same thing, comparing
>> sea->vruntime - cfs_rqa->min_vruntime_fi with seb->vruntime -
>> cfs_rqb->min_vruntime_fi, while sea and seb may have wildly different rates.
>
> But that subtracts a from a and b from b, it doesn't mix a and b.
>
> Note that se->vruntime - cfs_rq->min_vruntime is a very poor
> approximation of lag. We have actual lag now.
>
> Note that:
>
> (sea - seb) + (min_fib - min_fia) =
> (sea - min_fia) + (min_fib - seb) =
> (sea - min_fia) - (seb - min_fib) =
> 'lag'a - 'lag'b
>
> It doesn't mix absolute a and b terms anywhere.
>
>> Actually, cfs_rq->core->core_min_vruntime does the same thing as
>> cfs_rq->min_vruntime_fi, providing a baseline, but
>> cfs_rq->core->core_min_vruntime is more accurate.
>
> min(cfs_rqa, cfs_rqb) is nonsense. And I can't see how min_vruntime_fi
> would do anything like that.
>

Introducing core_vruntime and core_min_vruntime is a try to maintain a
single core wide cfs_rq, abstracting vruntime, and core_min_vruntime
doesn't equal to min(cfs_rqa, cfs_rqb).

Note that:
sea->core_vruntime - seb->core_vruntime =
sea->core_vruntime - seb->core_vruntime + core_min_vruntime -
core_min_vruntime =
(sea->core_vruntime - core_min_vruntime) - (seb->core_vruntime -
core_min_vruntime) =
'lag'a - 'lag'b

The problem about wildly different vruntime rates also happens with
vruntime snapshot. Consider the case that a core always force idle some
SMT, and the min_vruntime_fi will never update. In this case, 'lag'a and
'lag'b increase according to their respective weights in cfs, instead of
the core wide weights.

Afaic, there is no perfect solution or mechanism to solve this problem
yet, but I'll try.

Best,
Cruz Zhao

2023-11-18 10:49:13

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH 2/4] sched/core: introduce core to struct cfs_rq

Hi Cruz,

kernel test robot noticed the following build warnings:

[auto build test WARNING on linus/master]
[also build test WARNING on v6.7-rc1 next-20231117]
[cannot apply to tip/sched/core]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Cruz-Zhao/sched-core-introduce-core_id-to-struct-rq/20231115-193559
base: linus/master
patch link: https://lore.kernel.org/r/20231115113341.13261-3-CruzZhao%40linux.alibaba.com
patch subject: [PATCH 2/4] sched/core: introduce core to struct cfs_rq
config: x86_64-randconfig-123-20231118 (https://download.01.org/0day-ci/archive/20231118/[email protected]/config)
compiler: gcc-11 (Debian 11.3.0-12) 11.3.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231118/[email protected]/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/

sparse warnings: (new ones prefixed by >>)
kernel/sched/fair.c:1178:34: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct sched_entity const *se @@ got struct sched_entity [noderef] __rcu * @@
kernel/sched/fair.c:1178:34: sparse: expected struct sched_entity const *se
kernel/sched/fair.c:1178:34: sparse: got struct sched_entity [noderef] __rcu *
kernel/sched/fair.c:2949:13: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct task_struct *tsk @@ got struct task_struct [noderef] __rcu * @@
kernel/sched/fair.c:2949:13: sparse: expected struct task_struct *tsk
kernel/sched/fair.c:2949:13: sparse: got struct task_struct [noderef] __rcu *
kernel/sched/fair.c:12185:9: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain *[assigned] sd @@ got struct sched_domain [noderef] __rcu *parent @@
kernel/sched/fair.c:12185:9: sparse: expected struct sched_domain *[assigned] sd
kernel/sched/fair.c:12185:9: sparse: got struct sched_domain [noderef] __rcu *parent
kernel/sched/fair.c:7801:20: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain *[assigned] sd @@ got struct sched_domain [noderef] __rcu *parent @@
kernel/sched/fair.c:7801:20: sparse: expected struct sched_domain *[assigned] sd
kernel/sched/fair.c:7801:20: sparse: got struct sched_domain [noderef] __rcu *parent
kernel/sched/fair.c:8006:9: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain *[assigned] tmp @@ got struct sched_domain [noderef] __rcu *parent @@
kernel/sched/fair.c:8006:9: sparse: expected struct sched_domain *[assigned] tmp
kernel/sched/fair.c:8006:9: sparse: got struct sched_domain [noderef] __rcu *parent
kernel/sched/fair.c:8105:38: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected struct task_struct *curr @@ got struct task_struct [noderef] __rcu *curr @@
kernel/sched/fair.c:8105:38: sparse: expected struct task_struct *curr
kernel/sched/fair.c:8105:38: sparse: got struct task_struct [noderef] __rcu *curr
kernel/sched/fair.c:8385:38: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected struct task_struct *curr @@ got struct task_struct [noderef] __rcu *curr @@
kernel/sched/fair.c:8385:38: sparse: expected struct task_struct *curr
kernel/sched/fair.c:8385:38: sparse: got struct task_struct [noderef] __rcu *curr
kernel/sched/fair.c:9376:40: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected struct sched_domain *child @@ got struct sched_domain [noderef] __rcu *child @@
kernel/sched/fair.c:9376:40: sparse: expected struct sched_domain *child
kernel/sched/fair.c:9376:40: sparse: got struct sched_domain [noderef] __rcu *child
kernel/sched/fair.c:10013:22: sparse: sparse: incompatible types in comparison expression (different address spaces):
kernel/sched/fair.c:10013:22: sparse: struct task_struct [noderef] __rcu *
kernel/sched/fair.c:10013:22: sparse: struct task_struct *
kernel/sched/fair.c:11445:9: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain *[assigned] sd @@ got struct sched_domain [noderef] __rcu *parent @@
kernel/sched/fair.c:11445:9: sparse: expected struct sched_domain *[assigned] sd
kernel/sched/fair.c:11445:9: sparse: got struct sched_domain [noderef] __rcu *parent
kernel/sched/fair.c:11102:44: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected struct sched_domain *sd_parent @@ got struct sched_domain [noderef] __rcu *parent @@
kernel/sched/fair.c:11102:44: sparse: expected struct sched_domain *sd_parent
kernel/sched/fair.c:11102:44: sparse: got struct sched_domain [noderef] __rcu *parent
kernel/sched/fair.c:11541:9: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct sched_domain *[assigned] sd @@ got struct sched_domain [noderef] __rcu *parent @@
kernel/sched/fair.c:11541:9: sparse: expected struct sched_domain *[assigned] sd
kernel/sched/fair.c:11541:9: sparse: got struct sched_domain [noderef] __rcu *parent
>> kernel/sched/fair.c:12423:6: sparse: sparse: symbol 'sched_core_init_cfs_rq' was not declared. Should it be static?
kernel/sched/fair.c:6418:35: sparse: sparse: marked inline, but without a definition
kernel/sched/fair.c: note: in included file:
kernel/sched/sched.h:2283:9: sparse: sparse: incompatible types in comparison expression (different address spaces):
kernel/sched/sched.h:2283:9: sparse: struct task_struct [noderef] __rcu *
kernel/sched/sched.h:2283:9: sparse: struct task_struct *
kernel/sched/sched.h:2119:25: sparse: sparse: incompatible types in comparison expression (different address spaces):
kernel/sched/sched.h:2119:25: sparse: struct task_struct [noderef] __rcu *
kernel/sched/sched.h:2119:25: sparse: struct task_struct *
kernel/sched/sched.h:2119:25: sparse: sparse: incompatible types in comparison expression (different address spaces):
kernel/sched/sched.h:2119:25: sparse: struct task_struct [noderef] __rcu *
kernel/sched/sched.h:2119:25: sparse: struct task_struct *

vim +/sched_core_init_cfs_rq +12423 kernel/sched/fair.c

12422
12423 void sched_core_init_cfs_rq(struct task_group *tg, struct cfs_rq *cfs_rq)
12424 {
12425 #ifdef CONFIG_FAIR_GROUP_SCHED
12426 struct rq *rq = rq_of(cfs_rq);
12427 int core_id = rq->core_id;
12428
12429 cfs_rq->core = tg->cfs_rq[core_id];
12430 #endif
12431 }
12432

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki