2019-02-18 19:06:28

by Peter Zijlstra

Subject: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.

Instead of only selecting a local task, select a task for all SMT
siblings for every reschedule on the core (irrespective of which logical
CPU does the reschedule).

NOTE: there is still potential for sibling rivalry.
NOTE: this is far too complicated; but thus far I've failed to
simplify it further.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/core.c | 222 ++++++++++++++++++++++++++++++++++++++++++++++++++-
kernel/sched/sched.h | 5 -
2 files changed, 224 insertions(+), 3 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3552,7 +3552,7 @@ static inline void schedule_debug(struct
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
const struct sched_class *class;
struct task_struct *p;
@@ -3597,6 +3597,220 @@ pick_next_task(struct rq *rq, struct tas
BUG();
}

+#ifdef CONFIG_SCHED_CORE
+
+static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
+{
+ if (is_idle_task(a) || is_idle_task(b))
+ return true;
+
+ return a->core_cookie == b->core_cookie;
+}
+
+// XXX fairness/fwd progress conditions
+static struct task_struct *
+pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max)
+{
+ struct task_struct *class_pick, *cookie_pick;
+ unsigned long cookie = 0UL;
+
+ /*
+ * We must not rely on rq->core->core_cookie here, because we fail to reset
+ * rq->core->core_cookie on new picks, such that we can detect if we need
+ * to do single vs multi rq task selection.
+ */
+
+ if (max && max->core_cookie) {
+ WARN_ON_ONCE(rq->core->core_cookie != max->core_cookie);
+ cookie = max->core_cookie;
+ }
+
+ class_pick = class->pick_task(rq);
+ if (!cookie)
+ return class_pick;
+
+ cookie_pick = sched_core_find(rq, cookie);
+ if (!class_pick)
+ return cookie_pick;
+
+ /*
+ * If class > max && class > cookie, it is the highest priority task on
+ * the core (so far) and it must be selected, otherwise we must go with
+ * the cookie pick in order to satisfy the constraint.
+ */
+ if (cpu_prio_less(cookie_pick, class_pick) && cpu_prio_less(max, class_pick))
+ return class_pick;
+
+ return cookie_pick;
+}
+
+static struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ struct task_struct *next, *max = NULL;
+ const struct sched_class *class;
+ const struct cpumask *smt_mask;
+ int i, j, cpu;
+
+ if (!sched_core_enabled(rq))
+ return __pick_next_task(rq, prev, rf);
+
+ /*
+ * If there were no {en,de}queues since we picked (IOW, the task
+ * pointers are all still valid), and we haven't scheduled the last
+ * pick yet, do so now.
+ */
+ if (rq->core->core_pick_seq == rq->core->core_task_seq &&
+ rq->core->core_pick_seq != rq->core_sched_seq) {
+ WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
+
+ next = rq->core_pick;
+ if (next != prev) {
+ put_prev_task(rq, prev);
+ set_next_task(rq, next);
+ }
+ return next;
+ }
+
+ prev->sched_class->put_prev_task(rq, prev, rf);
+ if (!rq->nr_running)
+ newidle_balance(rq, rf);
+
+ cpu = cpu_of(rq);
+ smt_mask = cpu_smt_mask(cpu);
+
+ /*
+ * core->core_task_seq, core->core_pick_seq, rq->core_sched_seq
+ *
+ * @task_seq guards the task state ({en,de}queues)
+ * @pick_seq is the @task_seq we did a selection on
+ * @sched_seq is the @pick_seq we scheduled
+ *
+ * However, preemptions can cause multiple picks on the same task set.
+ * 'Fix' this by also increasing @task_seq for every pick.
+ */
+ rq->core->core_task_seq++;
+
+ /* reset state */
+ for_each_cpu(i, smt_mask) {
+ struct rq *rq_i = cpu_rq(i);
+
+ rq_i->core_pick = NULL;
+
+ if (i != cpu)
+ update_rq_clock(rq_i);
+ }
+
+ /*
+ * Try and select tasks for each sibling in descending sched_class
+ * order.
+ */
+ for_each_class(class) {
+again:
+ for_each_cpu_wrap(i, smt_mask, cpu) {
+ struct rq *rq_i = cpu_rq(i);
+ struct task_struct *p;
+
+ if (rq_i->core_pick)
+ continue;
+
+ /*
+ * If this sibling doesn't yet have a suitable task to
+ * run, ask for the most eligible task, given the
+ * highest priority task already selected for this
+ * core.
+ */
+ p = pick_task(rq_i, class, max);
+ if (!p) {
+ /*
+ * If there weren't any cookies, we don't need
+ * to bother with the other siblings.
+ */
+ if (i == cpu && !rq->core->core_cookie)
+ goto next_class;
+
+ continue;
+ }
+
+ /*
+ * Optimize the 'normal' case where there aren't any
+ * cookies and we don't need to sync up.
+ */
+ if (i == cpu && !rq->core->core_cookie && !p->core_cookie) {
+ next = p;
+ goto done;
+ }
+
+ rq_i->core_pick = p;
+
+ /*
+ * If this new candidate is of higher priority than the
+ * previous; and they're incompatible; we need to wipe
+ * the slate and start over.
+ *
+ * NOTE: this is a linear max-filter and is thus bounded
+ * in execution time.
+ */
+ if (!max || core_prio_less(max, p)) {
+ struct task_struct *old_max = max;
+
+ rq->core->core_cookie = p->core_cookie;
+ max = p;
+
+ if (old_max && !cookie_match(old_max, p)) {
+ for_each_cpu(j, smt_mask) {
+ if (j == i)
+ continue;
+
+ cpu_rq(j)->core_pick = NULL;
+ }
+ goto again;
+ }
+ }
+ }
+next_class:;
+ }
+
+ rq->core->core_pick_seq = rq->core->core_task_seq;
+
+ /*
+ * Reschedule siblings
+ *
+ * NOTE: L1TF -- at this point we're no longer running the old task and
+ * sending an IPI (below) ensures the sibling will no longer be running
+ * their task. This ensures there is no inter-sibling overlap between
+ * non-matching user state.
+ */
+ for_each_cpu(i, smt_mask) {
+ struct rq *rq_i = cpu_rq(i);
+
+ WARN_ON_ONCE(!rq_i->core_pick);
+
+ if (i == cpu)
+ continue;
+
+ if (rq_i->curr != rq_i->core_pick)
+ resched_curr(rq_i);
+ }
+
+ rq->core_sched_seq = rq->core->core_pick_seq;
+ next = rq->core_pick;
+
+done:
+ set_next_task(rq, next);
+ return next;
+}
+
+#else /* !CONFIG_SCHED_CORE */
+
+static struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ return __pick_next_task(rq, prev, rf);
+}
+
+#endif /* CONFIG_SCHED_CORE */
+
/*
* __schedule() is the main scheduler function.
*
@@ -5866,7 +6080,7 @@ static void migrate_tasks(struct rq *dea
/*
* pick_next_task() assumes pinned rq->lock:
*/
- next = pick_next_task(rq, &fake_task, rf);
+ next = __pick_next_task(rq, &fake_task, rf);
BUG_ON(!next);
put_prev_task(rq, next);

@@ -6322,7 +6536,11 @@ void __init sched_init(void)

#ifdef CONFIG_SCHED_CORE
rq->core = NULL;
+ rq->core_pick = NULL;
rq->core_enabled = 0;
+ rq->core_tree = RB_ROOT;
+
+ rq->core_cookie = 0UL;
#endif
}

--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -960,11 +960,15 @@ struct rq {
#ifdef CONFIG_SCHED_CORE
/* per rq */
struct rq *core;
+ struct task_struct *core_pick;
unsigned int core_enabled;
+ unsigned int core_sched_seq;
struct rb_root core_tree;

/* shared state */
unsigned int core_task_seq;
+ unsigned int core_pick_seq;
+ unsigned long core_cookie;
#endif
};

@@ -1770,7 +1774,6 @@ static inline void put_prev_task(struct

static inline void set_next_task(struct rq *rq, struct task_struct *next)
{
- WARN_ON_ONCE(rq->curr != next);
next->sched_class->set_next_task(rq, next);
}





2019-04-02 09:20:54

by Peter Zijlstra

Subject: Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.

On Tue, Apr 02, 2019 at 02:46:13PM +0800, Aaron Lu wrote:
> On Mon, Feb 18, 2019 at 05:56:33PM +0100, Peter Zijlstra wrote:

> > +static struct task_struct *
> > +pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max)
> > +{
> > + struct task_struct *class_pick, *cookie_pick;
> > + unsigned long cookie = 0UL;
> > +
> > + /*
> > + * We must not rely on rq->core->core_cookie here, because we fail to reset
> > + * rq->core->core_cookie on new picks, such that we can detect if we need
> > + * to do single vs multi rq task selection.
> > + */
> > +
> > + if (max && max->core_cookie) {
> > + WARN_ON_ONCE(rq->core->core_cookie != max->core_cookie);
> > + cookie = max->core_cookie;
> > + }
> > +
> > + class_pick = class->pick_task(rq);
> > + if (!cookie)
> > + return class_pick;
> > +
> > + cookie_pick = sched_core_find(rq, cookie);
> > + if (!class_pick)
> > + return cookie_pick;
> > +
> > + /*
> > + * If class > max && class > cookie, it is the highest priority task on
> > + * the core (so far) and it must be selected, otherwise we must go with
> > + * the cookie pick in order to satisfy the constraint.
> > + */
> > + if (cpu_prio_less(cookie_pick, class_pick) && cpu_prio_less(max, class_pick))
> > + return class_pick;
>
> I have a question about the use of cpu_prio_less(max, class_pick) here
> and core_prio_less(max, p) below in pick_next_task().
>
> Assume cpu_prio_less(max, class_pick) thinks class_pick has higher
> priority and class_pick is returned here. Then in pick_next_task(),
> core_prio_less(max, p) is used to decide if max should be replaced.
> Since core_prio_less(max, p) doesn't compare vruntime, it could return
> false for this class_pick and the same max. Then max isn't replaced
> and we could end up scheduling two processes belonging to two different
> cgroups...

> > +
> > + return cookie_pick;
> > +}
> > +
> > +static struct task_struct *
> > +pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > +{

> > + /*
> > + * If this new candidate is of higher priority than the
> > + * previous; and they're incompatible; we need to wipe
> > + * the slate and start over.
> > + *
> > + * NOTE: this is a linear max-filter and is thus bounded
> > + * in execution time.
> > + */
> > + if (!max || core_prio_less(max, p)) {
>
> This is the place to decide if max should be replaced.

Hummm.... very good spotting that. Yes, I'm afraid you're very much
right about this.

> Perhaps we can test if max is on the same cpu as class_pick and then
> use cpu_prio_less() or core_prio_less() accordingly here, or just
> replace core_prio_less(max, p) with cpu_prio_less(max, p) in
> pick_next_task(). The 2nd obviously breaks the comment of
> core_prio_less() though: /* cannot compare vruntime across CPUs */.

Right, so as the comment states, you cannot directly compare vruntime
across CPUs, doing that is completely buggered.

That also means that the cpu_prio_less(max, class_pick) in pick_task()
is buggered, because there is no saying @max is on this CPU to begin
with.

Changing that to core_prio_less() doesn't fix this though.

> I'm still evaluating, your comments are appreciated.

We could change the above condition to:

if (!max || !cookie_match(max, p))

I suppose. But please double check the thinking.
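
In context, that would be roughly the following (untested):

-	if (!max || core_prio_less(max, p)) {
+	if (!max || !cookie_match(max, p)) {
 		struct task_struct *old_max = max;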



> > + struct task_struct *old_max = max;
> > +
> > + rq->core->core_cookie = p->core_cookie;
> > + max = p;
> > +
> > + if (old_max && !cookie_match(old_max, p)) {
> > + for_each_cpu(j, smt_mask) {
> > + if (j == i)
> > + continue;
> > +
> > + cpu_rq(j)->core_pick = NULL;
> > + }
> > + goto again;
> > + }
> > + }
> > + }
> > +next_class:;
> > + }


Another approach would be something like the below:


--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -87,7 +87,7 @@ static inline int __task_prio(struct tas
*/

/* real prio, less is less */
-static inline bool __prio_less(struct task_struct *a, struct task_struct *b, bool runtime)
+static inline bool __prio_less(struct task_struct *a, struct task_struct *b, u64 vruntime)
{
int pa = __task_prio(a), pb = __task_prio(b);

@@ -104,21 +104,25 @@ static inline bool __prio_less(struct ta
if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
return !dl_time_before(a->dl.deadline, b->dl.deadline);

- if (pa == MAX_RT_PRIO + MAX_NICE && runtime) /* fair */
- return !((s64)(a->se.vruntime - b->se.vruntime) < 0);
+ if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
+ return !((s64)(a->se.vruntime - vruntime) < 0);

return false;
}

static inline bool cpu_prio_less(struct task_struct *a, struct task_struct *b)
{
- return __prio_less(a, b, true);
+ return __prio_less(a, b, b->se.vruntime);
}

static inline bool core_prio_less(struct task_struct *a, struct task_struct *b)
{
- /* cannot compare vruntime across CPUs */
- return __prio_less(a, b, false);
+ u64 vruntime = b->se.vruntime;
+
+ vruntime -= task_rq(b)->cfs.min_vruntime;
+ vruntime += task_rq(a)->cfs.min_vruntime;
+
+ return __prio_less(a, b, vruntime);
}

static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)

2019-04-02 13:37:32

by Aaron Lu

Subject: Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.

On Tue, Apr 02, 2019 at 10:28:12AM +0200, Peter Zijlstra wrote:
> Another approach would be something like the below:
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -87,7 +87,7 @@ static inline int __task_prio(struct tas
> */
>
> /* real prio, less is less */
> -static inline bool __prio_less(struct task_struct *a, struct task_struct *b, bool runtime)
> +static inline bool __prio_less(struct task_struct *a, struct task_struct *b, u64 vruntime)
> {
> int pa = __task_prio(a), pb = __task_prio(b);
>
> @@ -104,21 +104,25 @@ static inline bool __prio_less(struct ta
> if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
> return !dl_time_before(a->dl.deadline, b->dl.deadline);
>
> - if (pa == MAX_RT_PRIO + MAX_NICE && runtime) /* fair */
> - return !((s64)(a->se.vruntime - b->se.vruntime) < 0);
> + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
> + return !((s64)(a->se.vruntime - vruntime) < 0);
>
> return false;
> }
>
> static inline bool cpu_prio_less(struct task_struct *a, struct task_struct *b)
> {
> - return __prio_less(a, b, true);
> + return __prio_less(a, b, b->se.vruntime);
> }
>
> static inline bool core_prio_less(struct task_struct *a, struct task_struct *b)
> {
> - /* cannot compare vruntime across CPUs */
> - return __prio_less(a, b, false);
> + u64 vruntime = b->se.vruntime;
> +
> + vruntime -= task_rq(b)->cfs.min_vruntime;
> + vruntime += task_rq(a)->cfs.min_vruntime
> +
> + return __prio_less(a, b, vruntime);
> }
>
> static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)

Brilliant, I like this approach; it makes core_prio_less() work across
CPUs. So I tested this, together with changing
cpu_prio_less(max, class_pick) to core_prio_less(max, class_pick) in
pick_task(), and the problem is gone :-)
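
The pick_task() change is just this (shown for clarity, not a formal patch):

-	if (cpu_prio_less(cookie_pick, class_pick) && cpu_prio_less(max, class_pick))
+	if (cpu_prio_less(cookie_pick, class_pick) && core_prio_less(max, class_pick))
 		return class_pick;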

I verified with the debug code below:

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cb24a0141e57..50658e79363f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3832,6 +3832,14 @@ next_class:;

WARN_ON_ONCE(!rq_i->core_pick);

+ if (rq->core->core_cookie && rq_i->core_pick->core_cookie &&
+ rq->core->core_cookie != rq_i->core_pick->core_cookie) {
+ trace_printk("expect 0x%lx, cpu%d got 0x%lx\n",
+ rq->core->core_cookie, i,
+ rq_i->core_pick->core_cookie);
+ WARN_ON_ONCE(1);
+ }
+
rq_i->core_pick->core_occupation = occ;

if (i == cpu)
--
2.19.1.3.ge56e4f7

2019-04-05 14:59:15

by Aaron Lu

Subject: Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.

On Tue, Apr 02, 2019 at 10:28:12AM +0200, Peter Zijlstra wrote:
> Another approach would be something like the below:
>
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -87,7 +87,7 @@ static inline int __task_prio(struct tas
> */
>
> /* real prio, less is less */
> -static inline bool __prio_less(struct task_struct *a, struct task_struct *b, bool runtime)
> +static inline bool __prio_less(struct task_struct *a, struct task_struct *b, u64 vruntime)
> {
> int pa = __task_prio(a), pb = __task_prio(b);
>
> @@ -104,21 +104,25 @@ static inline bool __prio_less(struct ta
> if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
> return !dl_time_before(a->dl.deadline, b->dl.deadline);
>
> - if (pa == MAX_RT_PRIO + MAX_NICE && runtime) /* fair */
> - return !((s64)(a->se.vruntime - b->se.vruntime) < 0);
> + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
> + return !((s64)(a->se.vruntime - vruntime) < 0);
~~~
I think <= should be used here, so that two tasks with the same vruntime
will return false. Otherwise we could bounce between two tasks having
different tags, with one set as max in the first round and the other set
as max in the next round, and the CPU would get stuck in __schedule()
with irqs disabled.
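
Something like this, in other words (sketch only):

	if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
		return !((s64)(a->se.vruntime - vruntime) <= 0);

so that when the two vruntimes are equal, neither task compares as 'less'
than the other and the max filter can settle.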

>
> return false;
> }
>
> static inline bool cpu_prio_less(struct task_struct *a, struct task_struct *b)
> {
> - return __prio_less(a, b, true);
> + return __prio_less(a, b, b->se.vruntime);
> }
>
> static inline bool core_prio_less(struct task_struct *a, struct task_struct *b)
> {
> - /* cannot compare vruntime across CPUs */
> - return __prio_less(a, b, false);
> + u64 vruntime = b->se.vruntime;
> +
> + vruntime -= task_rq(b)->cfs.min_vruntime;
> + vruntime += task_rq(a)->cfs.min_vruntime

After some testing, I figured task_cfs_rq() should be used instead of
task_rq() :-)
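
So the helper I'm testing looks roughly like this (Peter's sketch with
task_cfs_rq() substituted in; only lightly tested):

	static inline bool core_prio_less(struct task_struct *a, struct task_struct *b)
	{
		u64 vruntime = b->se.vruntime;

		/* normalise b against its own cfs_rq, then rebase onto a's */
		vruntime -= task_cfs_rq(b)->min_vruntime;
		vruntime += task_cfs_rq(a)->min_vruntime;

		return __prio_less(a, b, vruntime);
	}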

With the two changes (and some other minor ones that still need more time
to sort out), I'm now able to run 2 full-CPU kbuilds in 2 tagged
cgroups. Previously, the system would hang pretty soon after I started
a kbuild in any tagged cgroup (presumably, CPUs stuck in __schedule() with
irqs disabled).

And no warning appeared due to two tasks with different tags getting
scheduled on the same CPU.

Thanks,
Aaron

> +
> + return __prio_less(a, b, vruntime);
> }
>
> static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)

2019-04-09 18:10:41

by Tim Chen

Subject: Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.

On 4/5/19 7:55 AM, Aaron Lu wrote:
> On Tue, Apr 02, 2019 at 10:28:12AM +0200, Peter Zijlstra wrote:
>> Another approach would be something like the below:
>>
>>
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -87,7 +87,7 @@ static inline int __task_prio(struct tas
>> */
>>
>> /* real prio, less is less */
>> -static inline bool __prio_less(struct task_struct *a, struct task_struct *b, bool runtime)
>> +static inline bool __prio_less(struct task_struct *a, struct task_struct *b, u64 vruntime)
>> {
>> int pa = __task_prio(a), pb = __task_prio(b);
>>
>> @@ -104,21 +104,25 @@ static inline bool __prio_less(struct ta
>> if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
>> return !dl_time_before(a->dl.deadline, b->dl.deadline);
>>
>> - if (pa == MAX_RT_PRIO + MAX_NICE && runtime) /* fair */
>> - return !((s64)(a->se.vruntime - b->se.vruntime) < 0);
>> + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
>> + return !((s64)(a->se.vruntime - vruntime) < 0);
> ~~~
> I think <= should be used here, so that two tasks with the same vruntime
> will return false. Or we could bounce two tasks having different tags
> with one set to max in the first round and the other set to max in the
> next round. CPU would stuck in __schedule() with irq disabled.
>
>>
>> return false;
>> }
>>
>> static inline bool cpu_prio_less(struct task_struct *a, struct task_struct *b)
>> {
>> - return __prio_less(a, b, true);
>> + return __prio_less(a, b, b->se.vruntime);
>> }
>>
>> static inline bool core_prio_less(struct task_struct *a, struct task_struct *b)
>> {
>> - /* cannot compare vruntime across CPUs */
>> - return __prio_less(a, b, false);
>> + u64 vruntime = b->se.vruntime;
>> +
>> + vruntime -= task_rq(b)->cfs.min_vruntime;
>> + vruntime += task_rq(a)->cfs.min_vruntime
>
> After some testing, I figured task_cfs_rq() should be used instead of
> task_rq(:-)
>
> With the two changes(and some other minor ones that still need more time
> to sort out), I'm now able to start doing 2 full CPU kbuilds in 2 tagged
> cgroups. Previouslly, the system would hang pretty soon after I started
> kbuild in any tagged cgroup(presumbly, CPUs stucked in __schedule() with
> irqs disabled).
>
> And there is no warning appeared due to two tasks having different tags
> get scheduled on the same CPU.
>
> Thanks,
> Aaron
>

Peter,

Now that we have accumulated quite a number of different fixes to your original
posted patches, would you like to post a v2 of the core scheduler with the fixes?

Thanks.

Tim

2019-04-09 18:40:23

by Julien Desfossez

Subject: Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.

We found the source of the major performance regression we discussed
previously. It turns out there was a pattern where a task (a kworker in this
case) could be woken up, but the core could still end up idle before that
task had a chance to run.

Example sequence: cpu0 and cpu1 are siblings on the same core, task1 and
task2 are in the same cgroup with the tag enabled (each following line
happens in increasing order of time):
- task1 running on cpu0, task2 running on cpu1
- sched_waking(kworker/0, target_cpu=cpu0)
- task1 scheduled out of cpu0
- kworker/0 cannot run on cpu0 because task2 is still running on cpu1,
so cpu0 is idle
- task2 scheduled out of cpu1
- cpu1 doesn't select kworker/0 for cpu0, because the optimization path ends
the task selection if core_cookie is NULL for the currently selected process
and for cpu1's runqueue.
- cpu1 is idle
--> both siblings are idle but kworker/0 is still in the run queue of cpu0.
cpu0 may stay idle for longer if it goes into deep idle.

With the fix below, we ensure an IPI is sent to the sibling if it is idle
and has tasks waiting in its runqueue.
This fixes the performance issue we were seeing.

Now here is what we can measure with a disk write-intensive benchmark:
- no performance impact when enabling core scheduling without any tagged
task,
- 5% overhead if one tagged task is competing with an untagged task,
- 10% overhead if 2 tasks tagged with different tags are competing
against each other.

We are starting more scaling tests, but this is very encouraging!


diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e1fa10561279..02c862a5e973 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3779,7 +3779,22 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)

trace_printk("unconstrained pick: %s/%d %lx\n",
next->comm, next->pid, next->core_cookie);
+ rq->core_pick = NULL;

+ /*
+ * If the sibling is idling, we might want to wake it
+ * so that it can check for any runnable but blocked tasks
+ * due to previous task matching.
+ */
+ for_each_cpu(j, smt_mask) {
+ struct rq *rq_j = cpu_rq(j);
+ rq_j->core_pick = NULL;
+ if (j != cpu && is_idle_task(rq_j->curr) && rq_j->nr_running) {
+ resched_curr(rq_j);
+ trace_printk("IPI(%d->%d[%d]) idle preempt\n",
+ cpu, j, rq_j->nr_running);
+ }
+ }
goto done;
}

2019-04-10 05:30:00

by Aaron Lu

Subject: Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.

On Tue, Apr 09, 2019 at 11:09:45AM -0700, Tim Chen wrote:
> Now that we have accumulated quite a number of different fixes to your orginal
> posted patches. Would you like to post a v2 of the core scheduler with the fixes?

One more question I'm not sure about: should a task with cookie=0, i.e. a task
that is untagged, be allowed to be scheduled on the same core with
another, tagged task?

The current patch seems to disagree on this, e.g. in pick_task(),
if max is already chosen but max->core_cookie == 0, then we don't care
about the cookie and simply use class_pick for the other cpu. This means we
could schedule two tasks with different cookies (one is zero and the
other can be tagged).

But then sched_core_find() only allows the idle task to match with any tagged
task (we don't place untagged tasks in the core tree, of course :-).

Thoughts? Do I understand this correctly? If so, I think we probably
want to make this clear before v2. I personally feel we shouldn't allow
untagged tasks (like kernel threads) to match with tagged tasks.

2019-04-10 08:10:35

by Peter Zijlstra

Subject: Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.

On Tue, Apr 09, 2019 at 11:09:45AM -0700, Tim Chen wrote:
> Now that we have accumulated quite a number of different fixes to your orginal
> posted patches. Would you like to post a v2 of the core scheduler with the fixes?

Well, I was promised someone else was going to carry all this. Also,
while you're all having fun playing with this, I've not yet had answers
to the important questions of how L1TF-complete we want to be and whether all
this crud actually matters one way or the other.

Also, I still don't see this stuff working for high context switch rate
workloads, and that is exactly what some people were aiming for..


2019-04-10 14:45:59

by Peter Zijlstra

Subject: Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.

On Wed, Apr 10, 2019 at 12:36:33PM +0800, Aaron Lu wrote:
> On Tue, Apr 09, 2019 at 11:09:45AM -0700, Tim Chen wrote:
> > Now that we have accumulated quite a number of different fixes to your orginal
> > posted patches. Would you like to post a v2 of the core scheduler with the fixes?
>
> One more question I'm not sure: should a task with cookie=0, i.e. tasks
> that are untagged, be allowed to scheduled on the the same core with
> another tagged task?

That was not meant to be possible.

> The current patch seems to disagree on this, e.g. in pick_task(),
> if max is already chosen but max->core_cookie == 0, then we didn't care
> about cookie and simply use class_pick for the other cpu. This means we
> could schedule two tasks with different cookies(one is zero and the
> other can be tagged).

When core_cookie==0 we shouldn't schedule the other siblings at all.

> But then sched_core_find() only allow idle task to match with any tagged
> tasks(we didn't place untagged tasks to the core tree of course :-).
>
> Thoughts? Do I understand this correctly? If so, I think we probably
> want to make this clear before v2. I personally feel, we shouldn't allow
> untagged tasks(like kernel threads) to match with tagged tasks.

Agreed, cookie should always match or idle.

2019-04-10 15:03:01

by Peter Zijlstra

Subject: Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.

On Tue, Apr 09, 2019 at 02:38:55PM -0400, Julien Desfossez wrote:
> We found the source of the major performance regression we discussed
> previously. It turns out there was a pattern where a task (a kworker in this
> case) could be woken up, but the core could still end up idle before that
> task had a chance to run.
>
> Example sequence, cpu0 and cpu1 and siblings on the same core, task1 and
> task2 are in the same cgroup with the tag enabled (each following line
> happens in the increasing order of time):
> - task1 running on cpu0, task2 running on cpu1
> - sched_waking(kworker/0, target_cpu=cpu0)
> - task1 scheduled out of cpu0
> - kworker/0 cannot run on cpu0 because of task2 is still running on cpu1
> cpu0 is idle
> - task2 scheduled out of cpu1

But at this point core_cookie is still set; we don't clear it when the
last task goes away.

> - cpu1 doesn’t select kworker/0 for cpu0, because the optimization path ends
> the task selection if core_cookie is NULL for currently selected process
> and the cpu1’s runqueue.

But at this point core_cookie is still set; we only (re)set it later to
p->core_cookie.

What I suspect happens is that you hit the 'again' clause due to a
higher prio @max on the second sibling. And at that point we've
destroyed core_cookie.

> - cpu1 is idle
> --> both siblings are idle but kworker/0 is still in the run queue of cpu0.
> Cpu0 may stay idle for longer if it goes deep idle.
>
> With the fix below, we ensure to send an IPI to the sibling if it is idle
> and has tasks waiting in its runqueue.
> This fixes the performance issue we were seeing.
>
> Now here is what we can measure with a disk write-intensive benchmark:
> - no performance impact with enabling core scheduling without any tagged
> task,
> - 5% overhead if one tagged task is competing with an untagged task,
> - 10% overhead if 2 tasks tagged with a different tag are competing
> against each other.
>
> We are starting more scaling tests, but this is very encouraging !
>
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index e1fa10561279..02c862a5e973 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3779,7 +3779,22 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>
> trace_printk("unconstrained pick: %s/%d %lx\n",
> next->comm, next->pid, next->core_cookie);
> + rq->core_pick = NULL;
>
> + /*
> + * If the sibling is idling, we might want to wake it
> + * so that it can check for any runnable but blocked tasks
> + * due to previous task matching.
> + */
> + for_each_cpu(j, smt_mask) {
> + struct rq *rq_j = cpu_rq(j);
> + rq_j->core_pick = NULL;
> + if (j != cpu && is_idle_task(rq_j->curr) && rq_j->nr_running) {
> + resched_curr(rq_j);
> + trace_printk("IPI(%d->%d[%d]) idle preempt\n",
> + cpu, j, rq_j->nr_running);
> + }
> + }
> goto done;
> }

I'm thinking there is a more elegant solution hiding in there; possibly
saving/restoring that core_cookie across the again loop should do it, but I've
always had the nagging suspicion that whole selection loop could be done
better.

2019-04-10 17:39:52

by Aubrey Li

Subject: Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.

On Wed, Apr 10, 2019 at 12:36 PM Aaron Lu <[email protected]> wrote:
>
> On Tue, Apr 09, 2019 at 11:09:45AM -0700, Tim Chen wrote:
> > Now that we have accumulated quite a number of different fixes to your orginal
> > posted patches. Would you like to post a v2 of the core scheduler with the fixes?
>
> One more question I'm not sure: should a task with cookie=0, i.e. tasks
> that are untagged, be allowed to scheduled on the the same core with
> another tagged task?
>
> The current patch seems to disagree on this, e.g. in pick_task(),
> if max is already chosen but max->core_cookie == 0, then we didn't care
> about cookie and simply use class_pick for the other cpu. This means we
> could schedule two tasks with different cookies(one is zero and the
> other can be tagged).
>
> But then sched_core_find() only allow idle task to match with any tagged
> tasks(we didn't place untagged tasks to the core tree of course :-).
>
> Thoughts? Do I understand this correctly? If so, I think we probably
> want to make this clear before v2. I personally feel, we shouldn't allow
> untagged tasks(like kernel threads) to match with tagged tasks.

Does it make sense to treat untagged tasks as the hypervisor, and tasks with
different cookies as different VMs? Isolation is done between VMs, not between
a VM and the hypervisor.

Did you see anything harmful if an untagged task and a tagged task
run simultaneously on the same core?

Thanks,
-Aubrey

2019-04-10 19:59:37

by Vineeth Remanan Pillai

Subject: Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.

From: Vineeth Pillai <[email protected]>

> Well, I was promised someome else was going to carry all this, also

We are interested in this feature and have been actively testing, benchmarking
and working on fixes. If there is no v2 effort currently in progress, we are
willing to help consolidate all the changes discussed here and prepare a v2.
If there are any pending changes in the pipeline, please post your ideas so that
we can include them in v2.

We hope to post v2 with all the changes discussed here in a week's time, rebased on
the latest tip.

2019-04-11 00:18:30

by Subhra Mazumdar

Subject: Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.


On 4/9/19 11:38 AM, Julien Desfossez wrote:
> We found the source of the major performance regression we discussed
> previously. It turns out there was a pattern where a task (a kworker in this
> case) could be woken up, but the core could still end up idle before that
> task had a chance to run.
>
> Example sequence, cpu0 and cpu1 and siblings on the same core, task1 and
> task2 are in the same cgroup with the tag enabled (each following line
> happens in the increasing order of time):
> - task1 running on cpu0, task2 running on cpu1
> - sched_waking(kworker/0, target_cpu=cpu0)
> - task1 scheduled out of cpu0
> - kworker/0 cannot run on cpu0 because of task2 is still running on cpu1
> cpu0 is idle
> - task2 scheduled out of cpu1
> - cpu1 doesn’t select kworker/0 for cpu0, because the optimization path ends
> the task selection if core_cookie is NULL for currently selected process
> and the cpu1’s runqueue.
> - cpu1 is idle
> --> both siblings are idle but kworker/0 is still in the run queue of cpu0.
> Cpu0 may stay idle for longer if it goes deep idle.
>
> With the fix below, we ensure to send an IPI to the sibling if it is idle
> and has tasks waiting in its runqueue.
> This fixes the performance issue we were seeing.
>
> Now here is what we can measure with a disk write-intensive benchmark:
> - no performance impact with enabling core scheduling without any tagged
> task,
> - 5% overhead if one tagged task is competing with an untagged task,
> - 10% overhead if 2 tasks tagged with a different tag are competing
> against each other.
>
> We are starting more scaling tests, but this is very encouraging !
>
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index e1fa10561279..02c862a5e973 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3779,7 +3779,22 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>
> trace_printk("unconstrained pick: %s/%d %lx\n",
> next->comm, next->pid, next->core_cookie);
> + rq->core_pick = NULL;
>
> + /*
> + * If the sibling is idling, we might want to wake it
> + * so that it can check for any runnable but blocked tasks
> + * due to previous task matching.
> + */
> + for_each_cpu(j, smt_mask) {
> + struct rq *rq_j = cpu_rq(j);
> + rq_j->core_pick = NULL;
> + if (j != cpu && is_idle_task(rq_j->curr) && rq_j->nr_running) {
> + resched_curr(rq_j);
> + trace_printk("IPI(%d->%d[%d]) idle preempt\n",
> + cpu, j, rq_j->nr_running);
> + }
> + }
> goto done;
> }
>
I see a similar improvement with this patch as with removing the condition I
mentioned earlier, so that's not needed. I also included the patch for the
priority fix. For 2 DB instances, HT disabling stands at -22% for 32 users
(from earlier emails).


1 DB instance

users  baseline  %idle  core_sched  %idle
16     1         84     -4.9%       84
24     1         76     -6.7%       75
32     1         69     -2.4%       69

2 DB instance

users  baseline  %idle  core_sched  %idle
16     1         66     -19.5%      69
24     1         54     -9.8%       57
32     1         42     -27.2%      48

2019-04-11 02:13:05

by Aaron Lu

Subject: Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.

On Wed, Apr 10, 2019 at 10:18:10PM +0800, Aubrey Li wrote:
> On Wed, Apr 10, 2019 at 12:36 PM Aaron Lu <[email protected]> wrote:
> >
> > On Tue, Apr 09, 2019 at 11:09:45AM -0700, Tim Chen wrote:
> > > Now that we have accumulated quite a number of different fixes to your orginal
> > > posted patches. Would you like to post a v2 of the core scheduler with the fixes?
> >
> > One more question I'm not sure: should a task with cookie=0, i.e. tasks
> > that are untagged, be allowed to scheduled on the the same core with
> > another tagged task?
> >
> > The current patch seems to disagree on this, e.g. in pick_task(),
> > if max is already chosen but max->core_cookie == 0, then we didn't care
> > about cookie and simply use class_pick for the other cpu. This means we
> > could schedule two tasks with different cookies(one is zero and the
> > other can be tagged).
> >
> > But then sched_core_find() only allow idle task to match with any tagged
> > tasks(we didn't place untagged tasks to the core tree of course :-).
> >
> > Thoughts? Do I understand this correctly? If so, I think we probably
> > want to make this clear before v2. I personally feel, we shouldn't allow
> > untagged tasks(like kernel threads) to match with tagged tasks.
>
> Does it make sense if we take untagged tasks as hypervisor, and different
> cookie tasks as different VMs? Isolation is done between VMs, not between
> VM and hypervisor.
>
> Did you see anything harmful if an untagged task and a tagged task
> run simultaneously on the same core?

A VM could see the hypervisor's data then, I think.
We probably do not want that to happen.

2019-04-11 03:07:26

by Aaron Lu

Subject: Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.

On Wed, Apr 10, 2019 at 04:44:18PM +0200, Peter Zijlstra wrote:
> On Wed, Apr 10, 2019 at 12:36:33PM +0800, Aaron Lu wrote:
> > On Tue, Apr 09, 2019 at 11:09:45AM -0700, Tim Chen wrote:
> > > Now that we have accumulated quite a number of different fixes to your orginal
> > > posted patches. Would you like to post a v2 of the core scheduler with the fixes?
> >
> > One more question I'm not sure: should a task with cookie=0, i.e. tasks
> > that are untagged, be allowed to scheduled on the the same core with
> > another tagged task?
>
> That was not meant to be possible.

Good to know this.

> > The current patch seems to disagree on this, e.g. in pick_task(),
> > if max is already chosen but max->core_cookie == 0, then we didn't care
> > about cookie and simply use class_pick for the other cpu. This means we
> > could schedule two tasks with different cookies(one is zero and the
> > other can be tagged).
>
> When core_cookie==0 we shouldn't schedule the other siblings at all.

Not even with another untagged task?

I was thinking of leaving host-side tasks untagged, like kernel threads,
init and other system daemons or utilities etc., and tenant tasks tagged.
Then at least two untagged tasks could be scheduled on the same core.

Kindly let me know if you see a problem with this.

> > But then sched_core_find() only allow idle task to match with any tagged
> > tasks(we didn't place untagged tasks to the core tree of course :-).
> >
> > Thoughts? Do I understand this correctly? If so, I think we probably
> > want to make this clear before v2. I personally feel, we shouldn't allow
> > untagged tasks(like kernel threads) to match with tagged tasks.
>
> Agreed, cookie should always match or idle.

Thanks a lot for the clarification.

2019-04-11 09:21:02

by Peter Zijlstra

Subject: Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.

On Thu, Apr 11, 2019 at 11:05:41AM +0800, Aaron Lu wrote:
> On Wed, Apr 10, 2019 at 04:44:18PM +0200, Peter Zijlstra wrote:
> > When core_cookie==0 we shouldn't schedule the other siblings at all.
>
> Not even with another untagged task?
>
> I was thinking to leave host side tasks untagged, like kernel threads,
> init and other system daemons or utilities etc., and tenant tasks tagged.
> Then at least two untagged tasks can be scheduled on the same core.
>
> Kindly let me know if you see a problem with this.

Let me clarify: when rq->core->core_cookie == 0, each sibling should
schedule independently.

As Julien found, there were some issues here, but the intent was:

core_cookie 0, independent scheduling
core_cookie 0->n, core scheduling
core_cookie n->0, one last core schedule to kick possibly forced idle siblings
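
Or, restated as a purely hypothetical helper (no such function exists in the
posted patches; 'prev_cookie' would be the core cookie from before this pick):

	/*
	 * Hypothetical restatement of the intent above:
	 *   both zero           -> siblings pick independently
	 *   cookie now non-zero -> core-wide pick
	 *   cookie just dropped -> one last core-wide pick, to kick any
	 *                          forced-idle siblings
	 */
	static inline bool need_core_wide_pick(struct rq *rq, unsigned long prev_cookie)
	{
		return rq->core->core_cookie || prev_cookie;
	}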

2019-04-15 17:02:30

by Julien Desfossez

Subject: Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.

On 10-Apr-2019 10:06:30 AM, Peter Zijlstra wrote:
> while you're all having fun playing with this, I've not yet had answers
> to the important questions of how L1TF complete we want to be and if all
> this crud actually matters one way or the other.
>
> Also, I still don't see this stuff working for high context switch rate
> workloads, and that is exactly what some people were aiming for..

We have been running scaling tests on highly loaded systems (with all
the fixes and suggestions applied) and here are the results.

On a system with 2x6 cores (12 hardware threads per NUMA node), with one
12-vcpus-32gb VM per NUMA node running a CPU-intensive workload
(linpack):
- Baseline: 864 gflops
- Core scheduling: 864 gflops
- nosmt (switch to 6 hardware threads per node): 298 gflops (-65%)

In this test, the VMs are basically alone on their own NUMA node, so
they are only competing with themselves. For the next test we moved
the 2 VMs to the same node:
- Baseline: 340 gflops, about 586k context switches/sec
- Core scheduling: 322 gflops (-5%), about 575k context switches/sec
- nosmt: 146 gflops (-57%), about 284k context switches/sec

In terms of isolation, CPU-intensive VMs share their core with a
"foreign process" (not tagged or tagged with a different tag) less than
2% of the time (sum of the time spent with a lot of different
processes). For reference, this could add up to 60% without core
scheduling and smt on. We are working on identifying the various cases
where there is unwanted co-scheduling so we can address those.

With a more heterogeneous benchmark (a MySQL benchmark with a remote
client, one 12-vcpu MySQL VM on each NUMA node), we don't measure any
performance degradation when there are more hardware threads available
than vcpus (same with nosmt), but when we add noise VMs (sleep(15);
collect metrics; send them over a VPN; repeat) with an overcommit ratio
of 3 vcpus to 1 hardware thread, core scheduling can have up to 25%
performance degradation, whereas nosmt has a 15% impact.

So the performance impact varies depending on the type of workload, but
since the CPU-intensive workloads are the ones most impacted when we
disable SMT, this is very encouraging and is a worthwhile effort.

Thanks,

Julien

2019-04-16 13:45:04

by Aaron Lu

Subject: Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.

On Tue, Apr 02, 2019 at 10:28:12AM +0200, Peter Zijlstra wrote:
> On Tue, Apr 02, 2019 at 02:46:13PM +0800, Aaron Lu wrote:
...
> > Perhaps we can test if max is on the same cpu as class_pick and then
> > use cpu_prio_less() or core_prio_less() accordingly here, or just
> > replace core_prio_less(max, p) with cpu_prio_less(max, p) in
> > pick_next_task(). The 2nd obviously breaks the comment of
> > core_prio_less() though: /* cannot compare vruntime across CPUs */.
>
> Right, so as the comment states, you cannot directly compare vruntime
> across CPUs, doing that is completely buggered.
>
> That also means that the cpu_prio_less(max, class_pick) in pick_task()
> is buggered, because there is no saying @max is on this CPU to begin
> with.

I find it difficult to decide which fair_sched_class task has the
higher priority when the two tasks belong to different CPUs.

Please see below.

> Another approach would be something like the below:
>
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -87,7 +87,7 @@ static inline int __task_prio(struct tas
> */
>
> /* real prio, less is less */
> -static inline bool __prio_less(struct task_struct *a, struct task_struct *b, bool runtime)
> +static inline bool __prio_less(struct task_struct *a, struct task_struct *b, u64 vruntime)
> {
> int pa = __task_prio(a), pb = __task_prio(b);
>
> @@ -104,21 +104,25 @@ static inline bool __prio_less(struct ta
> if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
> return !dl_time_before(a->dl.deadline, b->dl.deadline);
>
> - if (pa == MAX_RT_PRIO + MAX_NICE && runtime) /* fair */
> - return !((s64)(a->se.vruntime - b->se.vruntime) < 0);
> + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
> + return !((s64)(a->se.vruntime - vruntime) < 0);
>
> return false;
> }
>
> static inline bool cpu_prio_less(struct task_struct *a, struct task_struct *b)
> {
> - return __prio_less(a, b, true);
> + return __prio_less(a, b, b->se.vruntime);
> }
>
> static inline bool core_prio_less(struct task_struct *a, struct task_struct *b)
> {
> - /* cannot compare vruntime across CPUs */
> - return __prio_less(a, b, false);
> + u64 vruntime = b->se.vruntime;
> +
> + vruntime -= task_rq(b)->cfs.min_vruntime;
> + vruntime += task_rq(a)->cfs.min_vruntime

(I used task_cfs_rq() instead of task_rq() above.)

Consider the following scenario
(assume cpu0 and cpu1 are siblings of core0):

1 a cpu-intensive task belonging to cgroupA is running on cpu0;
2 'ls' is launched from a shell (bash) which belongs to cgroupB;
3 'ls' is blocked for a long time (if not forever).

Per my limited understanding: the launch of 'ls' causes bash to fork,
and the newly forked process's vruntime will be 6ms (probably not
precise) ahead of its cfs_rq due to START_DEBIT. Since there is no other
running task on that cfs_rq, the cfs_rq's min_vruntime never gets a
chance to be updated, so the newly forked process will always be 6ms
ahead of its cfs_rq and will always 'lose' to the
cpu-intensive task belonging to cgroupA in core_prio_less().
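
To put rough numbers on it (purely illustrative, not taken from a trace):

	/* illustrative values only */
	u64 min_vr0 = 1000000ULL;		/* cpu0 cfs_rq min_vruntime */
	u64 min_vr1 = 2000000ULL;		/* cpu1 cfs_rq min_vruntime */
	u64 vr_a    = min_vr0;			/* cgroupA task, near its min_vruntime */
	u64 vr_ls   = min_vr1 + 6000000ULL;	/* 'ls': ~6ms ahead due to START_DEBIT */

	/* core_prio_less(a, ls) rebases 'ls' onto cpu0's cfs_rq: */
	u64 vruntime = vr_ls - min_vr1 + min_vr0;	/* == min_vr0 + 6ms */

	/*
	 * (s64)(vr_a - vruntime) == -6ms < 0, so the cgroupA task never compares
	 * as the lesser priority; and since nothing runs on cpu1, min_vr1 never
	 * catches up and 'ls' keeps losing.
	 */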

No idea how to solve this...

> +
> + return __prio_less(a, b, vruntime);
> }
>
> static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)

2019-04-19 20:34:57

by Ingo Molnar

Subject: Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.


* Subhra Mazumdar <[email protected]> wrote:

> I see similar improvement with this patch as removing the condition I
> earlier mentioned. So that's not needed. I also included the patch for the
> priority fix. For 2 DB instances, HT disabling stands at -22% for 32 users
> (from earlier emails).
>
>
> 1 DB instance
>
> users  baseline  %idle  core_sched  %idle
> 16     1         84     -4.9%       84
> 24     1         76     -6.7%       75
> 32     1         69     -2.4%       69
>
> 2 DB instance
>
> users  baseline  %idle  core_sched  %idle
> 16     1         66     -19.5%      69
> 24     1         54     -9.8%       57
> 32     1         42     -27.2%      48

So HT disabling slows down the 2DB instance by -22%, while core-sched
slows it down by -27.2%?

Would it be possible to see all the results in two larger tables (1 DB
instance and 2 DB instance) so that we can compare the performance of the
3 kernel variants with each other:

- "vanilla +HT": Hyperthreading enabled, vanilla scheduler
- "vanilla -HT": Hyperthreading disabled, vanilla scheduler
- "core_sched": Hyperthreading enabled, core-scheduling enabled

?

Thanks,

Ingo

2019-04-19 23:21:49

by Subhra Mazumdar

Subject: Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.


On 4/19/19 1:40 AM, Ingo Molnar wrote:
> * Subhra Mazumdar <[email protected]> wrote:
>
>> I see similar improvement with this patch as removing the condition I
>> earlier mentioned. So that's not needed. I also included the patch for the
>> priority fix. For 2 DB instances, HT disabling stands at -22% for 32 users
>> (from earlier emails).
>>
>>
>> 1 DB instance
>>
>> users  baseline   %idle    core_sched %idle
>> 16     1          84       -4.9% 84
>> 24     1          76       -6.7% 75
>> 32     1          69       -2.4% 69
>>
>> 2 DB instance
>>
>> users  baseline   %idle    core_sched %idle
>> 16     1          66       -19.5% 69
>> 24     1          54       -9.8% 57
>> 32     1          42       -27.2%        48
> So HT disabling slows down the 2DB instance by -22%, while core-sched
> slows it down by -27.2%?
>
> Would it be possible to see all the results in two larger tables (1 DB
> instance and 2 DB instance) so that we can compare the performance of the
> 3 kernel variants with each other:
>
> - "vanilla +HT": Hyperthreading enabled, vanilla scheduler
> - "vanilla -HT": Hyperthreading disabled, vanilla scheduler
> - "core_sched": Hyperthreading enabled, core-scheduling enabled
>
> ?
>
> Thanks,
>
> Ingo
Following are the numbers. Disabling HT gives an improvement in some cases.

1 DB instance

users  vanilla+HT  core_sched  vanilla-HT
16     1           -4.9%       -11.7%
24     1           -6.7%       +13.7%
32     1           -2.4%       +8%

2 DB instance

users  vanilla+HT  core_sched  vanilla-HT
16     1           -19.5%      +5.6%
24     1           -9.8%       +3.5%
32     1           -27.2%      -22.8%