Second iteration of the core-scheduling feature.
This version fixes the bugs and performance issues that became apparent
in v1. It does not yet fully address core sharing between processes
with different tags: core sharing still happens 1% to 5% of the time,
depending on the workload and the timing of the runnable processes.
Changes in v2
-------------
- rebased on mainline commit: 6d906f99817951e2257d577656899da02bb33105
- Fixes for a couple of NULL pointer dereference crashes
- Subhra Mazumdar
- Tim Chen
- Improves the priority comparison logic for processes on different CPUs
- Peter Zijlstra
- Aaron Lu
- Fixes a hard lockup in rq locking
- Vineeth Pillai
- Julien Desfossez
- Fixes a performance issue seen with IO-heavy workloads
- Vineeth Pillai
- Julien Desfossez
- Fix for the 32-bit build
- Aubrey Li
Issues
------
- Processes with different tags can still share the core
- A crash when disabling CPUs with core scheduling enabled
- https://paste.debian.net/plainh/fa6bcfa8
---
Peter Zijlstra (16):
stop_machine: Fix stop_cpus_in_progress ordering
sched: Fix kerneldoc comment for ia64_set_curr_task
sched: Wrap rq::lock access
sched/{rt,deadline}: Fix set_next_task vs pick_next_task
sched: Add task_struct pointer to sched_class::set_curr_task
sched/fair: Export newidle_balance()
sched: Allow put_prev_task() to drop rq->lock
sched: Rework pick_next_task() slow-path
sched: Introduce sched_class::pick_task()
sched: Core-wide rq->lock
sched: Basic tracking of matching tasks
sched: A quick and dirty cgroup tagging interface
sched: Add core wide task selection and scheduling.
sched/fair: Add a few assertions
sched: Trivial forced-newidle balancer
sched: Debug bits...
Vineeth Remanan Pillai (1):
sched: Wake up sibling if it has something to run
include/linux/sched.h | 9 +-
kernel/Kconfig.preempt | 7 +-
kernel/sched/core.c | 800 +++++++++++++++++++++++++++++++++++++--
kernel/sched/cpuacct.c | 12 +-
kernel/sched/deadline.c | 99 +++--
kernel/sched/debug.c | 4 +-
kernel/sched/fair.c | 137 +++++--
kernel/sched/idle.c | 42 +-
kernel/sched/pelt.h | 2 +-
kernel/sched/rt.c | 96 +++--
kernel/sched/sched.h | 185 ++++++---
kernel/sched/stop_task.c | 35 +-
kernel/sched/topology.c | 4 +-
kernel/stop_machine.c | 2 +
14 files changed, 1145 insertions(+), 289 deletions(-)
--
2.17.1
From: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/core.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4778c48a7fda..416ea613eda8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6287,7 +6287,7 @@ struct task_struct *curr_task(int cpu)
#ifdef CONFIG_IA64
/**
- * set_curr_task - set the current task for a given CPU.
+ * ia64_set_curr_task - set the current task for a given CPU.
* @cpu: the processor in question.
* @p: the task pointer to set.
*
--
2.17.1
From: Peter Zijlstra (Intel) <[email protected]>
Currently the pick_next_task() loop is convoluted and ugly because it
can drop rq->lock and then has to restart the picking.
For the RT/deadline classes the balancing is done in put_prev_task(),
and we could do that before the picking loop. Make this possible.
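For context, a later patch in this series ("sched: Rework
pick_next_task() slow-path") uses this to restructure the slow path
roughly as sketched below; this patch only adds the rq_flags argument
that makes the early put possible (a sketch, not the final code):

	prev->sched_class->put_prev_task(rq, prev, rf);	/* may pull RT/DL tasks */
	if (!rq->nr_running)
		newidle_balance(rq, rf);

	for_each_class(class) {
		p = class->pick_next_task(rq, NULL, NULL);
		if (p)
			return p;
	}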
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/core.c | 2 +-
kernel/sched/deadline.c | 14 +++++++++++++-
kernel/sched/fair.c | 2 +-
kernel/sched/idle.c | 2 +-
kernel/sched/rt.c | 14 +++++++++++++-
kernel/sched/sched.h | 4 ++--
kernel/sched/stop_task.c | 2 +-
7 files changed, 32 insertions(+), 8 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 32ea79fb8d29..9dfa0c53deb3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5595,7 +5595,7 @@ static void calc_load_migrate(struct rq *rq)
atomic_long_add(delta, &calc_load_tasks);
}
-static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+static void put_prev_task_fake(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fadfbfe7d573..56791c0318a2 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1773,13 +1773,25 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
return p;
}
-static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
+static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
{
update_curr_dl(rq);
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
+
+ if (rf && !on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we've
+ * not yet started the picking loop.
+ */
+ rq_unpin_lock(rq, rf);
+ pull_dl_task(rq);
+ rq_repin_lock(rq, rf);
+ }
}
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f7e631e692a3..41ec5e68e1c5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7081,7 +7081,7 @@ done: __maybe_unused;
/*
* Account for a descheduled task:
*/
-static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
+static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct sched_entity *se = &prev->se;
struct cfs_rq *cfs_rq;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index dd64be34881d..1b65a4c3683e 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -373,7 +373,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
resched_curr(rq);
}
-static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
+static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index adec98a94f2b..51ee87c5a28a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1593,7 +1593,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
return p;
}
-static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
+static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
{
update_curr_rt(rq);
@@ -1605,6 +1605,18 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
*/
if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
+
+ if (rf && !on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we've
+ * not yet started the picking loop.
+ */
+ rq_unpin_lock(rq, rf);
+ pull_rt_task(rq);
+ rq_repin_lock(rq, rf);
+ }
}
#ifdef CONFIG_SMP
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bfcbcbb25646..4cbe2bef92e4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1675,7 +1675,7 @@ struct sched_class {
struct task_struct * (*pick_next_task)(struct rq *rq,
struct task_struct *prev,
struct rq_flags *rf);
- void (*put_prev_task)(struct rq *rq, struct task_struct *p);
+ void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct rq_flags *rf);
void (*set_next_task)(struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
@@ -1721,7 +1721,7 @@ struct sched_class {
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
WARN_ON_ONCE(rq->curr != prev);
- prev->sched_class->put_prev_task(rq, prev);
+ prev->sched_class->put_prev_task(rq, prev, NULL);
}
static inline void set_next_task(struct rq *rq, struct task_struct *next)
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 47a3d2a18a9a..8f414018d5e0 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -59,7 +59,7 @@ static void yield_task_stop(struct rq *rq)
BUG(); /* the stop task should never yield, its pointless. */
}
-static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
+static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct task_struct *curr = rq->curr;
u64 delta_exec;
--
2.17.1
From: Peter Zijlstra (Intel) <[email protected]>
Marks all tasks in a cgroup as matching for core-scheduling.
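The knob ends up as a per-cgroup "cpu.tag" file (not available on the
root group) accepting 0 or 1; larger values are rejected with -ERANGE.
A minimal userspace sketch of turning it on, assuming the cpu
controller is mounted at /sys/fs/cgroup/cpu and a group "grp" already
exists (the mount point and group name are assumptions; only the file
name comes from this patch):

	#include <fcntl.h>
	#include <unistd.h>

	/* Tag every task in the (hypothetical) pre-existing group "grp". */
	int main(void)
	{
		int fd = open("/sys/fs/cgroup/cpu/grp/cpu.tag", O_WRONLY);

		if (fd < 0)
			return 1;

		/* Only "0" and "1" are accepted; anything larger gets -ERANGE. */
		if (write(fd, "1", 1) != 1) {
			close(fd);
			return 1;
		}

		return close(fd) ? 1 : 0;
	}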
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/core.c | 62 ++++++++++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 4 +++
2 files changed, 66 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5066a1493acf..e5bdc1c4d8d7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6658,6 +6658,15 @@ static void sched_change_group(struct task_struct *tsk, int type)
tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
struct task_group, css);
tg = autogroup_task_group(tsk, tg);
+
+#ifdef CONFIG_SCHED_CORE
+ if ((unsigned long)tsk->sched_task_group == tsk->core_cookie)
+ tsk->core_cookie = 0UL;
+
+ if (tg->tagged /* && !tsk->core_cookie ? */)
+ tsk->core_cookie = (unsigned long)tg;
+#endif
+
tsk->sched_task_group = tg;
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -7117,6 +7126,43 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
}
#endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_SCHED_CORE
+static u64 cpu_core_tag_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct task_group *tg = css_tg(css);
+
+ return !!tg->tagged;
+}
+
+static int cpu_core_tag_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, u64 val)
+{
+ struct task_group *tg = css_tg(css);
+ struct css_task_iter it;
+ struct task_struct *p;
+
+ if (val > 1)
+ return -ERANGE;
+
+ if (tg->tagged == !!val)
+ return 0;
+
+ tg->tagged = !!val;
+
+ if (!!val)
+ sched_core_get();
+
+ css_task_iter_start(css, 0, &it);
+ while ((p = css_task_iter_next(&it)))
+ p->core_cookie = !!val ? (unsigned long)tg : 0UL;
+ css_task_iter_end(&it);
+
+ if (!val)
+ sched_core_put();
+
+ return 0;
+}
+#endif
+
static struct cftype cpu_legacy_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
{
@@ -7152,6 +7198,14 @@ static struct cftype cpu_legacy_files[] = {
.read_u64 = cpu_rt_period_read_uint,
.write_u64 = cpu_rt_period_write_uint,
},
+#endif
+#ifdef CONFIG_SCHED_CORE
+ {
+ .name = "tag",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = cpu_core_tag_read_u64,
+ .write_u64 = cpu_core_tag_write_u64,
+ },
#endif
{ } /* Terminate */
};
@@ -7319,6 +7373,14 @@ static struct cftype cpu_files[] = {
.seq_show = cpu_max_show,
.write = cpu_max_write,
},
+#endif
+#ifdef CONFIG_SCHED_CORE
+ {
+ .name = "tag",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = cpu_core_tag_read_u64,
+ .write_u64 = cpu_core_tag_write_u64,
+ },
#endif
{ } /* terminate */
};
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 42dd620797d7..16fb236eab7b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -363,6 +363,10 @@ struct cfs_bandwidth {
struct task_group {
struct cgroup_subsys_state css;
+#ifdef CONFIG_SCHED_CORE
+ int tagged;
+#endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
/* schedulable entities of this group on each CPU */
struct sched_entity **se;
--
2.17.1
From: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/fair.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 45d86b862750..08812fe7e1d3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6209,6 +6209,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
struct sched_domain *sd;
int i, recent_used_cpu;
+ /*
+ * per-cpu select_idle_mask usage
+ */
+ lockdep_assert_irqs_disabled();
+
if (available_idle_cpu(target))
return target;
@@ -6636,8 +6641,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
*
* Returns the target CPU number.
- *
- * preempt must be disabled.
*/
static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
@@ -6648,6 +6651,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
int want_affine = 0;
int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
+ /*
+ * required for stable ->cpus_allowed
+ */
+ lockdep_assert_held(&p->pi_lock);
+
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
--
2.17.1
From: Peter Zijlstra (Intel) <[email protected]>
Not-Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/core.c | 38 +++++++++++++++++++++++++++++++++++++-
1 file changed, 37 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0e3c51a1b54a..e8e5f26db052 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -106,6 +106,10 @@ static inline bool __prio_less(struct task_struct *a, struct task_struct *b, boo
int pa = __task_prio(a), pb = __task_prio(b);
+ trace_printk("(%s/%d;%d,%Lu,%Lu) ?< (%s/%d;%d,%Lu,%Lu)\n",
+ a->comm, a->pid, pa, a->se.vruntime, a->dl.deadline,
+ b->comm, b->pid, pb, b->se.vruntime, b->dl.deadline);
+
if (-pa < -pb)
return true;
@@ -264,6 +268,8 @@ static void __sched_core_enable(void)
static_branch_enable(&__sched_core_enabled);
stop_machine(__sched_core_stopper, (void *)true, NULL);
+
+ printk("core sched enabled\n");
}
static void __sched_core_disable(void)
@@ -272,6 +278,8 @@ static void __sched_core_disable(void)
stop_machine(__sched_core_stopper, (void *)false, NULL);
static_branch_disable(&__sched_core_enabled);
+
+ printk("core sched disabled\n");
}
void sched_core_get(void)
@@ -3706,6 +3714,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
put_prev_task(rq, prev);
set_next_task(rq, next);
}
+
+ trace_printk("pick pre selected (%u %u %u): %s/%d %lx\n",
+ rq->core->core_task_seq,
+ rq->core->core_pick_seq,
+ rq->core_sched_seq,
+ next->comm, next->pid,
+ next->core_cookie);
+
return next;
}
@@ -3777,6 +3793,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
next = p;
rq->core_pick = NULL;
+ trace_printk("unconstrained pick: %s/%d %lx\n",
+ next->comm, next->pid, next->core_cookie);
+
/*
* If the sibling is idling, we might want to wake it
* so that it can check for any runnable tasks that did
@@ -3787,6 +3806,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
rq_j->core_pick = NULL;
if (j != cpu &&
is_idle_task(rq_j->curr) && rq_j->nr_running) {
+ trace_printk("IPI(%d->%d[%d]) idle preempt\n",
+ cpu, j, rq_j->nr_running);
resched_curr(rq_j);
}
}
@@ -3798,6 +3819,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
rq_i->core_pick = p;
+ trace_printk("cpu(%d): selected: %s/%d %lx\n",
+ i, p->comm, p->pid, p->core_cookie);
+
/*
* If this new candidate is of higher priority than the
* previous; and they're incompatible; we need to wipe
@@ -3812,6 +3836,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
rq->core->core_cookie = p->core_cookie;
max = p;
+ trace_printk("max: %s/%d %lx\n", max->comm, max->pid, max->core_cookie);
+
if (old_max && !cookie_match(old_max, p)) {
for_each_cpu(j, smt_mask) {
if (j == i)
@@ -3847,13 +3873,17 @@ next_class:;
if (i == cpu)
continue;
- if (rq_i->curr != rq_i->core_pick)
+ if (rq_i->curr != rq_i->core_pick) {
+ trace_printk("IPI(%d)\n", i);
resched_curr(rq_i);
+ }
}
rq->core_sched_seq = rq->core->core_pick_seq;
next = rq->core_pick;
+ trace_printk("picked: %s/%d %lx\n", next->comm, next->pid, next->core_cookie);
+
done:
set_next_task(rq, next);
return next;
@@ -3890,6 +3920,10 @@ static bool try_steal_cookie(int this, int that)
if (p->core_occupation > dst->idle->core_occupation)
goto next;
+ trace_printk("core fill: %s/%d (%d->%d) %d %d %lx\n",
+ p->comm, p->pid, that, this,
+ p->core_occupation, dst->idle->core_occupation, cookie);
+
p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src, p, 0);
set_task_cpu(p, this);
@@ -6471,6 +6505,8 @@ int sched_cpu_starting(unsigned int cpu)
WARN_ON_ONCE(rq->core && rq->core != core_rq);
rq->core = core_rq;
}
+
+ printk("core: %d -> %d\n", cpu, cpu_of(core_rq));
#endif /* CONFIG_SCHED_CORE */
sched_rq_cpu_starting(cpu);
--
2.17.1
From: Peter Zijlstra (Intel) <[email protected]>
When a sibling is forced idle to match the core-cookie, search for
matching tasks to fill the core.
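For orientation, the call chain this patch adds, condensed (a summary
of the code below, nothing beyond it):

	set_next_task_idle()
	  -> queue_core_balance()	/* only when the core has a cookie and
					   this rq still has runnable tasks,
					   i.e. it was forced idle */
	    -> sched_core_balance()	/* balance callback; walks the sched
					   domains with the rq lock dropped */
	      -> steal_cookie_task()	/* tries each other CPU in the domain */
	        -> try_steal_cookie()	/* migrates one matching-cookie task */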
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
include/linux/sched.h | 1 +
kernel/sched/core.c | 131 +++++++++++++++++++++++++++++++++++++++++-
kernel/sched/idle.c | 1 +
kernel/sched/sched.h | 6 ++
4 files changed, 138 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a4b39a28236f..1a309e8546cd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -641,6 +641,7 @@ struct task_struct {
#ifdef CONFIG_SCHED_CORE
struct rb_node core_node;
unsigned long core_cookie;
+ unsigned int core_occupation;
#endif
#ifdef CONFIG_CGROUP_SCHED
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9e6e90c6f9b9..e8f5ec641d0a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -217,6 +217,21 @@ struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
return match;
}
+struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
+{
+ struct rb_node *node = &p->core_node;
+
+ node = rb_next(node);
+ if (!node)
+ return NULL;
+
+ p = container_of(node, struct task_struct, core_node);
+ if (p->core_cookie != cookie)
+ return NULL;
+
+ return p;
+}
+
/*
* The static-key + stop-machine variable are needed such that:
*
@@ -3672,7 +3687,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *next, *max = NULL;
const struct sched_class *class;
const struct cpumask *smt_mask;
- int i, j, cpu;
+ int i, j, cpu, occ = 0;
if (!sched_core_enabled(rq))
return __pick_next_task(rq, prev, rf);
@@ -3763,6 +3778,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
goto done;
}
+ if (!is_idle_task(p))
+ occ++;
+
rq_i->core_pick = p;
/*
@@ -3786,6 +3804,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
cpu_rq(j)->core_pick = NULL;
}
+ occ = 1;
goto again;
}
}
@@ -3808,6 +3827,8 @@ next_class:;
WARN_ON_ONCE(!rq_i->core_pick);
+ rq_i->core_pick->core_occupation = occ;
+
if (i == cpu)
continue;
@@ -3823,6 +3844,114 @@ next_class:;
return next;
}
+static bool try_steal_cookie(int this, int that)
+{
+ struct rq *dst = cpu_rq(this), *src = cpu_rq(that);
+ struct task_struct *p;
+ unsigned long cookie;
+ bool success = false;
+
+ local_irq_disable();
+ double_rq_lock(dst, src);
+
+ cookie = dst->core->core_cookie;
+ if (!cookie)
+ goto unlock;
+
+ if (dst->curr != dst->idle)
+ goto unlock;
+
+ p = sched_core_find(src, cookie);
+ if (p == src->idle)
+ goto unlock;
+
+ do {
+ if (p == src->core_pick || p == src->curr)
+ goto next;
+
+ if (!cpumask_test_cpu(this, &p->cpus_allowed))
+ goto next;
+
+ if (p->core_occupation > dst->idle->core_occupation)
+ goto next;
+
+ p->on_rq = TASK_ON_RQ_MIGRATING;
+ deactivate_task(src, p, 0);
+ set_task_cpu(p, this);
+ activate_task(dst, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+
+ resched_curr(dst);
+
+ success = true;
+ break;
+
+next:
+ p = sched_core_next(p, cookie);
+ } while (p);
+
+unlock:
+ double_rq_unlock(dst, src);
+ local_irq_enable();
+
+ return success;
+}
+
+static bool steal_cookie_task(int cpu, struct sched_domain *sd)
+{
+ int i;
+
+ for_each_cpu_wrap(i, sched_domain_span(sd), cpu) {
+ if (i == cpu)
+ continue;
+
+ if (need_resched())
+ break;
+
+ if (try_steal_cookie(cpu, i))
+ return true;
+ }
+
+ return false;
+}
+
+static void sched_core_balance(struct rq *rq)
+{
+ struct sched_domain *sd;
+ int cpu = cpu_of(rq);
+
+ rcu_read_lock();
+ raw_spin_unlock_irq(rq_lockp(rq));
+ for_each_domain(cpu, sd) {
+ if (!(sd->flags & SD_LOAD_BALANCE))
+ break;
+
+ if (need_resched())
+ break;
+
+ if (steal_cookie_task(cpu, sd))
+ break;
+ }
+ raw_spin_lock_irq(rq_lockp(rq));
+ rcu_read_unlock();
+}
+
+static DEFINE_PER_CPU(struct callback_head, core_balance_head);
+
+void queue_core_balance(struct rq *rq)
+{
+ if (!sched_core_enabled(rq))
+ return;
+
+ if (!rq->core->core_cookie)
+ return;
+
+ if (!rq->nr_running) /* not forced idle */
+ return;
+
+ queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
+}
+
#else /* !CONFIG_SCHED_CORE */
static struct task_struct *
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index e7f38da60373..44decdcccba1 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -387,6 +387,7 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next)
{
update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
+ queue_core_balance(rq);
}
static struct task_struct *
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4cfde289610d..2a5f5a6b11ae 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1013,6 +1013,8 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq)
return &rq->__lock;
}
+extern void queue_core_balance(struct rq *rq);
+
#else /* !CONFIG_SCHED_CORE */
static inline bool sched_core_enabled(struct rq *rq)
@@ -1025,6 +1027,10 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq)
return &rq->__lock;
}
+static inline void queue_core_balance(struct rq *rq)
+{
+}
+
#endif /* CONFIG_SCHED_CORE */
#ifdef CONFIG_SCHED_SMT
--
2.17.1
During core scheduling, it can happen that the current rq selects a
non-tagged process while the sibling is idling even though it has
something to run (because the sibling selected idle to match the
tagged process in a previous tag-matching iteration). We need to wake
up the sibling if such a situation arises.
Signed-off-by: Vineeth Remanan Pillai <[email protected]>
Signed-off-by: Julien Desfossez <[email protected]>
---
kernel/sched/core.c | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e8f5ec641d0a..0e3c51a1b54a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3775,6 +3775,21 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
*/
if (i == cpu && !rq->core->core_cookie && !p->core_cookie) {
next = p;
+ rq->core_pick = NULL;
+
+ /*
+ * If the sibling is idling, we might want to wake it
+ * so that it can check for any runnable tasks that did
+ * not get a chance to run due to previous task matching.
+ */
+ for_each_cpu(j, smt_mask) {
+ struct rq *rq_j = cpu_rq(j);
+ rq_j->core_pick = NULL;
+ if (j != cpu &&
+ is_idle_task(rq_j->curr) && rq_j->nr_running) {
+ resched_curr(rq_j);
+ }
+ }
goto done;
}
--
2.17.1
From: Peter Zijlstra (Intel) <[email protected]>
Make sure the entire for loop has stop_cpus_in_progress set.
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/stop_machine.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 067cb83f37ea..583119e0c51c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -375,6 +375,7 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
*/
preempt_disable();
stop_cpus_in_progress = true;
+ barrier();
for_each_cpu(cpu, cpumask) {
work = &per_cpu(cpu_stopper.stop_work, cpu);
work->fn = fn;
@@ -383,6 +384,7 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
if (cpu_stop_queue_work(cpu, work))
queued = true;
}
+ barrier();
stop_cpus_in_progress = false;
preempt_enable();
--
2.17.1
From: Peter Zijlstra (Intel) <[email protected]>
Instead of only selecting a local task, select a task for all SMT
siblings for every reschedule on the core (irrespective of which
logical CPU does the reschedule).
NOTE: there is still potential for sibling rivalry.
NOTE: this is far too complicated; but thus far I've failed to
simplify it further.
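As a reading aid for the (admittedly large) selection loop below, a
condensed outline of what it does per core-wide pick (a summary of the
code in this patch, not the code itself):

	for each class, in descending priority order:
	    for each SMT sibling i of this core (starting with this CPU):
	        p = pick_task(rq_i, class, max)	/* constrained by max's cookie */
	        rq_i->core_pick = p
	        if p is higher priority than the current max:
	            max = p
	            if the old max's cookie does not match p:
	                clear the other siblings' picks and redo this class
	/* then bump core_pick_seq and IPI every sibling whose core_pick
	   differs from its current task so it reschedules into its pick */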
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/core.c | 222 ++++++++++++++++++++++++++++++++++++++++++-
kernel/sched/sched.h | 5 +-
2 files changed, 224 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e5bdc1c4d8d7..9e6e90c6f9b9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3574,7 +3574,7 @@ static inline void schedule_debug(struct task_struct *prev)
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
const struct sched_class *class;
struct task_struct *p;
@@ -3619,6 +3619,220 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
BUG();
}
+#ifdef CONFIG_SCHED_CORE
+
+static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
+{
+ if (is_idle_task(a) || is_idle_task(b))
+ return true;
+
+ return a->core_cookie == b->core_cookie;
+}
+
+// XXX fairness/fwd progress conditions
+static struct task_struct *
+pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max)
+{
+ struct task_struct *class_pick, *cookie_pick;
+ unsigned long cookie = 0UL;
+
+ /*
+ * We must not rely on rq->core->core_cookie here, because we fail to reset
+ * rq->core->core_cookie on new picks, such that we can detect if we need
+ * to do single vs multi rq task selection.
+ */
+
+ if (max && max->core_cookie) {
+ WARN_ON_ONCE(rq->core->core_cookie != max->core_cookie);
+ cookie = max->core_cookie;
+ }
+
+ class_pick = class->pick_task(rq);
+ if (!cookie)
+ return class_pick;
+
+ cookie_pick = sched_core_find(rq, cookie);
+ if (!class_pick)
+ return cookie_pick;
+
+ /*
+ * If class > max && class > cookie, it is the highest priority task on
+ * the core (so far) and it must be selected, otherwise we must go with
+ * the cookie pick in order to satisfy the constraint.
+ */
+ if (cpu_prio_less(cookie_pick, class_pick) && core_prio_less(max, class_pick))
+ return class_pick;
+
+ return cookie_pick;
+}
+
+static struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ struct task_struct *next, *max = NULL;
+ const struct sched_class *class;
+ const struct cpumask *smt_mask;
+ int i, j, cpu;
+
+ if (!sched_core_enabled(rq))
+ return __pick_next_task(rq, prev, rf);
+
+ /*
+ * If there were no {en,de}queues since we picked (IOW, the task
+ * pointers are all still valid), and we haven't scheduled the last
+ * pick yet, do so now.
+ */
+ if (rq->core->core_pick_seq == rq->core->core_task_seq &&
+ rq->core->core_pick_seq != rq->core_sched_seq) {
+ WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
+
+ next = rq->core_pick;
+ if (next != prev) {
+ put_prev_task(rq, prev);
+ set_next_task(rq, next);
+ }
+ return next;
+ }
+
+ prev->sched_class->put_prev_task(rq, prev, rf);
+ if (!rq->nr_running)
+ newidle_balance(rq, rf);
+
+ cpu = cpu_of(rq);
+ smt_mask = cpu_smt_mask(cpu);
+
+ /*
+ * core->core_task_seq, core->core_pick_seq, rq->core_sched_seq
+ *
+ * @task_seq guards the task state ({en,de}queues)
+ * @pick_seq is the @task_seq we did a selection on
+ * @sched_seq is the @pick_seq we scheduled
+ *
+ * However, preemptions can cause multiple picks on the same task set.
+ * 'Fix' this by also increasing @task_seq for every pick.
+ */
+ rq->core->core_task_seq++;
+
+ /* reset state */
+ for_each_cpu(i, smt_mask) {
+ struct rq *rq_i = cpu_rq(i);
+
+ rq_i->core_pick = NULL;
+
+ if (i != cpu)
+ update_rq_clock(rq_i);
+ }
+
+ /*
+ * Try and select tasks for each sibling in descending sched_class
+ * order.
+ */
+ for_each_class(class) {
+again:
+ for_each_cpu_wrap(i, smt_mask, cpu) {
+ struct rq *rq_i = cpu_rq(i);
+ struct task_struct *p;
+
+ if (rq_i->core_pick)
+ continue;
+
+ /*
+ * If this sibling doesn't yet have a suitable task to
+ * run, ask for the most eligible task, given the
+ * highest priority task already selected for this
+ * core.
+ */
+ p = pick_task(rq_i, class, max);
+ if (!p) {
+ /*
+ * If there weren't any cookies, we don't need
+ * to bother with the other siblings.
+ */
+ if (i == cpu && !rq->core->core_cookie)
+ goto next_class;
+
+ continue;
+ }
+
+ /*
+ * Optimize the 'normal' case where there aren't any
+ * cookies and we don't need to sync up.
+ */
+ if (i == cpu && !rq->core->core_cookie && !p->core_cookie) {
+ next = p;
+ goto done;
+ }
+
+ rq_i->core_pick = p;
+
+ /*
+ * If this new candidate is of higher priority than the
+ * previous; and they're incompatible; we need to wipe
+ * the slate and start over.
+ *
+ * NOTE: this is a linear max-filter and is thus bounded
+ * in execution time.
+ */
+ if (!max || core_prio_less(max, p)) {
+ struct task_struct *old_max = max;
+
+ rq->core->core_cookie = p->core_cookie;
+ max = p;
+
+ if (old_max && !cookie_match(old_max, p)) {
+ for_each_cpu(j, smt_mask) {
+ if (j == i)
+ continue;
+
+ cpu_rq(j)->core_pick = NULL;
+ }
+ goto again;
+ }
+ }
+ }
+next_class:;
+ }
+
+ rq->core->core_pick_seq = rq->core->core_task_seq;
+
+ /*
+ * Reschedule siblings
+ *
+ * NOTE: L1TF -- at this point we're no longer running the old task and
+ * sending an IPI (below) ensures the sibling will no longer be running
+ * their task. This ensures there is no inter-sibling overlap between
+ * non-matching user state.
+ */
+ for_each_cpu(i, smt_mask) {
+ struct rq *rq_i = cpu_rq(i);
+
+ WARN_ON_ONCE(!rq_i->core_pick);
+
+ if (i == cpu)
+ continue;
+
+ if (rq_i->curr != rq_i->core_pick)
+ resched_curr(rq_i);
+ }
+
+ rq->core_sched_seq = rq->core->core_pick_seq;
+ next = rq->core_pick;
+
+done:
+ set_next_task(rq, next);
+ return next;
+}
+
+#else /* !CONFIG_SCHED_CORE */
+
+static struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ return __pick_next_task(rq, prev, rf);
+}
+
+#endif /* CONFIG_SCHED_CORE */
+
/*
* __schedule() is the main scheduler function.
*
@@ -5888,7 +6102,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
/*
* pick_next_task() assumes pinned rq->lock:
*/
- next = pick_next_task(rq, &fake_task, rf);
+ next = __pick_next_task(rq, &fake_task, rf);
BUG_ON(!next);
put_prev_task(rq, next);
@@ -6344,7 +6558,11 @@ void __init sched_init(void)
#ifdef CONFIG_SCHED_CORE
rq->core = NULL;
+ rq->core_pick = NULL;
rq->core_enabled = 0;
+ rq->core_tree = RB_ROOT;
+
+ rq->core_cookie = 0UL;
#endif
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 16fb236eab7b..4cfde289610d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -960,11 +960,15 @@ struct rq {
#ifdef CONFIG_SCHED_CORE
/* per rq */
struct rq *core;
+ struct task_struct *core_pick;
unsigned int core_enabled;
+ unsigned int core_sched_seq;
struct rb_root core_tree;
/* shared state */
unsigned int core_task_seq;
+ unsigned int core_pick_seq;
+ unsigned long core_cookie;
#endif
};
@@ -1770,7 +1774,6 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
static inline void set_next_task(struct rq *rq, struct task_struct *next)
{
- WARN_ON_ONCE(rq->curr != next);
next->sched_class->set_next_task(rq, next);
}
--
2.17.1
From: Peter Zijlstra (Intel) <[email protected]>
Introduce the basic infrastructure to have a core-wide rq->lock.
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/Kconfig.preempt | 7 +++-
kernel/sched/core.c | 91 ++++++++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 31 ++++++++++++++
3 files changed, 128 insertions(+), 1 deletion(-)
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 0fee5fe6c899..02fe0bf26676 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -57,4 +57,9 @@ config PREEMPT
endchoice
config PREEMPT_COUNT
- bool
+ bool
+
+config SCHED_CORE
+ bool
+ default y
+ depends on SCHED_SMT
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b883c70674ba..2f559d706b8e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -60,6 +60,70 @@ __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;
+#ifdef CONFIG_SCHED_CORE
+
+DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
+
+/*
+ * The static-key + stop-machine variable are needed such that:
+ *
+ * spin_lock(rq_lockp(rq));
+ * ...
+ * spin_unlock(rq_lockp(rq));
+ *
+ * ends up locking and unlocking the _same_ lock, and all CPUs
+ * always agree on what rq has what lock.
+ *
+ * XXX entirely possible to selectively enable cores, don't bother for now.
+ */
+static int __sched_core_stopper(void *data)
+{
+ bool enabled = !!(unsigned long)data;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ cpu_rq(cpu)->core_enabled = enabled;
+
+ return 0;
+}
+
+static DEFINE_MUTEX(sched_core_mutex);
+static int sched_core_count;
+
+static void __sched_core_enable(void)
+{
+ // XXX verify there are no cookie tasks (yet)
+
+ static_branch_enable(&__sched_core_enabled);
+ stop_machine(__sched_core_stopper, (void *)true, NULL);
+}
+
+static void __sched_core_disable(void)
+{
+ // XXX verify there are no cookie tasks (left)
+
+ stop_machine(__sched_core_stopper, (void *)false, NULL);
+ static_branch_disable(&__sched_core_enabled);
+}
+
+void sched_core_get(void)
+{
+ mutex_lock(&sched_core_mutex);
+ if (!sched_core_count++)
+ __sched_core_enable();
+ mutex_unlock(&sched_core_mutex);
+}
+
+void sched_core_put(void)
+{
+ mutex_lock(&sched_core_mutex);
+ if (!--sched_core_count)
+ __sched_core_disable();
+ mutex_unlock(&sched_core_mutex);
+}
+
+#endif /* CONFIG_SCHED_CORE */
+
/*
* __task_rq_lock - lock the rq @p resides on.
*/
@@ -5865,6 +5929,28 @@ static void sched_rq_cpu_starting(unsigned int cpu)
int sched_cpu_starting(unsigned int cpu)
{
+#ifdef CONFIG_SCHED_CORE
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ struct rq *rq, *core_rq = NULL;
+ int i;
+
+ for_each_cpu(i, smt_mask) {
+ rq = cpu_rq(i);
+ if (rq->core && rq->core == rq)
+ core_rq = rq;
+ }
+
+ if (!core_rq)
+ core_rq = cpu_rq(cpu);
+
+ for_each_cpu(i, smt_mask) {
+ rq = cpu_rq(i);
+
+ WARN_ON_ONCE(rq->core && rq->core != core_rq);
+ rq->core = core_rq;
+ }
+#endif /* CONFIG_SCHED_CORE */
+
sched_rq_cpu_starting(cpu);
sched_tick_start(cpu);
return 0;
@@ -6091,6 +6177,11 @@ void __init sched_init(void)
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
+
+#ifdef CONFIG_SCHED_CORE
+ rq->core = NULL;
+ rq->core_enabled = 0;
+#endif
}
set_load_weight(&init_task, false);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a024dd80eeb3..eb38063221d0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -952,6 +952,12 @@ struct rq {
/* Must be inspected within a rcu lock section */
struct cpuidle_state *idle_state;
#endif
+
+#ifdef CONFIG_SCHED_CORE
+ /* per rq */
+ struct rq *core;
+ unsigned int core_enabled;
+#endif
};
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -979,11 +985,36 @@ static inline int cpu_of(struct rq *rq)
#endif
}
+#ifdef CONFIG_SCHED_CORE
+DECLARE_STATIC_KEY_FALSE(__sched_core_enabled);
+
+static inline bool sched_core_enabled(struct rq *rq)
+{
+ return static_branch_unlikely(&__sched_core_enabled) && rq->core_enabled;
+}
+
+static inline raw_spinlock_t *rq_lockp(struct rq *rq)
+{
+ if (sched_core_enabled(rq))
+ return &rq->core->__lock;
+
+ return &rq->__lock;
+}
+
+#else /* !CONFIG_SCHED_CORE */
+
+static inline bool sched_core_enabled(struct rq *rq)
+{
+ return false;
+}
+
static inline raw_spinlock_t *rq_lockp(struct rq *rq)
{
return &rq->__lock;
}
+#endif /* CONFIG_SCHED_CORE */
+
#ifdef CONFIG_SCHED_SMT
extern void __update_idle_core(struct rq *rq);
--
2.17.1
From: Peter Zijlstra (Intel) <[email protected]>
Introduce task_struct::core_cookie as an opaque identifier for core
scheduling. When enabled, core scheduling will only allow matching
tasks to be on the core, where idle matches everything.
When task_struct::core_cookie is set (and core scheduling is enabled)
these tasks are indexed in a second RB-tree, first on cookie value and
then on scheduling function, such that matching task selection always
finds the most eligible match.
NOTE: *shudder* at the overhead...
NOTE: *sigh*, a 3rd copy of the scheduling function; the alternative
is per class tracking of cookies and that just duplicates a lot of
stuff for no raisin (the 2nd copy lives in the rt-mutex PI code).
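Concretely, within one rq the second tree is ordered by (cookie,
inverted priority), so a sketch of rq->core_tree from left to right
looks like:

	[cookie A, highest prio] ... [cookie A, lowest prio] [cookie B, highest prio] ...

sched_core_find(rq, A) therefore returns the leftmost node with cookie
A, i.e. the most eligible runnable task carrying that cookie, and the
idle task matches any cookie.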
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Vineeth Remanan Pillai <[email protected]>
Signed-off-by: Julien Desfossez <[email protected]>
---
Changes in v2
-------------
- Improves the priority comparison logic between processes on
  different CPUs.
- Peter Zijlstra
- Aaron Lu
---
include/linux/sched.h | 8 ++-
kernel/sched/core.c | 164 ++++++++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 4 ++
3 files changed, 175 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1549584a1538..a4b39a28236f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -636,10 +636,16 @@ struct task_struct {
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
+ struct sched_dl_entity dl;
+
+#ifdef CONFIG_SCHED_CORE
+ struct rb_node core_node;
+ unsigned long core_cookie;
+#endif
+
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
- struct sched_dl_entity dl;
#ifdef CONFIG_PREEMPT_NOTIFIERS
/* List of struct preempt_notifier: */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2f559d706b8e..5066a1493acf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -64,6 +64,159 @@ int sysctl_sched_rt_runtime = 950000;
DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
+/* kernel prio, less is more */
+static inline int __task_prio(struct task_struct *p)
+{
+ if (p->sched_class == &stop_sched_class) /* trumps deadline */
+ return -2;
+
+ if (rt_prio(p->prio)) /* includes deadline */
+ return p->prio; /* [-1, 99] */
+
+ if (p->sched_class == &idle_sched_class)
+ return MAX_RT_PRIO + NICE_WIDTH; /* 140 */
+
+ return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */
+}
+
+// FIXME: This is copied from fair.c. Needs only single copy.
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+ return p->se.cfs_rq;
+}
+#else
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+ return &task_rq(p)->cfs;
+}
+#endif
+
+/*
+ * l(a,b)
+ * le(a,b) := !l(b,a)
+ * g(a,b) := l(b,a)
+ * ge(a,b) := !l(a,b)
+ */
+
+/* real prio, less is less */
+static inline bool __prio_less(struct task_struct *a, struct task_struct *b, bool core_cmp)
+{
+ u64 vruntime;
+
+ int pa = __task_prio(a), pb = __task_prio(b);
+
+ if (-pa < -pb)
+ return true;
+
+ if (-pb < -pa)
+ return false;
+
+ if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
+ return !dl_time_before(a->dl.deadline, b->dl.deadline);
+
+ vruntime = b->se.vruntime;
+ if (core_cmp) {
+ vruntime -= task_cfs_rq(b)->min_vruntime;
+ vruntime += task_cfs_rq(a)->min_vruntime;
+ }
+ if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
+ return !((s64)(a->se.vruntime - vruntime) <= 0);
+
+ return false;
+}
+
+static inline bool cpu_prio_less(struct task_struct *a, struct task_struct *b)
+{
+ return __prio_less(a, b, false);
+}
+
+static inline bool core_prio_less(struct task_struct *a, struct task_struct *b)
+{
+ return __prio_less(a, b, true);
+}
+
+static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)
+{
+ if (a->core_cookie < b->core_cookie)
+ return true;
+
+ if (a->core_cookie > b->core_cookie)
+ return false;
+
+ /* flip prio, so high prio is leftmost */
+ if (cpu_prio_less(b, a))
+ return true;
+
+ return false;
+}
+
+void sched_core_enqueue(struct rq *rq, struct task_struct *p)
+{
+ struct rb_node *parent, **node;
+ struct task_struct *node_task;
+
+ rq->core->core_task_seq++;
+
+ if (!p->core_cookie)
+ return;
+
+ node = &rq->core_tree.rb_node;
+ parent = *node;
+
+ while (*node) {
+ node_task = container_of(*node, struct task_struct, core_node);
+ parent = *node;
+
+ if (__sched_core_less(p, node_task))
+ node = &parent->rb_left;
+ else
+ node = &parent->rb_right;
+ }
+
+ rb_link_node(&p->core_node, parent, node);
+ rb_insert_color(&p->core_node, &rq->core_tree);
+}
+
+void sched_core_dequeue(struct rq *rq, struct task_struct *p)
+{
+ rq->core->core_task_seq++;
+
+ if (!p->core_cookie)
+ return;
+
+ rb_erase(&p->core_node, &rq->core_tree);
+}
+
+/*
+ * Find left-most (aka, highest priority) task matching @cookie.
+ */
+struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
+{
+ struct rb_node *node = rq->core_tree.rb_node;
+ struct task_struct *node_task, *match;
+
+ /*
+ * The idle task always matches any cookie!
+ */
+ match = idle_sched_class.pick_task(rq);
+
+ while (node) {
+ node_task = container_of(node, struct task_struct, core_node);
+
+ if (node_task->core_cookie < cookie) {
+ node = node->rb_left;
+ } else if (node_task->core_cookie > cookie) {
+ node = node->rb_right;
+ } else {
+ match = node_task;
+ node = node->rb_left;
+ }
+ }
+
+ return match;
+}
+
/*
* The static-key + stop-machine variable are needed such that:
*
@@ -122,6 +275,11 @@ void sched_core_put(void)
mutex_unlock(&sched_core_mutex);
}
+#else /* !CONFIG_SCHED_CORE */
+
+static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
+static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
+
#endif /* CONFIG_SCHED_CORE */
/*
@@ -826,6 +984,9 @@ static void set_load_weight(struct task_struct *p, bool update_load)
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
+ if (sched_core_enabled(rq))
+ sched_core_enqueue(rq, p);
+
if (!(flags & ENQUEUE_NOCLOCK))
update_rq_clock(rq);
@@ -839,6 +1000,9 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
+ if (sched_core_enabled(rq))
+ sched_core_dequeue(rq, p);
+
if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eb38063221d0..42dd620797d7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -957,6 +957,10 @@ struct rq {
/* per rq */
struct rq *core;
unsigned int core_enabled;
+ struct rb_root core_tree;
+
+ /* shared state */
+ unsigned int core_task_seq;
#endif
};
--
2.17.1
From: "Peter Zijlstra (Intel)" <[email protected]>
Because sched_class::pick_next_task() also implies
sched_class::set_next_task() (and possibly put_prev_task() and
newidle_balance) it is not state invariant. This makes it unsuitable
for remote task selection.
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Vineeth Remanan Pillai <[email protected]>
Signed-off-by: Julien Desfossez <[email protected]>
---
Changes in v2
-------------
- Fixes a NULL pointer dereference crash
- Subhra Mazumdar
- Tim Chen
---
kernel/sched/deadline.c | 21 ++++++++++++++++-----
kernel/sched/fair.c | 39 ++++++++++++++++++++++++++++++++++++---
kernel/sched/idle.c | 10 +++++++++-
kernel/sched/rt.c | 21 ++++++++++++++++-----
kernel/sched/sched.h | 2 ++
kernel/sched/stop_task.c | 21 ++++++++++++++++-----
6 files changed, 95 insertions(+), 19 deletions(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 249310e68592..010234908cc0 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1723,15 +1723,12 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
return rb_entry(left, struct sched_dl_entity, rb_node);
}
-static struct task_struct *
-pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+static struct task_struct *pick_task_dl(struct rq *rq)
{
struct sched_dl_entity *dl_se;
struct task_struct *p;
struct dl_rq *dl_rq;
- WARN_ON_ONCE(prev || rf);
-
dl_rq = &rq->dl;
if (unlikely(!dl_rq->dl_nr_running))
@@ -1742,7 +1739,19 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
p = dl_task_of(dl_se);
- set_next_task_dl(rq, p);
+ return p;
+}
+
+static struct task_struct *
+pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ struct task_struct *p;
+
+ WARN_ON_ONCE(prev || rf);
+
+ p = pick_task_dl(rq);
+ if (p)
+ set_next_task_dl(rq, p);
return p;
}
@@ -2389,6 +2398,8 @@ const struct sched_class dl_sched_class = {
.set_next_task = set_next_task_dl,
#ifdef CONFIG_SMP
+ .pick_task = pick_task_dl,
+
.select_task_rq = select_task_rq_dl,
.migrate_task_rq = migrate_task_rq_dl,
.set_cpus_allowed = set_cpus_allowed_dl,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c055bad249a9..45d86b862750 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4132,7 +4132,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
* Avoid running the skip buddy, if running something else can
* be done without getting too unfair.
*/
- if (cfs_rq->skip == se) {
+ if (cfs_rq->skip && cfs_rq->skip == se) {
struct sched_entity *second;
if (se == curr) {
@@ -4150,13 +4150,13 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
/*
* Prefer last buddy, try to return the CPU to a preempted task.
*/
- if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
+ if (left && cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
se = cfs_rq->last;
/*
* Someone really wants this to run. If it's not unfair, run it.
*/
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+ if (left && cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
se = cfs_rq->next;
clear_buddies(cfs_rq, se);
@@ -6937,6 +6937,37 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
set_last_buddy(se);
}
+static struct task_struct *
+pick_task_fair(struct rq *rq)
+{
+ struct cfs_rq *cfs_rq = &rq->cfs;
+ struct sched_entity *se;
+
+ if (!cfs_rq->nr_running)
+ return NULL;
+
+ do {
+ struct sched_entity *curr = cfs_rq->curr;
+
+ se = pick_next_entity(cfs_rq, NULL);
+
+ if (!(se || curr))
+ return NULL;
+
+ if (curr) {
+ if (se && curr->on_rq)
+ update_curr(cfs_rq);
+
+ if (!se || entity_before(curr, se))
+ se = curr;
+ }
+
+ cfs_rq = group_cfs_rq(se);
+ } while (cfs_rq);
+
+ return task_of(se);
+}
+
static struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
@@ -10648,6 +10679,8 @@ const struct sched_class fair_sched_class = {
.set_next_task = set_next_task_fair,
#ifdef CONFIG_SMP
+ .pick_task = pick_task_fair,
+
.select_task_rq = select_task_rq_fair,
.migrate_task_rq = migrate_task_rq_fair,
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 7ece8e820b5d..e7f38da60373 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -373,6 +373,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
resched_curr(rq);
}
+static struct task_struct *
+pick_task_idle(struct rq *rq)
+{
+ return rq->idle;
+}
+
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
}
@@ -386,11 +392,12 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next)
static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
- struct task_struct *next = rq->idle;
+ struct task_struct *next;
if (prev)
put_prev_task(rq, prev);
+ next = pick_task_idle(rq);
set_next_task_idle(rq, next);
return next;
@@ -458,6 +465,7 @@ const struct sched_class idle_sched_class = {
.set_next_task = set_next_task_idle,
#ifdef CONFIG_SMP
+ .pick_task = pick_task_idle,
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 79f2e60516ef..81557224548c 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1548,20 +1548,29 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
return rt_task_of(rt_se);
}
-static struct task_struct *
-pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+static struct task_struct *pick_task_rt(struct rq *rq)
{
struct task_struct *p;
struct rt_rq *rt_rq = &rq->rt;
- WARN_ON_ONCE(prev || rf);
-
if (!rt_rq->rt_queued)
return NULL;
p = _pick_next_task_rt(rq);
- set_next_task_rt(rq, p);
+ return p;
+}
+
+static struct task_struct *
+pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ struct task_struct *p;
+
+ WARN_ON_ONCE(prev || rf);
+
+ p = pick_task_rt(rq);
+ if (p)
+ set_next_task_rt(rq, p);
return p;
}
@@ -2364,6 +2373,8 @@ const struct sched_class rt_sched_class = {
.set_next_task = set_next_task_rt,
#ifdef CONFIG_SMP
+ .pick_task = pick_task_rt,
+
.select_task_rq = select_task_rq_rt,
.set_cpus_allowed = set_cpus_allowed_common,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 460dd04e76af..a024dd80eeb3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1682,6 +1682,8 @@ struct sched_class {
void (*set_next_task)(struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
+ struct task_struct * (*pick_task)(struct rq *rq);
+
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 7e1cee4e65b2..fb6c436cba6c 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -29,20 +29,30 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop)
}
static struct task_struct *
-pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+pick_task_stop(struct rq *rq)
{
struct task_struct *stop = rq->stop;
- WARN_ON_ONCE(prev || rf);
-
if (!stop || !task_on_rq_queued(stop))
return NULL;
- set_next_task_stop(rq, stop);
-
return stop;
}
+static struct task_struct *
+pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ struct task_struct *p;
+
+ WARN_ON_ONCE(prev || rf);
+
+ p = pick_task_stop(rq);
+ if (p)
+ set_next_task_stop(rq, p);
+
+ return p;
+}
+
static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
@@ -129,6 +139,7 @@ const struct sched_class stop_sched_class = {
.set_next_task = set_next_task_stop,
#ifdef CONFIG_SMP
+ .pick_task = pick_task_stop,
.select_task_rq = select_task_rq_stop,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
--
2.17.1
From: Peter Zijlstra (Intel) <[email protected]>
Avoid the RETRY_TASK case in the pick_next_task() slow path.
By doing the put_prev_task() early, we get the rt/deadline pull done,
and by testing rq->nr_running we know if we need newidle_balance().
This then gives a stable state to pick a task from.
Since the fast path is fair-only, the other classes will always get
pick_next_task(.prev=NULL, .rf=NULL) and we can simplify.
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/core.c | 19 ++++++++++++-------
kernel/sched/deadline.c | 30 ++----------------------------
kernel/sched/fair.c | 9 ++++++---
kernel/sched/idle.c | 4 +++-
kernel/sched/rt.c | 29 +----------------------------
kernel/sched/sched.h | 13 ++++++++-----
kernel/sched/stop_task.c | 3 ++-
7 files changed, 34 insertions(+), 73 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9dfa0c53deb3..b883c70674ba 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3363,7 +3363,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
p = fair_sched_class.pick_next_task(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
- goto again;
+ goto restart;
/* Assumes fair_sched_class->next == idle_sched_class */
if (unlikely(!p))
@@ -3372,14 +3372,19 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
return p;
}
-again:
+restart:
+ /*
+ * Ensure that we put DL/RT tasks before the pick loop, such that they
+ * can PULL higher prio tasks when we lower the RQ 'priority'.
+ */
+ prev->sched_class->put_prev_task(rq, prev, rf);
+ if (!rq->nr_running)
+ newidle_balance(rq, rf);
+
for_each_class(class) {
- p = class->pick_next_task(rq, prev, rf);
- if (p) {
- if (unlikely(p == RETRY_TASK))
- goto again;
+ p = class->pick_next_task(rq, NULL, NULL);
+ if (p)
return p;
- }
}
/* The idle class should always have a runnable task: */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 56791c0318a2..249310e68592 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1730,39 +1730,13 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *p;
struct dl_rq *dl_rq;
- dl_rq = &rq->dl;
-
- if (need_pull_dl_task(rq, prev)) {
- /*
- * This is OK, because current is on_cpu, which avoids it being
- * picked for load-balance and preemption/IRQs are still
- * disabled avoiding further scheduler activity on it and we're
- * being very careful to re-start the picking loop.
- */
- rq_unpin_lock(rq, rf);
- pull_dl_task(rq);
- rq_repin_lock(rq, rf);
- /*
- * pull_dl_task() can drop (and re-acquire) rq->lock; this
- * means a stop task can slip in, in which case we need to
- * re-start task selection.
- */
- if (rq->stop && task_on_rq_queued(rq->stop))
- return RETRY_TASK;
- }
+ WARN_ON_ONCE(prev || rf);
- /*
- * When prev is DL, we may throttle it in put_prev_task().
- * So, we update time before we check for dl_nr_running.
- */
- if (prev->sched_class == &dl_sched_class)
- update_curr_dl(rq);
+ dl_rq = &rq->dl;
if (unlikely(!dl_rq->dl_nr_running))
return NULL;
- put_prev_task(rq, prev);
-
dl_se = pick_next_dl_entity(rq, dl_rq);
BUG_ON(!dl_se);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 41ec5e68e1c5..c055bad249a9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6950,7 +6950,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
goto idle;
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (prev->sched_class != &fair_sched_class)
+ if (!prev || prev->sched_class != &fair_sched_class)
goto simple;
/*
@@ -7027,8 +7027,8 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
goto done;
simple:
#endif
-
- put_prev_task(rq, prev);
+ if (prev)
+ put_prev_task(rq, prev);
do {
se = pick_next_entity(cfs_rq, NULL);
@@ -7056,6 +7056,9 @@ done: __maybe_unused;
return p;
idle:
+ if (!rf)
+ return NULL;
+
new_tasks = newidle_balance(rq, rf);
/*
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 1b65a4c3683e..7ece8e820b5d 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -388,7 +388,9 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
{
struct task_struct *next = rq->idle;
- put_prev_task(rq, prev);
+ if (prev)
+ put_prev_task(rq, prev);
+
set_next_task_idle(rq, next);
return next;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 51ee87c5a28a..79f2e60516ef 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1554,38 +1554,11 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *p;
struct rt_rq *rt_rq = &rq->rt;
- if (need_pull_rt_task(rq, prev)) {
- /*
- * This is OK, because current is on_cpu, which avoids it being
- * picked for load-balance and preemption/IRQs are still
- * disabled avoiding further scheduler activity on it and we're
- * being very careful to re-start the picking loop.
- */
- rq_unpin_lock(rq, rf);
- pull_rt_task(rq);
- rq_repin_lock(rq, rf);
- /*
- * pull_rt_task() can drop (and re-acquire) rq->lock; this
- * means a dl or stop task can slip in, in which case we need
- * to re-start task selection.
- */
- if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
- rq->dl.dl_nr_running))
- return RETRY_TASK;
- }
-
- /*
- * We may dequeue prev's rt_rq in put_prev_task().
- * So, we update time before rt_queued check.
- */
- if (prev->sched_class == &rt_sched_class)
- update_curr_rt(rq);
+ WARN_ON_ONCE(prev || rf);
if (!rt_rq->rt_queued)
return NULL;
- put_prev_task(rq, prev);
-
p = _pick_next_task_rt(rq);
set_next_task_rt(rq, p);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4cbe2bef92e4..460dd04e76af 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1665,12 +1665,15 @@ struct sched_class {
void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
/*
- * It is the responsibility of the pick_next_task() method that will
- * return the next task to call put_prev_task() on the @prev task or
- * something equivalent.
+ * Both @prev and @rf are optional and may be NULL, in which case the
+ * caller must already have invoked put_prev_task(rq, prev, rf).
*
- * May return RETRY_TASK when it finds a higher prio class has runnable
- * tasks.
+ * Otherwise it is the responsibility of the pick_next_task() to call
+ * put_prev_task() on the @prev task or something equivalent, IFF it
+ * returns a next task.
+ *
+ * In that case (@rf != NULL) it may return RETRY_TASK when it finds a
+ * higher prio class has runnable tasks.
*/
struct task_struct * (*pick_next_task)(struct rq *rq,
struct task_struct *prev,
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 8f414018d5e0..7e1cee4e65b2 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -33,10 +33,11 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
{
struct task_struct *stop = rq->stop;
+ WARN_ON_ONCE(prev || rf);
+
if (!stop || !task_on_rq_queued(stop))
return NULL;
- put_prev_task(rq, prev);
set_next_task_stop(rq, stop);
return stop;
--
2.17.1
From: Peter Zijlstra (Intel) <[email protected]>
In preparation for further separating pick_next_task() and
set_curr_task(), we have to pass the actual task into it; while there,
rename the thing to better pair with put_prev_task().
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/core.c | 12 ++++++------
kernel/sched/deadline.c | 7 +------
kernel/sched/fair.c | 17 ++++++++++++++---
kernel/sched/idle.c | 27 +++++++++++++++------------
kernel/sched/rt.c | 7 +------
kernel/sched/sched.h | 8 +++++---
kernel/sched/stop_task.c | 17 +++++++----------
7 files changed, 49 insertions(+), 46 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6f4861ae85dc..32ea79fb8d29 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1081,7 +1081,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
}
/*
@@ -3890,7 +3890,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
if (queued)
enqueue_task(rq, p, queue_flag);
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
@@ -3957,7 +3957,7 @@ void set_user_nice(struct task_struct *p, long nice)
resched_curr(rq);
}
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
out_unlock:
task_rq_unlock(rq, p, &rf);
}
@@ -4382,7 +4382,7 @@ static int __sched_setscheduler(struct task_struct *p,
enqueue_task(rq, p, queue_flags);
}
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
@@ -5555,7 +5555,7 @@ void sched_setnuma(struct task_struct *p, int nid)
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
task_rq_unlock(rq, p, &rf);
}
#endif /* CONFIG_NUMA_BALANCING */
@@ -6438,7 +6438,7 @@ void sched_move_task(struct task_struct *tsk)
if (queued)
enqueue_task(rq, tsk, queue_flags);
if (running)
- set_curr_task(rq, tsk);
+ set_next_task(rq, tsk);
task_rq_unlock(rq, tsk, &rf);
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b8e15c7aa889..fadfbfe7d573 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1813,11 +1813,6 @@ static void task_fork_dl(struct task_struct *p)
*/
}
-static void set_curr_task_dl(struct rq *rq)
-{
- set_next_task_dl(rq, rq->curr);
-}
-
#ifdef CONFIG_SMP
/* Only try algorithms three times */
@@ -2405,6 +2400,7 @@ const struct sched_class dl_sched_class = {
.pick_next_task = pick_next_task_dl,
.put_prev_task = put_prev_task_dl,
+ .set_next_task = set_next_task_dl,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_dl,
@@ -2415,7 +2411,6 @@ const struct sched_class dl_sched_class = {
.task_woken = task_woken_dl,
#endif
- .set_curr_task = set_curr_task_dl,
.task_tick = task_tick_dl,
.task_fork = task_fork_dl,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1ccab35ccf21..ebad19a033eb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10359,9 +10359,19 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
* This routine is mostly called to set cfs_rq->curr field when a task
* migrates between groups/classes.
*/
-static void set_curr_task_fair(struct rq *rq)
+static void set_next_task_fair(struct rq *rq, struct task_struct *p)
{
- struct sched_entity *se = &rq->curr->se;
+ struct sched_entity *se = &p->se;
+
+#ifdef CONFIG_SMP
+ if (task_on_rq_queued(p)) {
+ /*
+ * Move the next running task to the front of the list, so our
+ * cfs_tasks list becomes MRU one.
+ */
+ list_move(&se->group_node, &rq->cfs_tasks);
+ }
+#endif
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -10632,7 +10642,9 @@ const struct sched_class fair_sched_class = {
.check_preempt_curr = check_preempt_wakeup,
.pick_next_task = pick_next_task_fair,
+
.put_prev_task = put_prev_task_fair,
+ .set_next_task = set_next_task_fair,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_fair,
@@ -10645,7 +10657,6 @@ const struct sched_class fair_sched_class = {
.set_cpus_allowed = set_cpus_allowed_common,
#endif
- .set_curr_task = set_curr_task_fair,
.task_tick = task_tick_fair,
.task_fork = task_fork_fair,
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 39788d3a40ec..dd64be34881d 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -373,14 +373,25 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
resched_curr(rq);
}
+static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
+{
+}
+
+static void set_next_task_idle(struct rq *rq, struct task_struct *next)
+{
+ update_idle_core(rq);
+ schedstat_inc(rq->sched_goidle);
+}
+
static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
+ struct task_struct *next = rq->idle;
+
put_prev_task(rq, prev);
- update_idle_core(rq);
- schedstat_inc(rq->sched_goidle);
+ set_next_task_idle(rq, next);
- return rq->idle;
+ return next;
}
/*
@@ -396,10 +407,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
raw_spin_lock_irq(rq_lockp(rq));
}
-static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
-{
-}
-
/*
* scheduler tick hitting a task of our scheduling class.
*
@@ -412,10 +419,6 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
{
}
-static void set_curr_task_idle(struct rq *rq)
-{
-}
-
static void switched_to_idle(struct rq *rq, struct task_struct *p)
{
BUG();
@@ -450,13 +453,13 @@ const struct sched_class idle_sched_class = {
.pick_next_task = pick_next_task_idle,
.put_prev_task = put_prev_task_idle,
+ .set_next_task = set_next_task_idle,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
- .set_curr_task = set_curr_task_idle,
.task_tick = task_tick_idle,
.get_rr_interval = get_rr_interval_idle,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 353ad960691b..adec98a94f2b 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2355,11 +2355,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
}
}
-static void set_curr_task_rt(struct rq *rq)
-{
- set_next_task_rt(rq, rq->curr);
-}
-
static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
{
/*
@@ -2381,6 +2376,7 @@ const struct sched_class rt_sched_class = {
.pick_next_task = pick_next_task_rt,
.put_prev_task = put_prev_task_rt,
+ .set_next_task = set_next_task_rt,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_rt,
@@ -2392,7 +2388,6 @@ const struct sched_class rt_sched_class = {
.switched_from = switched_from_rt,
#endif
- .set_curr_task = set_curr_task_rt,
.task_tick = task_tick_rt,
.get_rr_interval = get_rr_interval_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c4cd252dba29..fb01c77c16ff 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1672,6 +1672,7 @@ struct sched_class {
struct task_struct *prev,
struct rq_flags *rf);
void (*put_prev_task)(struct rq *rq, struct task_struct *p);
+ void (*set_next_task)(struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
@@ -1686,7 +1687,6 @@ struct sched_class {
void (*rq_offline)(struct rq *rq);
#endif
- void (*set_curr_task)(struct rq *rq);
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
void (*task_fork)(struct task_struct *p);
void (*task_dead)(struct task_struct *p);
@@ -1716,12 +1716,14 @@ struct sched_class {
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
+ WARN_ON_ONCE(rq->curr != prev);
prev->sched_class->put_prev_task(rq, prev);
}
-static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
+static inline void set_next_task(struct rq *rq, struct task_struct *next)
{
- curr->sched_class->set_curr_task(rq);
+ WARN_ON_ONCE(rq->curr != next);
+ next->sched_class->set_next_task(rq, next);
}
#ifdef CONFIG_SMP
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index c183b790ca54..47a3d2a18a9a 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,6 +23,11 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
/* we're never preempted */
}
+static void set_next_task_stop(struct rq *rq, struct task_struct *stop)
+{
+ stop->se.exec_start = rq_clock_task(rq);
+}
+
static struct task_struct *
pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
@@ -32,8 +37,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
return NULL;
put_prev_task(rq, prev);
-
- stop->se.exec_start = rq_clock_task(rq);
+ set_next_task_stop(rq, stop);
return stop;
}
@@ -86,13 +90,6 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
{
}
-static void set_curr_task_stop(struct rq *rq)
-{
- struct task_struct *stop = rq->stop;
-
- stop->se.exec_start = rq_clock_task(rq);
-}
-
static void switched_to_stop(struct rq *rq, struct task_struct *p)
{
BUG(); /* its impossible to change to this class */
@@ -128,13 +125,13 @@ const struct sched_class stop_sched_class = {
.pick_next_task = pick_next_task_stop,
.put_prev_task = put_prev_task_stop,
+ .set_next_task = set_next_task_stop,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_stop,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
- .set_curr_task = set_curr_task_stop,
.task_tick = task_tick_stop,
.get_rr_interval = get_rr_interval_stop,
--
2.17.1
From: Peter Zijlstra (Intel) <[email protected]>
Because pick_next_task() implies set_curr_task() and some of the
details haven't mattered too much, some of what _should_ be in
set_curr_task() ended up in pick_next_task(); correct this.
This prepares the way for a pick_next_task() variant that does not
affect the current state; allowing remote picking.
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/deadline.c | 23 ++++++++++++-----------
kernel/sched/rt.c | 27 ++++++++++++++-------------
2 files changed, 26 insertions(+), 24 deletions(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 133fbcc58ea1..b8e15c7aa889 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1695,12 +1695,21 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
}
#endif
-static inline void set_next_task(struct rq *rq, struct task_struct *p)
+static void set_next_task_dl(struct rq *rq, struct task_struct *p)
{
p->se.exec_start = rq_clock_task(rq);
/* You can't push away the running task */
dequeue_pushable_dl_task(rq, p);
+
+ if (hrtick_enabled(rq))
+ start_hrtick_dl(rq, p);
+
+ if (rq->curr->sched_class != &dl_sched_class)
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+
+ if (rq->curr != p)
+ deadline_queue_push_tasks(rq);
}
static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
@@ -1759,15 +1768,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
p = dl_task_of(dl_se);
- set_next_task(rq, p);
-
- if (hrtick_enabled(rq))
- start_hrtick_dl(rq, p);
-
- deadline_queue_push_tasks(rq);
-
- if (rq->curr->sched_class != &dl_sched_class)
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+ set_next_task_dl(rq, p);
return p;
}
@@ -1814,7 +1815,7 @@ static void task_fork_dl(struct task_struct *p)
static void set_curr_task_dl(struct rq *rq)
{
- set_next_task(rq, rq->curr);
+ set_next_task_dl(rq, rq->curr);
}
#ifdef CONFIG_SMP
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 3d9db8c75d53..353ad960691b 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1498,12 +1498,23 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
#endif
}
-static inline void set_next_task(struct rq *rq, struct task_struct *p)
+static inline void set_next_task_rt(struct rq *rq, struct task_struct *p)
{
p->se.exec_start = rq_clock_task(rq);
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
+
+ /*
+ * If prev task was rt, put_prev_task() has already updated the
+ * utilization. We only care of the case where we start to schedule a
+ * rt task
+ */
+ if (rq->curr->sched_class != &rt_sched_class)
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+
+ if (rq->curr != p)
+ rt_queue_push_tasks(rq);
}
static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
@@ -1577,17 +1588,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
p = _pick_next_task_rt(rq);
- set_next_task(rq, p);
-
- rt_queue_push_tasks(rq);
-
- /*
- * If prev task was rt, put_prev_task() has already updated the
- * utilization. We only care of the case where we start to schedule a
- * rt task
- */
- if (rq->curr->sched_class != &rt_sched_class)
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+ set_next_task_rt(rq, p);
return p;
}
@@ -2356,7 +2357,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
static void set_curr_task_rt(struct rq *rq)
{
- set_next_task(rq, rq->curr);
+ set_next_task_rt(rq, rq->curr);
}
static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
--
2.17.1
From: Peter Zijlstra (Intel) <[email protected]>
In preparation for playing games with rq->lock, abstract the thing
using an accessor.
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Vineeth Remanan Pillai <[email protected]>
Signed-off-by: Julien Desfossez <[email protected]>
---
Changes in v2
-------------
- Fixes a deadlock in double_rq_lock and double_lock_lock
- Vineeth Pillai
- Julien Desfossez
- Fixes 32bit build.
- Aubrey Li
---
kernel/sched/core.c | 46 ++++++++---------
kernel/sched/cpuacct.c | 12 ++---
kernel/sched/deadline.c | 18 +++----
kernel/sched/debug.c | 4 +-
kernel/sched/fair.c | 40 +++++++--------
kernel/sched/idle.c | 4 +-
kernel/sched/pelt.h | 2 +-
kernel/sched/rt.c | 8 +--
kernel/sched/sched.h | 106 ++++++++++++++++++++--------------------
kernel/sched/topology.c | 4 +-
10 files changed, 123 insertions(+), 121 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 416ea613eda8..6f4861ae85dc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -72,12 +72,12 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
for (;;) {
rq = task_rq(p);
- raw_spin_lock(&rq->lock);
+ raw_spin_lock(rq_lockp(rq));
if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
rq_pin_lock(rq, rf);
return rq;
}
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(rq_lockp(rq));
while (unlikely(task_on_rq_migrating(p)))
cpu_relax();
@@ -96,7 +96,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
for (;;) {
raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
rq = task_rq(p);
- raw_spin_lock(&rq->lock);
+ raw_spin_lock(rq_lockp(rq));
/*
* move_queued_task() task_rq_lock()
*
@@ -118,7 +118,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
rq_pin_lock(rq, rf);
return rq;
}
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(rq_lockp(rq));
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
while (unlikely(task_on_rq_migrating(p)))
@@ -188,7 +188,7 @@ void update_rq_clock(struct rq *rq)
{
s64 delta;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));
if (rq->clock_update_flags & RQCF_ACT_SKIP)
return;
@@ -497,7 +497,7 @@ void resched_curr(struct rq *rq)
struct task_struct *curr = rq->curr;
int cpu;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));
if (test_tsk_need_resched(curr))
return;
@@ -521,10 +521,10 @@ void resched_cpu(int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock_irqsave(rq_lockp(rq), flags);
if (cpu_online(cpu) || cpu == smp_processor_id())
resched_curr(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock_irqrestore(rq_lockp(rq), flags);
}
#ifdef CONFIG_SMP
@@ -956,7 +956,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
struct task_struct *p, int new_cpu)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));
WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
dequeue_task(rq, p, DEQUEUE_NOCLOCK);
@@ -1070,7 +1070,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
* Because __kthread_bind() calls this on blocked tasks without
* holding rq->lock.
*/
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));
dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
}
if (running)
@@ -1203,7 +1203,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* task_rq_lock().
*/
WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
- lockdep_is_held(&task_rq(p)->lock)));
+ lockdep_is_held(rq_lockp(task_rq(p)))));
#endif
/*
* Clearly, migrating tasks to offline CPUs is a fairly daft thing.
@@ -1732,7 +1732,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
{
int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));
#ifdef CONFIG_SMP
if (p->sched_contributes_to_load)
@@ -2123,7 +2123,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
WARN_ON_ONCE(p == current))
return;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));
if (!raw_spin_trylock(&p->pi_lock)) {
/*
@@ -2609,10 +2609,10 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf
* do an early lockdep release here:
*/
rq_unpin_lock(rq, rf);
- spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+ spin_release(&rq_lockp(rq)->dep_map, 1, _THIS_IP_);
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
- rq->lock.owner = next;
+ rq_lockp(rq)->owner = next;
#endif
}
@@ -2623,8 +2623,8 @@ static inline void finish_lock_switch(struct rq *rq)
* fix up the runqueue lock - which gets 'carried over' from
* prev into current:
*/
- spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
- raw_spin_unlock_irq(&rq->lock);
+ spin_acquire(&rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
+ raw_spin_unlock_irq(rq_lockp(rq));
}
/*
@@ -2698,7 +2698,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* schedule()
* preempt_disable(); // 1
* __schedule()
- * raw_spin_lock_irq(&rq->lock) // 2
+ * raw_spin_lock_irq(rq_lockp(rq)) // 2
*
* Also, see FORK_PREEMPT_COUNT.
*/
@@ -2774,7 +2774,7 @@ static void __balance_callback(struct rq *rq)
void (*func)(struct rq *rq);
unsigned long flags;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock_irqsave(rq_lockp(rq), flags);
head = rq->balance_callback;
rq->balance_callback = NULL;
while (head) {
@@ -2785,7 +2785,7 @@ static void __balance_callback(struct rq *rq)
func(rq);
}
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock_irqrestore(rq_lockp(rq), flags);
}
static inline void balance_callback(struct rq *rq)
@@ -5414,7 +5414,7 @@ void init_idle(struct task_struct *idle, int cpu)
unsigned long flags;
raw_spin_lock_irqsave(&idle->pi_lock, flags);
- raw_spin_lock(&rq->lock);
+ raw_spin_lock(rq_lockp(rq));
__sched_fork(0, idle);
idle->state = TASK_RUNNING;
@@ -5451,7 +5451,7 @@ void init_idle(struct task_struct *idle, int cpu)
#ifdef CONFIG_SMP
idle->on_cpu = 1;
#endif
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(rq_lockp(rq));
raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
@@ -6019,7 +6019,7 @@ void __init sched_init(void)
struct rq *rq;
rq = cpu_rq(i);
- raw_spin_lock_init(&rq->lock);
+ raw_spin_lock_init(&rq->__lock);
rq->nr_running = 0;
rq->calc_load_active = 0;
rq->calc_load_update = jiffies + LOAD_FREQ;
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 9fbb10383434..78de28ebc45d 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -111,7 +111,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
/*
* Take rq->lock to make 64-bit read safe on 32-bit platforms.
*/
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_lock_irq(rq_lockp(cpu_rq(cpu)));
#endif
if (index == CPUACCT_STAT_NSTATS) {
@@ -125,7 +125,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
}
#ifndef CONFIG_64BIT
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_unlock_irq(rq_lockp(cpu_rq(cpu)));
#endif
return data;
@@ -140,14 +140,14 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
/*
* Take rq->lock to make 64-bit write safe on 32-bit platforms.
*/
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_lock_irq(rq_lockp(cpu_rq(cpu)));
#endif
for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
cpuusage->usages[i] = val;
#ifndef CONFIG_64BIT
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_unlock_irq(rq_lockp(cpu_rq(cpu)));
#endif
}
@@ -252,13 +252,13 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
* Take rq->lock to make 64-bit read safe on 32-bit
* platforms.
*/
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_lock_irq(rq_lockp(cpu_rq(cpu)));
#endif
seq_printf(m, " %llu", cpuusage->usages[index]);
#ifndef CONFIG_64BIT
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+ raw_spin_unlock_irq(rq_lockp(cpu_rq(cpu)));
#endif
}
seq_puts(m, "\n");
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6a73e41a2016..133fbcc58ea1 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -80,7 +80,7 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->running_bw;
- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ lockdep_assert_held(rq_lockp((rq_of_dl_rq(dl_rq))));
dl_rq->running_bw += dl_bw;
SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
@@ -93,7 +93,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->running_bw;
- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ lockdep_assert_held(rq_lockp((rq_of_dl_rq(dl_rq))));
dl_rq->running_bw -= dl_bw;
SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
if (dl_rq->running_bw > old)
@@ -107,7 +107,7 @@ void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->this_bw;
- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ lockdep_assert_held(rq_lockp((rq_of_dl_rq(dl_rq))));
dl_rq->this_bw += dl_bw;
SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */
}
@@ -117,7 +117,7 @@ void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->this_bw;
- lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+ lockdep_assert_held(rq_lockp((rq_of_dl_rq(dl_rq))));
dl_rq->this_bw -= dl_bw;
SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */
if (dl_rq->this_bw > old)
@@ -893,7 +893,7 @@ static int start_dl_timer(struct task_struct *p)
ktime_t now, act;
s64 delta;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));
/*
* We want the timer to fire at the deadline, but considering
@@ -1003,9 +1003,9 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* If the runqueue is no longer available, migrate the
* task elsewhere. This necessarily changes rq.
*/
- lockdep_unpin_lock(&rq->lock, rf.cookie);
+ lockdep_unpin_lock(rq_lockp(rq), rf.cookie);
rq = dl_task_offline_migration(rq, p);
- rf.cookie = lockdep_pin_lock(&rq->lock);
+ rf.cookie = lockdep_pin_lock(rq_lockp(rq));
update_rq_clock(rq);
/*
@@ -1620,7 +1620,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
* from try_to_wake_up(). Hence, p->pi_lock is locked, but
* rq->lock is not... So, lock it
*/
- raw_spin_lock(&rq->lock);
+ raw_spin_lock(rq_lockp(rq));
if (p->dl.dl_non_contending) {
sub_running_bw(&p->dl, &rq->dl);
p->dl.dl_non_contending = 0;
@@ -1635,7 +1635,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
put_task_struct(p);
}
sub_rq_bw(&p->dl, &rq->dl);
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(rq_lockp(rq));
}
static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 8039d62ae36e..bfeed9658a83 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -515,7 +515,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
SPLIT_NS(cfs_rq->exec_clock));
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock_irqsave(rq_lockp(rq), flags);
if (rb_first_cached(&cfs_rq->tasks_timeline))
MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
last = __pick_last_entity(cfs_rq);
@@ -523,7 +523,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
max_vruntime = last->vruntime;
min_vruntime = cfs_rq->min_vruntime;
rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock_irqrestore(rq_lockp(rq), flags);
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
SPLIT_NS(MIN_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 40bd1e27b1b7..1ccab35ccf21 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4967,7 +4967,7 @@ static void __maybe_unused update_runtime_enabled(struct rq *rq)
{
struct task_group *tg;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));
rcu_read_lock();
list_for_each_entry_rcu(tg, &task_groups, list) {
@@ -4986,7 +4986,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
{
struct task_group *tg;
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));
rcu_read_lock();
list_for_each_entry_rcu(tg, &task_groups, list) {
@@ -6744,7 +6744,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
* In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
* rq->lock and can modify state directly.
*/
- lockdep_assert_held(&task_rq(p)->lock);
+ lockdep_assert_held(rq_lockp(task_rq(p)));
detach_entity_cfs_rq(&p->se);
} else {
@@ -7318,7 +7318,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
{
s64 delta;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_held(rq_lockp(env->src_rq));
if (p->sched_class != &fair_sched_class)
return 0;
@@ -7412,7 +7412,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
int tsk_cache_hot;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_held(rq_lockp(env->src_rq));
/*
* We do not migrate tasks that are:
@@ -7490,7 +7490,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
*/
static void detach_task(struct task_struct *p, struct lb_env *env)
{
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_held(rq_lockp(env->src_rq));
p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
@@ -7507,7 +7507,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
{
struct task_struct *p;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_held(rq_lockp(env->src_rq));
list_for_each_entry_reverse(p,
&env->src_rq->cfs_tasks, se.group_node) {
@@ -7543,7 +7543,7 @@ static int detach_tasks(struct lb_env *env)
unsigned long load;
int detached = 0;
- lockdep_assert_held(&env->src_rq->lock);
+ lockdep_assert_held(rq_lockp(env->src_rq));
if (env->imbalance <= 0)
return 0;
@@ -7624,7 +7624,7 @@ static int detach_tasks(struct lb_env *env)
*/
static void attach_task(struct rq *rq, struct task_struct *p)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));
BUG_ON(task_rq(p) != rq);
activate_task(rq, p, ENQUEUE_NOCLOCK);
@@ -9177,7 +9177,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
if (need_active_balance(&env)) {
unsigned long flags;
- raw_spin_lock_irqsave(&busiest->lock, flags);
+ raw_spin_lock_irqsave(rq_lockp(busiest), flags);
/*
* Don't kick the active_load_balance_cpu_stop,
@@ -9185,7 +9185,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
* moved to this_cpu:
*/
if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
- raw_spin_unlock_irqrestore(&busiest->lock,
+ raw_spin_unlock_irqrestore(rq_lockp(busiest),
flags);
env.flags |= LBF_ALL_PINNED;
goto out_one_pinned;
@@ -9201,7 +9201,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
busiest->push_cpu = this_cpu;
active_balance = 1;
}
- raw_spin_unlock_irqrestore(&busiest->lock, flags);
+ raw_spin_unlock_irqrestore(rq_lockp(busiest), flags);
if (active_balance) {
stop_one_cpu_nowait(cpu_of(busiest),
@@ -9940,7 +9940,7 @@ static void nohz_newidle_balance(struct rq *this_rq)
time_before(jiffies, READ_ONCE(nohz.next_blocked)))
return;
- raw_spin_unlock(&this_rq->lock);
+ raw_spin_unlock(rq_lockp(this_rq));
/*
* This CPU is going to be idle and blocked load of idle CPUs
* need to be updated. Run the ilb locally as it is a good
@@ -9949,7 +9949,7 @@ static void nohz_newidle_balance(struct rq *this_rq)
*/
if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
kick_ilb(NOHZ_STATS_KICK);
- raw_spin_lock(&this_rq->lock);
+ raw_spin_lock(rq_lockp(this_rq));
}
#else /* !CONFIG_NO_HZ_COMMON */
@@ -10009,7 +10009,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
goto out;
}
- raw_spin_unlock(&this_rq->lock);
+ raw_spin_unlock(rq_lockp(this_rq));
update_blocked_averages(this_cpu);
rcu_read_lock();
@@ -10050,7 +10050,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
}
rcu_read_unlock();
- raw_spin_lock(&this_rq->lock);
+ raw_spin_lock(rq_lockp(this_rq));
if (curr_cost > this_rq->max_idle_balance_cost)
this_rq->max_idle_balance_cost = curr_cost;
@@ -10486,11 +10486,11 @@ void online_fair_sched_group(struct task_group *tg)
rq = cpu_rq(i);
se = tg->se[i];
- raw_spin_lock_irq(&rq->lock);
+ raw_spin_lock_irq(rq_lockp(rq));
update_rq_clock(rq);
attach_entity_cfs_rq(se);
sync_throttle(tg, i);
- raw_spin_unlock_irq(&rq->lock);
+ raw_spin_unlock_irq(rq_lockp(rq));
}
}
@@ -10513,9 +10513,9 @@ void unregister_fair_sched_group(struct task_group *tg)
rq = cpu_rq(cpu);
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock_irqsave(rq_lockp(rq), flags);
list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock_irqrestore(rq_lockp(rq), flags);
}
}
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index f5516bae0c1b..39788d3a40ec 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -390,10 +390,10 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
static void
dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
{
- raw_spin_unlock_irq(&rq->lock);
+ raw_spin_unlock_irq(rq_lockp(rq));
printk(KERN_ERR "bad: scheduling from the idle thread!\n");
dump_stack();
- raw_spin_lock_irq(&rq->lock);
+ raw_spin_lock_irq(rq_lockp(rq));
}
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 7489d5f56960..dd604947e9f8 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -116,7 +116,7 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq)
static inline u64 rq_clock_pelt(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));
assert_clock_updated(rq);
return rq->clock_pelt - rq->lost_idle_time;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 90fa23d36565..3d9db8c75d53 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -845,7 +845,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (skip)
continue;
- raw_spin_lock(&rq->lock);
+ raw_spin_lock(rq_lockp(rq));
update_rq_clock(rq);
if (rt_rq->rt_time) {
@@ -883,7 +883,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (enqueue)
sched_rt_rq_enqueue(rt_rq);
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(rq_lockp(rq));
}
if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
@@ -2034,9 +2034,9 @@ void rto_push_irq_work_func(struct irq_work *work)
* When it gets updated, a check is made if a push is possible.
*/
if (has_pushable_tasks(rq)) {
- raw_spin_lock(&rq->lock);
+ raw_spin_lock(rq_lockp(rq));
push_rt_tasks(rq);
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(rq_lockp(rq));
}
raw_spin_lock(&rd->rto_lock);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index efa686eeff26..c4cd252dba29 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -806,7 +806,7 @@ extern void rto_push_irq_work_func(struct irq_work *work);
*/
struct rq {
/* runqueue lock: */
- raw_spinlock_t lock;
+ raw_spinlock_t __lock;
/*
* nr_running and cpu_load should be in the same cacheline because
@@ -979,6 +979,10 @@ static inline int cpu_of(struct rq *rq)
#endif
}
+static inline raw_spinlock_t *rq_lockp(struct rq *rq)
+{
+ return &rq->__lock;
+}
#ifdef CONFIG_SCHED_SMT
extern void __update_idle_core(struct rq *rq);
@@ -1046,7 +1050,7 @@ static inline void assert_clock_updated(struct rq *rq)
static inline u64 rq_clock(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));
assert_clock_updated(rq);
return rq->clock;
@@ -1054,7 +1058,7 @@ static inline u64 rq_clock(struct rq *rq)
static inline u64 rq_clock_task(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));
assert_clock_updated(rq);
return rq->clock_task;
@@ -1062,7 +1066,7 @@ static inline u64 rq_clock_task(struct rq *rq)
static inline void rq_clock_skip_update(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));
rq->clock_update_flags |= RQCF_REQ_SKIP;
}
@@ -1072,7 +1076,7 @@ static inline void rq_clock_skip_update(struct rq *rq)
*/
static inline void rq_clock_cancel_skipupdate(struct rq *rq)
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));
rq->clock_update_flags &= ~RQCF_REQ_SKIP;
}
@@ -1091,7 +1095,7 @@ struct rq_flags {
static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
{
- rf->cookie = lockdep_pin_lock(&rq->lock);
+ rf->cookie = lockdep_pin_lock(rq_lockp(rq));
#ifdef CONFIG_SCHED_DEBUG
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
@@ -1106,12 +1110,12 @@ static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
rf->clock_update_flags = RQCF_UPDATED;
#endif
- lockdep_unpin_lock(&rq->lock, rf->cookie);
+ lockdep_unpin_lock(rq_lockp(rq), rf->cookie);
}
static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
{
- lockdep_repin_lock(&rq->lock, rf->cookie);
+ lockdep_repin_lock(rq_lockp(rq), rf->cookie);
#ifdef CONFIG_SCHED_DEBUG
/*
@@ -1132,7 +1136,7 @@ static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(rq_lockp(rq));
}
static inline void
@@ -1141,7 +1145,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
__releases(p->pi_lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(rq_lockp(rq));
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}
@@ -1149,7 +1153,7 @@ static inline void
rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
- raw_spin_lock_irqsave(&rq->lock, rf->flags);
+ raw_spin_lock_irqsave(rq_lockp(rq), rf->flags);
rq_pin_lock(rq, rf);
}
@@ -1157,7 +1161,7 @@ static inline void
rq_lock_irq(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
- raw_spin_lock_irq(&rq->lock);
+ raw_spin_lock_irq(rq_lockp(rq));
rq_pin_lock(rq, rf);
}
@@ -1165,7 +1169,7 @@ static inline void
rq_lock(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
- raw_spin_lock(&rq->lock);
+ raw_spin_lock(rq_lockp(rq));
rq_pin_lock(rq, rf);
}
@@ -1173,7 +1177,7 @@ static inline void
rq_relock(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
- raw_spin_lock(&rq->lock);
+ raw_spin_lock(rq_lockp(rq));
rq_repin_lock(rq, rf);
}
@@ -1182,7 +1186,7 @@ rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
+ raw_spin_unlock_irqrestore(rq_lockp(rq), rf->flags);
}
static inline void
@@ -1190,7 +1194,7 @@ rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock_irq(&rq->lock);
+ raw_spin_unlock_irq(rq_lockp(rq));
}
static inline void
@@ -1198,7 +1202,7 @@ rq_unlock(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(rq_lockp(rq));
}
static inline struct rq *
@@ -1261,7 +1265,7 @@ queue_balance_callback(struct rq *rq,
struct callback_head *head,
void (*func)(struct rq *rq))
{
- lockdep_assert_held(&rq->lock);
+ lockdep_assert_held(rq_lockp(rq));
if (unlikely(head->next))
return;
@@ -1917,7 +1921,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
- raw_spin_unlock(&this_rq->lock);
+ raw_spin_unlock(rq_lockp(this_rq));
double_rq_lock(this_rq, busiest);
return 1;
@@ -1936,20 +1940,22 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
- int ret = 0;
-
- if (unlikely(!raw_spin_trylock(&busiest->lock))) {
- if (busiest < this_rq) {
- raw_spin_unlock(&this_rq->lock);
- raw_spin_lock(&busiest->lock);
- raw_spin_lock_nested(&this_rq->lock,
- SINGLE_DEPTH_NESTING);
- ret = 1;
- } else
- raw_spin_lock_nested(&busiest->lock,
- SINGLE_DEPTH_NESTING);
+ if (rq_lockp(this_rq) == rq_lockp(busiest))
+ return 0;
+
+ if (likely(raw_spin_trylock(rq_lockp(busiest))))
+ return 0;
+
+ if (rq_lockp(busiest) >= rq_lockp(this_rq)) {
+ raw_spin_lock_nested(rq_lockp(busiest), SINGLE_DEPTH_NESTING);
+ return 0;
}
- return ret;
+
+ raw_spin_unlock(rq_lockp(this_rq));
+ raw_spin_lock(rq_lockp(busiest));
+ raw_spin_lock_nested(rq_lockp(this_rq), SINGLE_DEPTH_NESTING);
+
+ return 1;
}
#endif /* CONFIG_PREEMPT */
@@ -1959,20 +1965,16 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
*/
static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
{
- if (unlikely(!irqs_disabled())) {
- /* printk() doesn't work well under rq->lock */
- raw_spin_unlock(&this_rq->lock);
- BUG_ON(1);
- }
-
+ lockdep_assert_irqs_disabled();
return _double_lock_balance(this_rq, busiest);
}
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
__releases(busiest->lock)
{
- raw_spin_unlock(&busiest->lock);
- lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+ if (rq_lockp(this_rq) != rq_lockp(busiest))
+ raw_spin_unlock(rq_lockp(busiest));
+ lock_set_subclass(&rq_lockp(this_rq)->dep_map, 0, _RET_IP_);
}
static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
@@ -2013,16 +2015,16 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
__acquires(rq2->lock)
{
BUG_ON(!irqs_disabled());
- if (rq1 == rq2) {
- raw_spin_lock(&rq1->lock);
+ if (rq_lockp(rq1) == rq_lockp(rq2)) {
+ raw_spin_lock(rq_lockp(rq1));
__acquire(rq2->lock); /* Fake it out ;) */
} else {
- if (rq1 < rq2) {
- raw_spin_lock(&rq1->lock);
- raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
+ if (rq_lockp(rq1) < rq_lockp(rq2)) {
+ raw_spin_lock(rq_lockp(rq1));
+ raw_spin_lock_nested(rq_lockp(rq2), SINGLE_DEPTH_NESTING);
} else {
- raw_spin_lock(&rq2->lock);
- raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
+ raw_spin_lock(rq_lockp(rq2));
+ raw_spin_lock_nested(rq_lockp(rq1), SINGLE_DEPTH_NESTING);
}
}
}
@@ -2037,9 +2039,9 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__releases(rq1->lock)
__releases(rq2->lock)
{
- raw_spin_unlock(&rq1->lock);
- if (rq1 != rq2)
- raw_spin_unlock(&rq2->lock);
+ raw_spin_unlock(rq_lockp(rq1));
+ if (rq_lockp(rq1) != rq_lockp(rq2))
+ raw_spin_unlock(rq_lockp(rq2));
else
__release(rq2->lock);
}
@@ -2062,7 +2064,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
{
BUG_ON(!irqs_disabled());
BUG_ON(rq1 != rq2);
- raw_spin_lock(&rq1->lock);
+ raw_spin_lock(rq_lockp(rq1));
__acquire(rq2->lock); /* Fake it out ;) */
}
@@ -2077,7 +2079,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__releases(rq2->lock)
{
BUG_ON(rq1 != rq2);
- raw_spin_unlock(&rq1->lock);
+ raw_spin_unlock(rq_lockp(rq1));
__release(rq2->lock);
}
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ab7f371a3a17..14b8be81dab2 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -442,7 +442,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
struct root_domain *old_rd = NULL;
unsigned long flags;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock_irqsave(rq_lockp(rq), flags);
if (rq->rd) {
old_rd = rq->rd;
@@ -468,7 +468,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
set_rq_online(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock_irqrestore(rq_lockp(rq), flags);
if (old_rd)
call_rcu(&old_rd->rcu, free_rootdomain);
--
2.17.1
From: Peter Zijlstra (Intel) <[email protected]>
For pick_next_task_fair() it is the newidle balance that requires
dropping the rq->lock; provided we do put_prev_task() early, we can
also detect the condition for doing newidle early.
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/fair.c | 18 ++++++++----------
kernel/sched/sched.h | 4 ++++
2 files changed, 12 insertions(+), 10 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ebad19a033eb..f7e631e692a3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3611,8 +3611,6 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
return cfs_rq->avg.load_avg;
}
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
-
static inline unsigned long task_util(struct task_struct *p)
{
return READ_ONCE(p->se.avg.util_avg);
@@ -7058,11 +7056,10 @@ done: __maybe_unused;
return p;
idle:
- update_misfit_status(NULL, rq);
- new_tasks = idle_balance(rq, rf);
+ new_tasks = newidle_balance(rq, rf);
/*
- * Because idle_balance() releases (and re-acquires) rq->lock, it is
+ * Because newidle_balance() releases (and re-acquires) rq->lock, it is
* possible for any higher priority task to appear. In that case we
* must re-start the pick_next_entity() loop.
*/
@@ -9257,10 +9254,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
ld_moved = 0;
/*
- * idle_balance() disregards balance intervals, so we could repeatedly
- * reach this code, which would lead to balance_interval skyrocketting
- * in a short amount of time. Skip the balance_interval increase logic
- * to avoid that.
+ * newidle_balance() disregards balance intervals, so we could
+ * repeatedly reach this code, which would lead to balance_interval
+ * skyrocketting in a short amount of time. Skip the balance_interval
+ * increase logic to avoid that.
*/
if (env.idle == CPU_NEWLY_IDLE)
goto out;
@@ -9967,7 +9964,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*/
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
+int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
@@ -9975,6 +9972,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
int pulled_task = 0;
u64 curr_cost = 0;
+ update_misfit_status(NULL, this_rq);
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fb01c77c16ff..bfcbcbb25646 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1414,10 +1414,14 @@ static inline void unregister_sched_domain_sysctl(void)
}
#endif
+extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
+
#else
static inline void sched_ttwu_pending(void) { }
+static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; }
+
#endif /* CONFIG_SMP */
#include "stats.h"
--
2.17.1
Hi,
On Tue, Apr 23, 2019 at 04:18:05PM +0000 Vineeth Remanan Pillai wrote:
> Second iteration of the core-scheduling feature.
Thanks for spinning V2 of this.
>
> This version fixes apparent bugs and performance issues in v1. This
> doesn't fully address the issue of core sharing between processes
> with different tags. Core sharing still happens 1% to 5% of the time
> based on the nature of workload and timing of the runnable processes.
>
> Changes in v2
> -------------
> - rebased on mainline commit: 6d906f99817951e2257d577656899da02bb33105
> - Fixes for couple of NULL pointer dereference crashes
> - Subhra Mazumdar
> - Tim Chen
> - Improves priority comparison logic for process in different cpus
> - Peter Zijlstra
> - Aaron Lu
> - Fixes a hard lockup in rq locking
> - Vineeth Pillai
> - Julien Desfossez
> - Fixes a performance issue seen on IO heavy workloads
> - Vineeth Pillai
> - Julien Desfossez
> - Fix for 32bit build
> - Aubrey Li
>
> Issues
> ------
> - Processes with different tags can still share the core
I may have missed something... Could you explain this statement?
This, to me, is the whole point of the patch series. If it's not
doing this then ... what?
Thanks,
Phil
>> - Processes with different tags can still share the core
> I may have missed something... Could you explain this statement?
> This, to me, is the whole point of the patch series. If it's not
> doing this then ... what?
What I meant was that the patch needs some more work to be accurate.
There are some race conditions where the core violation can still
happen. In our testing, we saw the core being shared with incompatible
processes around 1 to 5% of the time. One example of this happening
is as follows (let cpu 0 and 1 be siblings):
- cpu 0 selects a process with a cookie
- cpu 1 selects a higher priority process without a cookie
- Selection restarts for cpu 0 and it might select a process with a
  cookie but with lower priority.
- Since it is lower priority, the logic in pick_next_task doesn't
  compare the cookie again (it trusts pick_task) and proceeds.
This is one of the scenarios that we saw from traces, but there might
be other race conditions as well. The fix seems a little involved and
we are working on that; a rough sketch of the scenario is below.
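To make the sequence easier to follow, here is a purely illustrative
userspace sketch of that scenario. The task names, priorities and
cookie values are made-up assumptions, and this is not the kernel code
path itself; it only mirrors the order of decisions described above.

/*
 * Illustrative sketch only: shows how a lower priority tagged task can
 * end up running next to a higher priority untagged 'max' without its
 * cookie being compared again.
 */
#include <stdio.h>

struct task {
        const char *name;
        int prio;               /* larger == higher priority */
        unsigned long cookie;   /* 0 == untagged */
};

static struct task *higher_prio(struct task *a, struct task *b)
{
        return (b->prio > a->prio) ? b : a;
}

int main(void)
{
        struct task tag_hi = { "tagged-high", 5, 0xbeef };
        struct task tag_lo = { "tagged-low",  3, 0xbeef };
        struct task untag  = { "untagged",    9, 0 };

        /* cpu 0 initially picks the tagged task ... */
        struct task *max = &tag_hi;
        /* ... cpu 1 picks a higher priority untagged task, which becomes 'max' */
        max = higher_prio(max, &untag);

        /*
         * Selection restarts for cpu 0 and returns a tagged but lower
         * priority task; since it does not beat 'max' on priority, its
         * cookie is never compared against max->cookie again.
         */
        struct task *cpu0_pick = &tag_lo;

        printf("core runs %s (cookie %#lx) next to %s (cookie %#lx)\n",
               max->name, max->cookie, cpu0_pick->name, cpu0_pick->cookie);
        return 0;
}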
Thanks
On Wed, Apr 24, 2019 at 12:18 AM Vineeth Remanan Pillai
<[email protected]> wrote:
>
> Second iteration of the core-scheduling feature.
>
> This version fixes apparent bugs and performance issues in v1. This
> doesn't fully address the issue of core sharing between processes
> with different tags. Core sharing still happens 1% to 5% of the time
> based on the nature of workload and timing of the runnable processes.
>
> Changes in v2
> -------------
> - rebased on mainline commit: 6d906f99817951e2257d577656899da02bb33105
> - Fixes for couple of NULL pointer dereference crashes
> - Subhra Mazumdar
> - Tim Chen
Is this one missed? Or fixed with a better impl?
The boot-up CPUs don't match the possible cpu map, so rq->core is not
initialized for the CPUs that were never onlined, which causes a NULL
pointer dereference panic in online_fair_sched_group():
Thanks,
-Aubrey
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 85c728d..bdabf20 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10492,6 +10492,10 @@ void online_fair_sched_group(struct task_group *tg)
rq = cpu_rq(i);
se = tg->se[i];
+#ifdef CONFIG_SCHED_CORE
+ if (!rq->core)
+ continue;
+#endif
raw_spin_lock_irq(rq_lockp(rq));
update_rq_clock(rq);
attach_entity_cfs_rq(se);
On Wed, Apr 24, 2019 at 12:18 AM Vineeth Remanan Pillai
<[email protected]> wrote:
>
> From: Peter Zijlstra (Intel) <[email protected]>
>
> When a sibling is forced-idle to match the core-cookie; search for
> matching tasks to fill the core.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> include/linux/sched.h | 1 +
> kernel/sched/core.c | 131 +++++++++++++++++++++++++++++++++++++++++-
> kernel/sched/idle.c | 1 +
> kernel/sched/sched.h | 6 ++
> 4 files changed, 138 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index a4b39a28236f..1a309e8546cd 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -641,6 +641,7 @@ struct task_struct {
> #ifdef CONFIG_SCHED_CORE
> struct rb_node core_node;
> unsigned long core_cookie;
> + unsigned int core_occupation;
> #endif
>
> #ifdef CONFIG_CGROUP_SCHED
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 9e6e90c6f9b9..e8f5ec641d0a 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -217,6 +217,21 @@ struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
> return match;
> }
>
> +struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
> +{
> + struct rb_node *node = &p->core_node;
> +
> + node = rb_next(node);
> + if (!node)
> + return NULL;
> +
> + p = container_of(node, struct task_struct, core_node);
> + if (p->core_cookie != cookie)
> + return NULL;
> +
> + return p;
> +}
> +
> /*
> * The static-key + stop-machine variable are needed such that:
> *
> @@ -3672,7 +3687,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> struct task_struct *next, *max = NULL;
> const struct sched_class *class;
> const struct cpumask *smt_mask;
> - int i, j, cpu;
> + int i, j, cpu, occ = 0;
>
> if (!sched_core_enabled(rq))
> return __pick_next_task(rq, prev, rf);
> @@ -3763,6 +3778,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> goto done;
> }
>
> + if (!is_idle_task(p))
> + occ++;
> +
> rq_i->core_pick = p;
>
> /*
> @@ -3786,6 +3804,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>
> cpu_rq(j)->core_pick = NULL;
> }
> + occ = 1;
> goto again;
> }
> }
> @@ -3808,6 +3827,8 @@ next_class:;
>
> WARN_ON_ONCE(!rq_i->core_pick);
>
> + rq_i->core_pick->core_occupation = occ;
> +
> if (i == cpu)
> continue;
>
> @@ -3823,6 +3844,114 @@ next_class:;
> return next;
> }
>
> +static bool try_steal_cookie(int this, int that)
> +{
> + struct rq *dst = cpu_rq(this), *src = cpu_rq(that);
> + struct task_struct *p;
> + unsigned long cookie;
> + bool success = false;
> +
try_steal_cookie() is called inside the for_each_cpu_wrap() loop.
The root domain could be large, so we should avoid stealing a cookie
if the source rq has only one task or the destination is really busy.
The following patch eliminated a deadlock issue on my side, if I didn't
miss anything in v1. I'll double check with v2, but it at least avoids
unnecessary irq off/on and the double rq lock. In particular, it avoids
lock contention with an idle cpu that is holding its rq lock in the
middle of load_balance() while this code tries to take the same lock
here. I think it might be worth picking up.
Thanks,
-Aubrey
---
kernel/sched/core.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 191ebf9..973a75d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3876,6 +3876,13 @@ static bool try_steal_cookie(int this, int that)
unsigned long cookie;
bool success = false;
+ /*
+ * Don't steal if src is idle or has only one runnable task,
+ * or dst has more than one runnable task
+ */
+ if (src->nr_running <= 1 || unlikely(dst->nr_running >= 1))
+ return false;
+
local_irq_disable();
double_rq_lock(dst, src);
--
2.7.4
> + local_irq_disable();
> + double_rq_lock(dst, src);
> +
> + cookie = dst->core->core_cookie;
> + if (!cookie)
> + goto unlock;
> +
> + if (dst->curr != dst->idle)
> + goto unlock;
> +
> + p = sched_core_find(src, cookie);
> + if (p == src->idle)
> + goto unlock;
> +
> + do {
> + if (p == src->core_pick || p == src->curr)
> + goto next;
> +
> + if (!cpumask_test_cpu(this, &p->cpus_allowed))
> + goto next;
> +
> + if (p->core_occupation > dst->idle->core_occupation)
> + goto next;
> +
> + p->on_rq = TASK_ON_RQ_MIGRATING;
> + deactivate_task(src, p, 0);
> + set_task_cpu(p, this);
> + activate_task(dst, p, 0);
> + p->on_rq = TASK_ON_RQ_QUEUED;
> +
> + resched_curr(dst);
> +
> + success = true;
> + break;
> +
> +next:
> + p = sched_core_next(p, cookie);
> + } while (p);
> +
> +unlock:
> + double_rq_unlock(dst, src);
> + local_irq_enable();
> +
> + return success;
> +}
> +
> +static bool steal_cookie_task(int cpu, struct sched_domain *sd)
> +{
> + int i;
> +
> + for_each_cpu_wrap(i, sched_domain_span(sd), cpu) {
> + if (i == cpu)
> + continue;
> +
> + if (need_resched())
> + break;
> +
> + if (try_steal_cookie(cpu, i))
> + return true;
> + }
> +
> + return false;
> +}
> +
> +static void sched_core_balance(struct rq *rq)
> +{
> + struct sched_domain *sd;
> + int cpu = cpu_of(rq);
> +
> + rcu_read_lock();
> + raw_spin_unlock_irq(rq_lockp(rq));
> + for_each_domain(cpu, sd) {
> + if (!(sd->flags & SD_LOAD_BALANCE))
> + break;
> +
> + if (need_resched())
> + break;
> +
> + if (steal_cookie_task(cpu, sd))
> + break;
> + }
> + raw_spin_lock_irq(rq_lockp(rq));
> + rcu_read_unlock();
> +}
> +
> +static DEFINE_PER_CPU(struct callback_head, core_balance_head);
> +
> +void queue_core_balance(struct rq *rq)
> +{
> + if (!sched_core_enabled(rq))
> + return;
> +
> + if (!rq->core->core_cookie)
> + return;
> +
> + if (!rq->nr_running) /* not forced idle */
> + return;
> +
> + queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
> +}
> +
> #else /* !CONFIG_SCHED_CORE */
>
> static struct task_struct *
> diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
> index e7f38da60373..44decdcccba1 100644
> --- a/kernel/sched/idle.c
> +++ b/kernel/sched/idle.c
> @@ -387,6 +387,7 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next)
> {
> update_idle_core(rq);
> schedstat_inc(rq->sched_goidle);
> + queue_core_balance(rq);
> }
>
> static struct task_struct *
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 4cfde289610d..2a5f5a6b11ae 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1013,6 +1013,8 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq)
> return &rq->__lock;
> }
>
> +extern void queue_core_balance(struct rq *rq);
> +
> #else /* !CONFIG_SCHED_CORE */
>
> static inline bool sched_core_enabled(struct rq *rq)
> @@ -1025,6 +1027,10 @@ static inline raw_spinlock_t *rq_lockp(struct rq *rq)
> return &rq->__lock;
> }
>
> +static inline void queue_core_balance(struct rq *rq)
> +{
> +}
> +
> #endif /* CONFIG_SCHED_CORE */
>
> #ifdef CONFIG_SCHED_SMT
> --
> 2.17.1
>
On 4/23/19 9:18 AM, Vineeth Remanan Pillai wrote:
> +/* real prio, less is less */
> +static inline bool __prio_less(struct task_struct *a, struct task_struct *b, bool core_cmp)
> +{
> + u64 vruntime;
> +
> + int pa = __task_prio(a), pb = __task_prio(b);
> +
> + if (-pa < -pb)
> + return true;
> +
> + if (-pb < -pa)
> + return false;
> +
> + if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
> + return !dl_time_before(a->dl.deadline, b->dl.deadline);
> +
> + vruntime = b->se.vruntime;
> + if (core_cmp) {
> + vruntime -= task_cfs_rq(b)->min_vruntime;
> + vruntime += task_cfs_rq(a)->min_vruntime;
> + }
> + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
> + return !((s64)(a->se.vruntime - vruntime) <= 0);
> +
> + return false;
> +}
> +
> +static inline bool cpu_prio_less(struct task_struct *a, struct task_struct *b)
> +{
> + return __prio_less(a, b, false);
> +}
> +
> +static inline bool core_prio_less(struct task_struct *a, struct task_struct *b)
> +{
> + return __prio_less(a, b, true);
> +}
> +
> +static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)
> +{
> + if (a->core_cookie < b->core_cookie)
> + return true;
> +
> + if (a->core_cookie > b->core_cookie)
> + return false;
> +
> + /* flip prio, so high prio is leftmost */
> + if (cpu_prio_less(b, a))
> + return true;
> +
> + return false;
> +}
> +
A minor nitpick: I find keeping the vruntime base readjustment in
core_prio_less to be more straightforward than passing a core_cmp
bool around.
Tim
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 455e7ecc2f48..5917fb85669b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -100,15 +87,13 @@ static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
*/
/* real prio, less is less */
-static inline bool __prio_less(struct task_struct *a, struct task_struct *b, bool core_cmp)
+static inline bool __prio_less(struct task_struct *a, struct task_struct *b, u64 vruntime)
{
- u64 vruntime;
-
int pa = __task_prio(a), pb = __task_prio(b);
trace_printk("(%s/%d;%d,%Lu,%Lu) ?< (%s/%d;%d,%Lu,%Lu)\n",
- a->comm, a->pid, pa, a->se.vruntime, a->dl.deadline,
- b->comm, b->pid, pa, b->se.vruntime, b->dl.deadline);
+ a->comm, a->pid, pa, a->se.vruntime, a->dl.deadline,
+ b->comm, b->pid, pa, b->se.vruntime, b->dl.deadline);
if (-pa < -pb)
return true;
@@ -119,11 +104,6 @@ static inline bool __prio_less(struct task_struct *a, struct task_struct *b, boo
if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
return !dl_time_before(a->dl.deadline, b->dl.deadline);
- vruntime = b->se.vruntime;
- if (core_cmp) {
- vruntime -= task_cfs_rq(b)->min_vruntime;
- vruntime += task_cfs_rq(a)->min_vruntime;
- }
if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
return !((s64)(a->se.vruntime - vruntime) <= 0);
@@ -132,12 +112,17 @@ static inline bool __prio_less(struct task_struct *a, struct task_struct *b, boo
static inline bool cpu_prio_less(struct task_struct *a, struct task_struct *b)
{
- return __prio_less(a, b, false);
+ return __prio_less(a, b, b->se.vruntime);
}
static inline bool core_prio_less(struct task_struct *a, struct task_struct *b)
{
- return __prio_less(a, b, true);
+ u64 vruntime = b->se.vruntime;
+
+ vruntime -= task_cfs_rq(b)->min_vruntime;
+ vruntime += task_cfs_rq(a)->min_vruntime;
+
+ return __prio_less(a, b, vruntime);
}
static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)
> +
> +void sched_core_enqueue(struct rq *rq, struct task_struct *p)
> +{
...
> +}
> +
> +void sched_core_dequeue(struct rq *rq, struct task_struct *p)
> +{
...
> +}
> +
> +/*
> + * Find left-most (aka, highest priority) task matching @cookie.
> + */
> +struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
> +{
...
The sched_core_* functions are used only in the core.c
they are declared in. We can convert them to static functions.
Thanks.
Tim
---
kernel/sched/core.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 14e766d0df99..455e7ecc2f48 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -155,7 +155,7 @@ static inline bool __sched_core_less(struct task_struct *a, struct task_struct *
return false;
}
-void sched_core_enqueue(struct rq *rq, struct task_struct *p)
+static void sched_core_enqueue(struct rq *rq, struct task_struct *p)
{
struct rb_node *parent, **node;
struct task_struct *node_task;
@@ -182,7 +182,7 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
rb_insert_color(&p->core_node, &rq->core_tree);
}
-void sched_core_dequeue(struct rq *rq, struct task_struct *p)
+static void sched_core_dequeue(struct rq *rq, struct task_struct *p)
{
rq->core->core_task_seq++;
@@ -195,7 +195,7 @@ void sched_core_dequeue(struct rq *rq, struct task_struct *p)
/*
* Find left-most (aka, highest priority) task matching @cookie.
*/
-struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
+static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
{
struct rb_node *node = rq->core_tree.rb_node;
struct task_struct *node_task, *match;
@@ -221,7 +221,7 @@ struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
return match;
}
-struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
+static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
{
struct rb_node *node = &p->core_node;
@@ -282,7 +282,7 @@ static void __sched_core_disable(void)
printk("core sched disabled\n");
}
-void sched_core_get(void)
+static void sched_core_get(void)
{
mutex_lock(&sched_core_mutex);
if (!sched_core_count++)
@@ -290,7 +290,7 @@ void sched_core_get(void)
mutex_unlock(&sched_core_mutex);
}
-void sched_core_put(void)
+static void sched_core_put(void)
{
mutex_lock(&sched_core_mutex);
if (!--sched_core_count)
--
2.20.1
On Wed, Apr 24, 2019 at 12:18 AM Vineeth Remanan Pillai
<[email protected]> wrote:
>
> Second iteration of the core-scheduling feature.
>
> This version fixes apparent bugs and performance issues in v1. This
> doesn't fully address the issue of core sharing between processes
> with different tags. Core sharing still happens 1% to 5% of the time
> based on the nature of workload and timing of the runnable processes.
>
> Changes in v2
> -------------
> - rebased on mainline commit: 6d906f99817951e2257d577656899da02bb33105
Thanks for posting v2. Based on this version, here are my benchmark results.
Environment setup
--------------------------
Skylake server, 2 numa nodes, 104 CPUs (HT on)
cgroup1 workload, sysbench (CPU intensive non AVX workload)
cgroup2 workload, gemmbench (AVX512 workload)
Case 1: task number < CPU num
--------------------------------------------
36 sysbench threads in cgroup1
36 gemmbench threads in cgroup2
core sched off:
- sysbench 95th percentile latency(ms): avg = 4.952, stddev = 0.55342
core sched on:
- sysbench 95th percentile latency(ms): avg = 3.549, stddev = 0.04449
Due to core cookie matching, sysbench tasks won't be affected by AVX512
tasks; latency improves by ~28%!!!
Case 2: task number > CPU number
-------------------------------------------------
72 sysbench threads in cgroup1
72 gemmbench threads in cgroup2
core sched off:
- sysbench 95th percentile latency(ms): avg = 11.914, stddev = 3.259
core sched on:
- sysbench 95th percentile latency(ms): avg = 13.289, stddev = 4.863
So it is no longer only about power: security and performance are now also
in conflict. Due to core cookie mismatches and the forced idle they
introduce, latency has a ~12% regression.
Any comments?
Thanks,
-Aubrey
> try_steal_cookie() is in the loop of for_each_cpu_wrap().
> The root domain could be large and we should avoid
> stealing cookie if source rq has only one task or dst is really busy.
>
> The following patch eliminated a deadlock issue on my side if I didn't
> miss anything in v1. I'll double check with v2, but it at least avoids
> unnecessary irq off/on and double rq lock. Especially, it avoids lock
> contention that the idle cpu which is holding rq lock in the progress
> of load_balance() and tries to lock rq here. I think it might be worth to
> be picked up.
>
The dst->nr_running is actually checked in queue_core_balance with the
lock held. Also, try_steal_cookie checks whether dst is running idle, but
under the lock. Checking whether src is empty makes sense, but shouldn't
that be done under the rq lock as well? A couple of safety and performance
checks are done before calling try_steal_cookie, so I hope the double lock
would not cause a major performance issue.
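If the unlocked nr_running test really helps on your setup, maybe we
could keep it purely as a cheap hint and re-validate src once the locks
are held. Something like the below (untested sketch on top of the
try_steal_cookie() above; the dst side is already covered by the
dst->curr != dst->idle check under the lock):

static bool try_steal_cookie(int this, int that)
{
	struct rq *dst = cpu_rq(this), *src = cpu_rq(that);
	struct task_struct *p;
	unsigned long cookie;
	bool success = false;

	/* Unlocked, racy hint: skip the lock dance if src obviously has nothing to give. */
	if (src->nr_running <= 1)
		return false;

	local_irq_disable();
	double_rq_lock(dst, src);

	/* Re-check under the lock; the test above was only a hint. */
	if (src->nr_running <= 1)
		goto unlock;

	cookie = dst->core->core_cookie;
	if (!cookie)
		goto unlock;

	/* ... rest unchanged ... */

unlock:
	double_rq_unlock(dst, src);
	local_irq_enable();

	return success;
}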
If the hard lockup is reproducible with v2, could you please share more
details about the lockup?
Thanks
On 24-Apr-2019 09:13:10 PM, Aubrey Li wrote:
> On Wed, Apr 24, 2019 at 12:18 AM Vineeth Remanan Pillai
> <[email protected]> wrote:
> >
> > Second iteration of the core-scheduling feature.
> >
> > This version fixes apparent bugs and performance issues in v1. This
> > doesn't fully address the issue of core sharing between processes
> > with different tags. Core sharing still happens 1% to 5% of the time
> > based on the nature of workload and timing of the runnable processes.
> >
> > Changes in v2
> > -------------
> > - rebased on mainline commit: 6d906f99817951e2257d577656899da02bb33105
>
> Thanks to post v2, based on this version, here is my benchmarks result.
>
> Environment setup
> --------------------------
> Skylake server, 2 numa nodes, 104 CPUs (HT on)
> cgroup1 workload, sysbench (CPU intensive non AVX workload)
> cgroup2 workload, gemmbench (AVX512 workload)
>
> Case 1: task number < CPU num
> --------------------------------------------
> 36 sysbench threads in cgroup1
> 36 gemmbench threads in cgroup2
>
> core sched off:
> - sysbench 95th percentile latency(ms): avg = 4.952, stddev = 0.55342
> core sched on:
> - sysbench 95th percentile latency(ms): avg = 3.549, stddev = 0.04449
>
> Due to core cookie matching, sysbench tasks won't be affect by AVX512
> tasks, latency has ~28% improvement!!!
>
> Case 2: task number > CPU number
> -------------------------------------------------
> 72 sysbench threads in cgroup1
> 72 gemmbench threads in cgroup2
>
> core sched off:
> - sysbench 95th percentile latency(ms): avg = 11.914, stddev = 3.259
> core sched on:
> - sysbench 95th percentile latency(ms): avg = 13.289, stddev = 4.863
>
> So not only power, now security and performance is a pair of contradictions.
> Due to core cookie not matching and forced idle introduced, latency has ~12%
> regression.
>
> Any comments?
Would it be possible to post the results with HT off as well ?
Thanks,
Julien
> Is this one missed? Or fixed with a better impl?
>
> The boot-up CPUs don't match the possible cpu map, so rq->core is not
> initialized for CPUs that were never onlined, which causes a NULL pointer
> dereference panic in online_fair_sched_group():
>
Thanks for pointing this out. I think the ideal fix would be to
correctly initialize/clean up the coresched attributes in the cpu
hotplug code path so that the lock can be taken successfully if the
sibling is offlined/onlined after coresched was enabled. We are
working on another bug related to the hotplug path and shall introduce
the fix in v3.
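The rough direction we are looking at is something like the below
(purely a sketch; the hook and the helper name are hypothetical until
v3 is ready):

/* Hypothetical helper, called from the CPU online path. */
static void sched_core_cpu_starting(unsigned int cpu)
{
	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
	struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
	int i;

	/* If an already-initialized sibling exists, share its core rq ... */
	for_each_cpu(i, smt_mask) {
		if (i != cpu && cpu_rq(i)->core) {
			core_rq = cpu_rq(i)->core;
			break;
		}
	}

	/* ... otherwise this rq is its own core rq. */
	rq->core = core_rq ? core_rq : rq;
}

The offline path would need to do the reverse and re-point the
remaining siblings if their current core rq goes away.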
Thanks
> A minor nitpick. I find keeping the vruntime base readjustment in
> core_prio_less probably is more straight forward rather than pass a
> core_cmp bool around.
The reason I moved the vruntime base adjustment to __prio_less is
that the vruntime handling seemed alien to __prio_less when looked at
as a standalone function.
I do not have a strong opinion either way. Probably a better approach
would be to replace both cpu_prio_less/core_prio_less with a single
prio_less() which takes a third argument, 'bool on_same_rq'?
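i.e. something like the below (just a sketch, untested):

/* real prio, less is less */
static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool on_same_rq)
{
	int pa = __task_prio(a), pb = __task_prio(b);
	u64 vruntime = b->se.vruntime;

	if (-pa < -pb)
		return true;

	if (-pb < -pa)
		return false;

	if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
		return !dl_time_before(a->dl.deadline, b->dl.deadline);

	if (!on_same_rq) {
		/* Tasks are on different rqs; normalize b's vruntime before comparing. */
		vruntime -= task_cfs_rq(b)->min_vruntime;
		vruntime += task_cfs_rq(a)->min_vruntime;
	}

	if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
		return !((s64)(a->se.vruntime - vruntime) <= 0);

	return false;
}

The rq-local comparison would then call prio_less(a, b, true) and the
core-wide one prio_less(a, b, false).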
Thanks
> The sched_core_* functions are used only in the core.c
> they are declared in. We can convert them to static functions.
Thanks for pointing this out, will accommodate this in v3.
Thanks
On Wed, Apr 24, 2019 at 10:00 PM Julien Desfossez
<[email protected]> wrote:
>
> Would it be possible to post the results with HT off as well ?
What's the point of turning HT off here? The latency is sensitive to the
relationship between the number of tasks and the number of CPUs. Usually,
fewer CPUs mean more run queue wait time and a worse result.
Thanks,
-Aubrey
On 4/24/19 1:43 PM, Vineeth Remanan Pillai wrote:
>> A minor nitpick. I find keeping the vruntime base readjustment in
>> core_prio_less probably is more straight forward rather than pass a
>> core_cmp bool around.
>
> The reason I moved the vruntime base adjustment to __prio_less is
> because, the vruntime seemed alien to __prio_less when looked as
> a standalone function.
Doing the adjustment in core_prio_less
will save us an extra "if" comparison. I'm fine either way.
Thx.
Tim
>
> I do not have a strong opinion on both. Probably a better approach
> would be to replace both cpu_prio_less/core_prio_less with prio_less
> which takes the third arguement 'bool on_same_rq'?
>
> Thanks
>
* Aubrey Li <[email protected]> wrote:
> On Wed, Apr 24, 2019 at 10:00 PM Julien Desfossez
> <[email protected]> wrote:
> >
> > Would it be possible to post the results with HT off as well ?
>
> What's the point here to turn HT off? The latency is sensitive to the
> relationship
> between the task number and CPU number. Usually less CPU number, more run
> queue wait time, and worse result.
HT-off numbers are mandatory: turning HT off is by far the simplest way
to solve the security bugs in these CPUs.
Any core-scheduling solution *must* perform better than HT-off for all
relevant workloads, otherwise what's the point?
Thanks,
Ingo
On Wed, Apr 24, 2019 at 08:43:36PM +0000 Vineeth Remanan Pillai wrote:
> > A minor nitpick. I find keeping the vruntime base readjustment in
> > core_prio_less probably is more straight forward rather than pass a
> > core_cmp bool around.
>
> The reason I moved the vruntime base adjustment to __prio_less is
> because, the vruntime seemed alien to __prio_less when looked as
> a standalone function.
>
> I do not have a strong opinion on both. Probably a better approach
> would be to replace both cpu_prio_less/core_prio_less with prio_less
> which takes the third arguement 'bool on_same_rq'?
>
Fwiw, I find the two names easier to read than a boolean flag. Could still
be wrapped to a single implementation I suppose.
An enum to control cpu or core would be more readable, but probably overkill...
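Something like the below is what I had in mind, building on Tim's
version where __prio_less() already takes the vruntime (only to
illustrate, not tested):

enum prio_cmp_scope {
	PRIO_CMP_CPU,	/* both tasks are on the same rq */
	PRIO_CMP_CORE,	/* tasks are on sibling rqs, normalize vruntime */
};

static inline bool prio_less(struct task_struct *a, struct task_struct *b,
			     enum prio_cmp_scope scope)
{
	u64 vruntime = b->se.vruntime;

	if (scope == PRIO_CMP_CORE) {
		vruntime -= task_cfs_rq(b)->min_vruntime;
		vruntime += task_cfs_rq(a)->min_vruntime;
	}

	return __prio_less(a, b, vruntime);
}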
Cheers,
Phil
> Thanks
--
On Thu, Apr 25, 2019 at 11:55:08AM +0200, Ingo Molnar wrote:
> > > Would it be possible to post the results with HT off as well ?
> >
> > What's the point here to turn HT off? The latency is sensitive to the
> > relationship
> > between the task number and CPU number. Usually less CPU number, more run
> > queue wait time, and worse result.
>
> HT-off numbers are mandatory: turning HT off is by far the simplest way
> to solve the security bugs in these CPUs.
>
> Any core-scheduling solution *must* perform better than HT-off for all
> relevant workloads, otherwise what's the point?
>
I agree. Not only should HT-off be evaluated, it should be evaluated
properly at different levels of machine utilisation to get a complete
picture.
Around the same time this was first posted and because of kernel
warnings from L1TF, I did a preliminary evaluation of HT On vs HT Off
using nosmt -- this is sub-optimal in itself but it was convenient. The
conventional wisdom that HT gets a 30% boost appears to be primarily based
on academic papers evaluating HPC workloads on a Pentium 4 with a focus
on embarrassingly parallel problems, which is the ideal case for HT but not
the universal case. The conventional wisdom is questionable at best. The
only modern comparisons I could find were focused on games primarily
which I think hit scaling limits before HT is a factor in some cases.
I don't have the data in a format that can present everything clearly,
but here is an attempt anyway. This is long, but the central point is
that when a machine is lightly loaded, HT Off generally performs
better than HT On, and even when heavily utilised it's still not a
guaranteed loss. I only suggest reading further if you have coffee
and time. Ideally all this would be updated with a comparison to core
scheduling but I may not get it queued on my test grid before I leave
for LSF/MM and besides, the authors pushing this feature should be able
to provide supporting data justifying the complexity of the series.
Here is a tbench comparison scaling from a low thread count to a high
thread count. I picked tbench because it's relatively uncomplicated and
tends to be reasonable at spotting scheduler regressions. The kernel
version is old but for the purposes of this discussion, it doesn't matter
1-socket Skylake (8 logical CPUs HT On, 4 logical CPUs HT Off)
smt nosmt
Hmean 1 484.00 ( 0.00%) 519.95 * 7.43%*
Hmean 2 925.02 ( 0.00%) 1022.28 * 10.51%*
Hmean 4 1730.34 ( 0.00%) 2029.81 * 17.31%*
Hmean 8 2883.57 ( 0.00%) 2040.89 * -29.22%*
Hmean 16 2830.61 ( 0.00%) 2039.74 * -27.94%*
Hmean 32 2855.54 ( 0.00%) 2042.70 * -28.47%*
Stddev 1 1.16 ( 0.00%) 0.62 ( 46.43%)
Stddev 2 1.31 ( 0.00%) 1.00 ( 23.32%)
Stddev 4 4.89 ( 0.00%) 12.86 (-163.14%)
Stddev 8 4.30 ( 0.00%) 2.53 ( 40.99%)
Stddev 16 3.38 ( 0.00%) 5.92 ( -75.08%)
Stddev 32 5.47 ( 0.00%) 14.28 (-160.77%)
Note that disabling HT performs better when cores are available but hits
scaling limits past 4 CPUs when the machine is saturated with HT off.
It's similar with 2 sockets
2-socket Broadwell (80 logical CPUs HT On, 40 logical CPUs HT Off)
smt nosmt
Hmean 1 514.28 ( 0.00%) 540.90 * 5.18%*
Hmean 2 982.19 ( 0.00%) 1042.98 * 6.19%*
Hmean 4 1820.02 ( 0.00%) 1943.38 * 6.78%*
Hmean 8 3356.73 ( 0.00%) 3655.92 * 8.91%*
Hmean 16 6240.53 ( 0.00%) 7057.57 * 13.09%*
Hmean 32 10584.60 ( 0.00%) 15934.82 * 50.55%*
Hmean 64 24967.92 ( 0.00%) 21103.79 * -15.48%*
Hmean 128 27106.28 ( 0.00%) 20822.46 * -23.18%*
Hmean 256 28345.15 ( 0.00%) 21625.67 * -23.71%*
Hmean 320 28358.54 ( 0.00%) 21768.70 * -23.24%*
Stddev 1 2.10 ( 0.00%) 3.44 ( -63.59%)
Stddev 2 2.46 ( 0.00%) 4.83 ( -95.91%)
Stddev 4 7.57 ( 0.00%) 6.14 ( 18.86%)
Stddev 8 6.53 ( 0.00%) 11.80 ( -80.79%)
Stddev 16 11.23 ( 0.00%) 16.03 ( -42.74%)
Stddev 32 18.99 ( 0.00%) 22.04 ( -16.10%)
Stddev 64 10.86 ( 0.00%) 14.31 ( -31.71%)
Stddev 128 25.10 ( 0.00%) 16.08 ( 35.93%)
Stddev 256 29.95 ( 0.00%) 71.39 (-138.36%)
Same -- performance is better until the machine gets saturated and
disabling HT hits scaling limits earlier.
The workload "mutilate" is a load generator for memcached that is meant
to simulate a workload interesting to Facebook.
1-socket
Hmean 1 28570.67 ( 0.00%) 31632.92 * 10.72%*
Hmean 3 76904.93 ( 0.00%) 89644.73 * 16.57%*
Hmean 5 107487.40 ( 0.00%) 93418.09 * -13.09%*
Hmean 7 103066.62 ( 0.00%) 79843.72 * -22.53%*
Hmean 8 103921.65 ( 0.00%) 76378.18 * -26.50%*
Stddev 1 112.37 ( 0.00%) 261.61 (-132.82%)
Stddev 3 272.29 ( 0.00%) 641.41 (-135.56%)
Stddev 5 406.75 ( 0.00%) 1240.15 (-204.89%)
Stddev 7 2402.02 ( 0.00%) 1336.68 ( 44.35%)
Stddev 8 1139.90 ( 0.00%) 393.56 ( 65.47%)
2-socket
Hmean 1 24571.95 ( 0.00%) 24891.45 ( 1.30%)
Hmean 4 106963.43 ( 0.00%) 103955.79 ( -2.81%)
Hmean 7 154328.47 ( 0.00%) 169782.56 * 10.01%*
Hmean 12 235108.36 ( 0.00%) 236544.96 ( 0.61%)
Hmean 21 238619.16 ( 0.00%) 234542.88 * -1.71%*
Hmean 30 240198.02 ( 0.00%) 237758.38 ( -1.02%)
Hmean 48 212573.72 ( 0.00%) 172633.74 * -18.79%*
Hmean 79 140937.97 ( 0.00%) 112915.07 * -19.88%*
Hmean 80 134204.84 ( 0.00%) 116904.93 ( -12.89%)
Stddev 1 40.95 ( 0.00%) 284.57 (-594.84%)
Stddev 4 7556.84 ( 0.00%) 2176.60 ( 71.20%)
Stddev 7 10279.89 ( 0.00%) 3510.15 ( 65.85%)
Stddev 12 2534.03 ( 0.00%) 1513.61 ( 40.27%)
Stddev 21 1118.59 ( 0.00%) 1662.31 ( -48.61%)
Stddev 30 3540.20 ( 0.00%) 2056.37 ( 41.91%)
Stddev 48 24206.00 ( 0.00%) 6247.74 ( 74.19%)
Stddev 79 21650.80 ( 0.00%) 5395.35 ( 75.08%)
Stddev 80 26769.15 ( 0.00%) 5665.14 ( 78.84%)
Less clear-cut. Performance is better with HT off on Skylake but similar
until the machine is saturated on Broadwell.
With pgbench running a read-only workload we see
2-socket
Hmean 1 13226.78 ( 0.00%) 14971.99 * 13.19%*
Hmean 6 39820.61 ( 0.00%) 35036.50 * -12.01%*
Hmean 12 66707.55 ( 0.00%) 61403.63 * -7.95%*
Hmean 22 108748.16 ( 0.00%) 110223.97 * 1.36%*
Hmean 30 121964.05 ( 0.00%) 121837.03 ( -0.10%)
Hmean 48 121530.97 ( 0.00%) 117855.86 * -3.02%*
Hmean 80 116034.43 ( 0.00%) 121826.25 * 4.99%*
Hmean 110 125441.59 ( 0.00%) 122180.19 * -2.60%*
Hmean 142 117908.18 ( 0.00%) 117531.41 ( -0.32%)
Hmean 160 119343.50 ( 0.00%) 115725.11 * -3.03%*
Mix of results -- single client is better, 6 and 12 clients regressed for
some reason and after that, it's mostly flat. Hence, HT for this database
load makes very little difference because the performance limits are not
based on CPUs being immediately available.
SpecJBB 2005 is ancient but it does lend itself to easily scaling the
number of active tasks so here is a sample of the performance as
utilisation ramped up to saturation
2-socket
Hmean tput-1 48655.00 ( 0.00%) 48762.00 * 0.22%*
Hmean tput-8 387341.00 ( 0.00%) 390062.00 * 0.70%*
Hmean tput-15 660993.00 ( 0.00%) 659832.00 * -0.18%*
Hmean tput-22 916898.00 ( 0.00%) 913570.00 * -0.36%*
Hmean tput-29 1178601.00 ( 0.00%) 1169843.00 * -0.74%*
Hmean tput-36 1292377.00 ( 0.00%) 1387003.00 * 7.32%*
Hmean tput-43 1458913.00 ( 0.00%) 1508172.00 * 3.38%*
Hmean tput-50 1411975.00 ( 0.00%) 1513536.00 * 7.19%*
Hmean tput-57 1417937.00 ( 0.00%) 1495513.00 * 5.47%*
Hmean tput-64 1396242.00 ( 0.00%) 1477433.00 * 5.81%*
Hmean tput-71 1349055.00 ( 0.00%) 1472856.00 * 9.18%*
Hmean tput-78 1265738.00 ( 0.00%) 1453846.00 * 14.86%*
Hmean tput-79 1307367.00 ( 0.00%) 1446572.00 * 10.65%*
Hmean tput-80 1309718.00 ( 0.00%) 1449384.00 * 10.66%*
This was the most surprising result -- HT off was generally a benefit
even when the counts were higher than the available CPUs and I'm not
sure why. It's also interesting with HT off that the chances of keeping
a workload local to a node are reduced as a socket gets saturated earlier
but the load balancer is generally moving tasks around and NUMA Balancing
is also in play. Still, it shows that disabling HT is not a universal loss.
netperf is inherently about two tasks. For UDP_STREAM, it shows almost
no difference and it's within noise. TCP_STREAM was interesting
Hmean 64 1154.23 ( 0.00%) 1162.69 * 0.73%*
Hmean 128 2194.67 ( 0.00%) 2230.90 * 1.65%*
Hmean 256 3867.89 ( 0.00%) 3929.99 * 1.61%*
Hmean 1024 12714.52 ( 0.00%) 12913.81 * 1.57%*
Hmean 2048 21141.11 ( 0.00%) 21266.89 ( 0.59%)
Hmean 3312 27945.71 ( 0.00%) 28354.82 ( 1.46%)
Hmean 4096 30594.24 ( 0.00%) 30666.15 ( 0.24%)
Hmean 8192 37462.58 ( 0.00%) 36901.45 ( -1.50%)
Hmean 16384 42947.02 ( 0.00%) 43565.98 * 1.44%*
Stddev 64 2.21 ( 0.00%) 4.02 ( -81.62%)
Stddev 128 18.45 ( 0.00%) 11.11 ( 39.79%)
Stddev 256 30.84 ( 0.00%) 22.10 ( 28.33%)
Stddev 1024 141.46 ( 0.00%) 56.54 ( 60.03%)
Stddev 2048 200.39 ( 0.00%) 75.56 ( 62.29%)
Stddev 3312 411.11 ( 0.00%) 286.97 ( 30.20%)
Stddev 4096 299.86 ( 0.00%) 322.44 ( -7.53%)
Stddev 8192 418.80 ( 0.00%) 635.63 ( -51.77%)
Stddev 16384 661.57 ( 0.00%) 206.73 ( 68.75%)
The performance difference is marginal but variance is much reduced
by disabling HT. Now, it's important to note that this particular test
did not control for c-states and it did not bind tasks so there are a
lot of potential sources of noise. I didn't control for them because
I don't think many normal users would properly take concerns like that
into account. MMtests is able to control for those factors so it could
be independently checked.
hackbench is the most obvious loser. This is for processes communicating
via pipes.
Amean 1 0.7343 ( 0.00%) 1.1377 * -54.93%*
Amean 4 1.1647 ( 0.00%) 2.1543 * -84.97%*
Amean 7 1.6770 ( 0.00%) 3.1300 * -86.64%*
Amean 12 2.4500 ( 0.00%) 4.6447 * -89.58%*
Amean 21 3.9927 ( 0.00%) 6.8250 * -70.94%*
Amean 30 5.5320 ( 0.00%) 8.6433 * -56.24%*
Amean 48 8.4723 ( 0.00%) 12.1890 * -43.87%*
Amean 79 12.3760 ( 0.00%) 17.8347 * -44.11%*
Amean 110 16.0257 ( 0.00%) 23.1373 * -44.38%*
Amean 141 20.7070 ( 0.00%) 29.8537 * -44.17%*
Amean 172 25.1507 ( 0.00%) 37.4830 * -49.03%*
Amean 203 28.5303 ( 0.00%) 43.5220 * -52.55%*
Amean 234 33.8233 ( 0.00%) 51.5403 * -52.38%*
Amean 265 37.8703 ( 0.00%) 58.1860 * -53.65%*
Amean 296 43.8303 ( 0.00%) 64.9223 * -48.12%*
Stddev 1 0.0040 ( 0.00%) 0.0117 (-189.97%)
Stddev 4 0.0046 ( 0.00%) 0.0766 (-1557.56%)
Stddev 7 0.0333 ( 0.00%) 0.0991 (-197.83%)
Stddev 12 0.0425 ( 0.00%) 0.1303 (-206.90%)
Stddev 21 0.0337 ( 0.00%) 0.4138 (-1127.60%)
Stddev 30 0.0295 ( 0.00%) 0.1551 (-424.94%)
Stddev 48 0.0445 ( 0.00%) 0.2056 (-361.71%)
Stddev 79 0.0350 ( 0.00%) 0.4118 (-1076.56%)
Stddev 110 0.0655 ( 0.00%) 0.3685 (-462.72%)
Stddev 141 0.3670 ( 0.00%) 0.5488 ( -49.55%)
Stddev 172 0.7375 ( 0.00%) 1.0806 ( -46.52%)
Stddev 203 0.0817 ( 0.00%) 1.6920 (-1970.11%)
Stddev 234 0.8210 ( 0.00%) 1.4036 ( -70.97%)
Stddev 265 0.9337 ( 0.00%) 1.1025 ( -18.08%)
Stddev 296 1.5688 ( 0.00%) 0.4154 ( 73.52%)
The problem with hackbench is that "1" above doesn't represent 1 task,
it represents 1 group and so the machine gets saturated relatively
quickly and it's super sensitive to cores being idle and available to
make quick progress.
Kernel building which is all anyone ever cares about is a mixed bag
1-socket
Amean elsp-2 420.45 ( 0.00%) 240.80 * 42.73%*
Amean elsp-4 363.54 ( 0.00%) 135.09 * 62.84%*
Amean elsp-8 105.40 ( 0.00%) 131.46 * -24.73%*
Amean elsp-16 106.61 ( 0.00%) 133.57 * -25.29%*
2-socket
Amean elsp-2 406.76 ( 0.00%) 448.57 ( -10.28%)
Amean elsp-4 235.22 ( 0.00%) 289.48 ( -23.07%)
Amean elsp-8 152.36 ( 0.00%) 116.76 ( 23.37%)
Amean elsp-16 64.50 ( 0.00%) 52.12 * 19.20%*
Amean elsp-32 30.28 ( 0.00%) 28.24 * 6.74%*
Amean elsp-64 21.67 ( 0.00%) 23.00 * -6.13%*
Amean elsp-128 20.57 ( 0.00%) 23.57 * -14.60%*
Amean elsp-160 20.64 ( 0.00%) 23.63 * -14.50%*
Stddev elsp-2 75.35 ( 0.00%) 35.00 ( 53.55%)
Stddev elsp-4 71.12 ( 0.00%) 86.09 ( -21.05%)
Stddev elsp-8 43.05 ( 0.00%) 10.67 ( 75.22%)
Stddev elsp-16 4.08 ( 0.00%) 2.31 ( 43.41%)
Stddev elsp-32 0.51 ( 0.00%) 0.76 ( -48.60%)
Stddev elsp-64 0.38 ( 0.00%) 0.61 ( -60.72%)
Stddev elsp-128 0.13 ( 0.00%) 0.41 (-207.53%)
Stddev elsp-160 0.08 ( 0.00%) 0.20 (-147.93%)
1-socket matches other patterns, the 2-socket was weird. Variability was
nuts for low number of jobs. It's also not universal. I had tested in a
2-socket Haswell machine and it showed different results
Amean elsp-2 447.91 ( 0.00%) 467.43 ( -4.36%)
Amean elsp-4 284.47 ( 0.00%) 248.37 ( 12.69%)
Amean elsp-8 166.20 ( 0.00%) 129.23 ( 22.24%)
Amean elsp-16 63.89 ( 0.00%) 55.63 * 12.93%*
Amean elsp-32 36.80 ( 0.00%) 35.87 * 2.54%*
Amean elsp-64 30.97 ( 0.00%) 36.94 * -19.28%*
Amean elsp-96 31.66 ( 0.00%) 37.32 * -17.89%*
Stddev elsp-2 58.08 ( 0.00%) 57.93 ( 0.25%)
Stddev elsp-4 65.31 ( 0.00%) 41.56 ( 36.36%)
Stddev elsp-8 68.32 ( 0.00%) 15.61 ( 77.15%)
Stddev elsp-16 3.68 ( 0.00%) 2.43 ( 33.87%)
Stddev elsp-32 0.29 ( 0.00%) 0.97 (-239.75%)
Stddev elsp-64 0.36 ( 0.00%) 0.24 ( 32.10%)
Stddev elsp-96 0.30 ( 0.00%) 0.31 ( -5.11%)
Still not a perfect match to the general pattern for 2 build jobs and a
bit variable but otherwise the pattern holds -- performs better until the
machine is saturated. Kernel builds (or compilation builds) are always a
bit off as a benchmark as they have a mix of parallel and serialised tasks
that are non-deterministic.
With the NASA Parallel Benchmark (NPB, aka NAS) it's trickier to do a
valid comparison. Over-saturating NAS decimates performance but there
are limits on the exact thread counts that can be used for MPI. OpenMP
is less restrictive but here is an MPI comparison anyway comparing a
fully loaded HT On with fully loaded HT Off -- this is crucial, HT Off
has half the level of parallelisation
Amean bt 771.15 ( 0.00%) 926.98 * -20.21%*
Amean cg 445.92 ( 0.00%) 465.65 * -4.42%*
Amean ep 70.01 ( 0.00%) 97.15 * -38.76%*
Amean is 16.75 ( 0.00%) 19.08 * -13.95%*
Amean lu 882.84 ( 0.00%) 902.60 * -2.24%*
Amean mg 84.10 ( 0.00%) 95.95 * -14.10%*
Amean sp 1353.88 ( 0.00%) 1372.23 * -1.36%*
ep is the embarrassingly parallel problem and it shows that with half the
cores with HT off, we take a 38.76% performance hit. However, even that is
not universally true, as cg for example did not parallelise as well and
only performed 4.42% worse even with HT off. I can show a comparison with
equal levels of parallelisation but with HT off, it is a completely broken
configuration and I do not think a comparison like that makes any sense.
I didn't do any comparison that could represent Cloud. However, I think
it's worth noting that HT may be popular there for packing lots of virtual
machines onto a single host and over-subscribing. HT would intuitively
have an advantage there *but* it depends heavily on the utilisation and
whether there is sustained VCPU activity where the number of active VCPUs
exceeds physical CPUs when HT is off. There is also the question whether
performance even matters on such configurations but anything cloud related
will be "how long is a piece of string" and "it depends".
So there you have it, HT Off is not a guaranteed loss and can be a gain
so it should be considered as an alternative to core scheduling. The case
where HT makes a big difference is when a workload is CPU or memory bound
and the number of active tasks exceeds the number of CPUs on a socket
and again when the number of active tasks exceeds the number of CPUs in the
whole machine.
--
Mel Gorman
SUSE Labs
On Tue, Apr 23, 2019 at 04:18:17PM +0000 Vineeth Remanan Pillai wrote:
> From: Peter Zijlstra (Intel) <[email protected]>
>
> Marks all tasks in a cgroup as matching for core-scheduling.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> kernel/sched/core.c | 62 ++++++++++++++++++++++++++++++++++++++++++++
> kernel/sched/sched.h | 4 +++
> 2 files changed, 66 insertions(+)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 5066a1493acf..e5bdc1c4d8d7 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6658,6 +6658,15 @@ static void sched_change_group(struct task_struct *tsk, int type)
> tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
> struct task_group, css);
> tg = autogroup_task_group(tsk, tg);
> +
> +#ifdef CONFIG_SCHED_CORE
> + if ((unsigned long)tsk->sched_task_group == tsk->core_cookie)
> + tsk->core_cookie = 0UL;
> +
> + if (tg->tagged /* && !tsk->core_cookie ? */)
> + tsk->core_cookie = (unsigned long)tg;
> +#endif
> +
> tsk->sched_task_group = tg;
>
> #ifdef CONFIG_FAIR_GROUP_SCHED
> @@ -7117,6 +7126,43 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
> }
> #endif /* CONFIG_RT_GROUP_SCHED */
>
> +#ifdef CONFIG_SCHED_CORE
> +static u64 cpu_core_tag_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
> +{
> + struct task_group *tg = css_tg(css);
> +
> + return !!tg->tagged;
> +}
> +
> +static int cpu_core_tag_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, u64 val)
> +{
> + struct task_group *tg = css_tg(css);
> + struct css_task_iter it;
> + struct task_struct *p;
> +
> + if (val > 1)
> + return -ERANGE;
> +
> + if (tg->tagged == !!val)
> + return 0;
> +
> + tg->tagged = !!val;
> +
> + if (!!val)
> + sched_core_get();
> +
> + css_task_iter_start(css, 0, &it);
> + while ((p = css_task_iter_next(&it)))
> + p->core_cookie = !!val ? (unsigned long)tg : 0UL;
> + css_task_iter_end(&it);
> +
> + if (!val)
> + sched_core_put();
> +
> + return 0;
> +}
> +#endif
> +
> static struct cftype cpu_legacy_files[] = {
> #ifdef CONFIG_FAIR_GROUP_SCHED
> {
> @@ -7152,6 +7198,14 @@ static struct cftype cpu_legacy_files[] = {
> .read_u64 = cpu_rt_period_read_uint,
> .write_u64 = cpu_rt_period_write_uint,
> },
> +#endif
> +#ifdef CONFIG_SCHED_CORE
> + {
> + .name = "tag",
> + .flags = CFTYPE_NOT_ON_ROOT,
> + .read_u64 = cpu_core_tag_read_u64,
> + .write_u64 = cpu_core_tag_write_u64,
> + },
> #endif
> { } /* Terminate */
> };
> @@ -7319,6 +7373,14 @@ static struct cftype cpu_files[] = {
> .seq_show = cpu_max_show,
> .write = cpu_max_write,
> },
> +#endif
> +#ifdef CONFIG_SCHED_CORE
> + {
> + .name = "tag",
> + .flags = CFTYPE_NOT_ON_ROOT,
> + .read_u64 = cpu_core_tag_read_u64,
> + .write_u64 = cpu_core_tag_write_u64,
> + },
> #endif
> { } /* terminate */
> };
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 42dd620797d7..16fb236eab7b 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -363,6 +363,10 @@ struct cfs_bandwidth {
> struct task_group {
> struct cgroup_subsys_state css;
>
> +#ifdef CONFIG_SCHED_CORE
> + int tagged;
> +#endif
> +
> #ifdef CONFIG_FAIR_GROUP_SCHED
> /* schedulable entities of this group on each CPU */
> struct sched_entity **se;
> --
> 2.17.1
>
Since CPU0 never goes through the cpu add code, it will never get initialized
if it's the only cpu, and then enabling core scheduling and adding a task
crashes. Since there is no point in using core sched in this case, maybe just
disallow it with something like the below?
Cheers,
Phil
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e8e5f26db052..b312ea1e28a4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7541,6 +7541,9 @@ static int cpu_core_tag_write_u64(struct cgroup_subsys_state *css, struct cftype
if (val > 1)
return -ERANGE;
+ if (num_online_cpus() <= 1)
+ return -EINVAL;
+
if (tg->tagged == !!val)
return 0;
--
On 23-Apr-2019 04:18:05 PM, Vineeth Remanan Pillai wrote:
> Second iteration of the core-scheduling feature.
>
> This version fixes apparent bugs and performance issues in v1. This
> doesn't fully address the issue of core sharing between processes
> with different tags. Core sharing still happens 1% to 5% of the time
> based on the nature of workload and timing of the runnable processes.
>
> Changes in v2
> -------------
> - rebased on mainline commit: 6d906f99817951e2257d577656899da02bb33105
Here are our benchmark results.
Environment setup:
------------------
Skylake server, 2 numa nodes, total 72 CPUs with HT on
Workload in KVM virtual machines, one cpu cgroup per VM (including qemu
and vhost threads)
Case 1: MySQL TPC-C
-------------------
1 12-vcpus-32gb MySQL server per numa node (clients on another physical
machine)
96 semi-idle 1-vcpu-512mb VM per numa node (sending metrics over a VPN
every 15 seconds)
--> 3 vcpus per physical CPU
Average of 10 5-minutes runs.
- baseline:
- avg tps: 1878
- stdev tps: 47
- nosmt:
- avg tps: 959 (-49% from baseline)
- stdev tps: 35
- core scheduling:
- avg tps: 1406 (-25% from baseline)
- stdev tps: 48
- Co-scheduling stats (5 minutes sample):
- 48.9% VM threads
- 49.6% idle
- 1.3% foreign threads
So in v2, this case with a very noisy test benefits from core
scheduling (the baseline is also better compared to v1, so we probably
benefit from other changes in the kernel).
Case 2: linpack with enough room
--------------------------------
2 12-vcpus-32gb linpack VMs both pinned on the same NUMA node (36
hardware threads with SMT on).
100k context switches/sec.
Average of 5 15-minutes runs.
- baseline:
- avg gflops: 403
- stdev: 20
- nosmt:
- avg gflops: 355 (-12% from baseline)
- stdev: 28
- core scheduling:
- avg gflops: 364 (-9% from baseline)
- stdev: 59
- Co-scheduling stats (5 minutes sample):
- 39.3% VM threads
- 59.3% idle
- 0.07% foreign threads
No real difference between nosmt and core scheduling when there is
enough room to run a cpu-intensive workload even with smt off.
Case 3: full node linpack
-------------------------
3 12-vcpus-32gb linpack VMs all pinned on the same NUMA node (36
hardware threads with SMT on).
155k context switches/sec
Average of 5 15-minutes runs.
- baseline:
- avg gflops: 270
- stdev: 5
- nosmt (switching to 2:1 ratio of vcpu to hardware threads):
- avg gflops: 209 (-22.46% from baseline)
- stdev: 6.2
- core scheduling
- avg gflops: 269 (-0.11% from baseline)
- stdev: 5.7
- Co-scheduling stats (5 minutes sample):
- 93.7% VM threads
- 6.3% idle
- 0.04% foreign threads
Here the core scheduling is a major improvement in terms of performance
compared to nosmt.
Julien
* Thomas Gleixner <[email protected]> wrote:
> It exists already: /sys/devices/system/cpu/smt/control
>
> Setting it to off will offline all siblings, on will online them again.
Indeed, added by 05736e4ac13c last year (and I promptly forgot about it
...) - I was thrown off a bit by the 'nosmt' flag Mel used, but that's
probably because he used an older kernel.
Thanks,
Ingo
* Mel Gorman <[email protected]> wrote:
> On Thu, Apr 25, 2019 at 11:55:08AM +0200, Ingo Molnar wrote:
> > > > Would it be possible to post the results with HT off as well ?
> > >
> > > What's the point here to turn HT off? The latency is sensitive to the
> > > relationship
> > > between the task number and CPU number. Usually less CPU number, more run
> > > queue wait time, and worse result.
> >
> > HT-off numbers are mandatory: turning HT off is by far the simplest way
> > to solve the security bugs in these CPUs.
> >
> > Any core-scheduling solution *must* perform better than HT-off for all
> > relevant workloads, otherwise what's the point?
> >
>
> I agree. Not only should HT-off be evaluated but it should properly
> evaluate for different levels of machine utilisation to get a complete
> picture.
>
> Around the same time this was first posted and because of kernel
> warnings from L1TF, I did a preliminary evaluation of HT On vs HT Off
> using nosmt -- this is sub-optimal in itself but it was convenient. The
> conventional wisdom that HT gets a 30% boost appears to be primarily based
> on academic papers evaluating HPC workloads on a Pentium 4 with a focus
> on embarassingly parallel problems which is the ideal case for HT but not
> the universal case. The conventional wisdom is questionable at best. The
> only modern comparisons I could find were focused on games primarily
> which I think hit scaling limits before HT is a factor in some cases.
>
> I don't have the data in a format that can be present everything in a clear
> format but here is an attempt anyway. This is long but the central point
> that when when a machine is lightly loaded, HT Off generally performs
> better than HT On and even when heavily utilised, it's still not a
> guaranteed loss. I only suggest reading after this if you have coffee
> and time. Ideally all this would be updated with a comparison to core
> scheduling but I may not get it queued on my test grid before I leave
> for LSF/MM and besides, the authors pushing this feature should be able
> to provide supporting data justifying the complexity of the series.
BTW., a side note: I'd suggest introducing a runtime 'nosmt' toggle
facility, i.e. a way to switch a system between SMT and non-SMT execution
at runtime, with full reversibility between these states and no restrictions.
That should make benchmarking more convenient (no kernel reboots and
kernel parameters to check), and it would also make it easier for system
administrators to experiment with how SMT and no-SMT affect their
typical workloads.
> Here is a tbench comparison scaling from a low thread count to a high
> thread count. I picked tbench because it's relatively uncomplicated and
> tends to be reasonable at spotting scheduler regressions. The kernel
> version is old but for the purposes of this discussion, it doesn't matter
>
> 1-socket Skylake (8 logical CPUs HT On, 4 logical CPUs HT Off)
Side question: while obviously most of the core-sched interest is
concentrated around Intel's HyperThreading SMT, I'm wondering whether you
have any data regarding AMD systems - in particular Ryzen based CPUs
appear to have a pretty robust SMT implementation.
> Hmean 1 484.00 ( 0.00%) 519.95 * 7.43%*
> Hmean 2 925.02 ( 0.00%) 1022.28 * 10.51%*
> Hmean 4 1730.34 ( 0.00%) 2029.81 * 17.31%*
> Hmean 8 2883.57 ( 0.00%) 2040.89 * -29.22%*
> Hmean 16 2830.61 ( 0.00%) 2039.74 * -27.94%*
> Hmean 32 2855.54 ( 0.00%) 2042.70 * -28.47%*
> Stddev 1 1.16 ( 0.00%) 0.62 ( 46.43%)
> Stddev 2 1.31 ( 0.00%) 1.00 ( 23.32%)
> Stddev 4 4.89 ( 0.00%) 12.86 (-163.14%)
> Stddev 8 4.30 ( 0.00%) 2.53 ( 40.99%)
> Stddev 16 3.38 ( 0.00%) 5.92 ( -75.08%)
> Stddev 32 5.47 ( 0.00%) 14.28 (-160.77%)
>
> Note that disabling HT performs better when cores are available but hits
> scaling limits past 4 CPUs when the machine is saturated with HT off.
> It's similar with 2 sockets
>
> 2-socket Broadwell (80 logical CPUs HT On, 40 logical CPUs HT Off)
>
> smt nosmt
> Hmean 1 514.28 ( 0.00%) 540.90 * 5.18%*
> Hmean 2 982.19 ( 0.00%) 1042.98 * 6.19%*
> Hmean 4 1820.02 ( 0.00%) 1943.38 * 6.78%*
> Hmean 8 3356.73 ( 0.00%) 3655.92 * 8.91%*
> Hmean 16 6240.53 ( 0.00%) 7057.57 * 13.09%*
> Hmean 32 10584.60 ( 0.00%) 15934.82 * 50.55%*
> Hmean 64 24967.92 ( 0.00%) 21103.79 * -15.48%*
> Hmean 128 27106.28 ( 0.00%) 20822.46 * -23.18%*
> Hmean 256 28345.15 ( 0.00%) 21625.67 * -23.71%*
> Hmean 320 28358.54 ( 0.00%) 21768.70 * -23.24%*
> Stddev 1 2.10 ( 0.00%) 3.44 ( -63.59%)
> Stddev 2 2.46 ( 0.00%) 4.83 ( -95.91%)
> Stddev 4 7.57 ( 0.00%) 6.14 ( 18.86%)
> Stddev 8 6.53 ( 0.00%) 11.80 ( -80.79%)
> Stddev 16 11.23 ( 0.00%) 16.03 ( -42.74%)
> Stddev 32 18.99 ( 0.00%) 22.04 ( -16.10%)
> Stddev 64 10.86 ( 0.00%) 14.31 ( -31.71%)
> Stddev 128 25.10 ( 0.00%) 16.08 ( 35.93%)
> Stddev 256 29.95 ( 0.00%) 71.39 (-138.36%)
>
> Same -- performance is better until the machine gets saturated and
> disabling HT hits scaling limits earlier.
Interesting. This strongly suggests sub-optimal SMT-scheduling in the
non-saturated HT case, i.e. a scheduler balancing bug.
As long as loads are clearly below the physical cores count (which they
are in the early phases of your table) the scheduler should spread tasks
without overlapping two tasks on the same core.
Clearly it doesn't.
> SpecJBB 2005 is ancient but it does lend itself to easily scaling the
> number of active tasks so here is a sample of the performance as
> utilisation ramped up to saturation
>
> 2-socket
> Hmean tput-1 48655.00 ( 0.00%) 48762.00 * 0.22%*
> Hmean tput-8 387341.00 ( 0.00%) 390062.00 * 0.70%*
> Hmean tput-15 660993.00 ( 0.00%) 659832.00 * -0.18%*
> Hmean tput-22 916898.00 ( 0.00%) 913570.00 * -0.36%*
> Hmean tput-29 1178601.00 ( 0.00%) 1169843.00 * -0.74%*
> Hmean tput-36 1292377.00 ( 0.00%) 1387003.00 * 7.32%*
> Hmean tput-43 1458913.00 ( 0.00%) 1508172.00 * 3.38%*
> Hmean tput-50 1411975.00 ( 0.00%) 1513536.00 * 7.19%*
> Hmean tput-57 1417937.00 ( 0.00%) 1495513.00 * 5.47%*
> Hmean tput-64 1396242.00 ( 0.00%) 1477433.00 * 5.81%*
> Hmean tput-71 1349055.00 ( 0.00%) 1472856.00 * 9.18%*
> Hmean tput-78 1265738.00 ( 0.00%) 1453846.00 * 14.86%*
> Hmean tput-79 1307367.00 ( 0.00%) 1446572.00 * 10.65%*
> Hmean tput-80 1309718.00 ( 0.00%) 1449384.00 * 10.66%*
>
> This was the most surprising result -- HT off was generally a benefit
> even when the counts were higher than the available CPUs and I'm not
> sure why. It's also interesting with HT off that the chances of keeping
> a workload local to a node are reduced as a socket gets saturated earlier
> but the load balancer is generally moving tasks around and NUMA Balancing
> is also in play. Still, it shows that disabling HT is not a universal loss.
Interesting indeed. Could there be some batch execution benefit, i.e. with
fewer CPUs to execute on, the tasks do not crowd out and thrash
die/socket level caches as badly? With no-HT the workload had more
threads than CPUs to execute on, so the tasks were forced into neat
queues of execution, and cache thrashing would be limited to the short
period after a task was scheduled in?
If this was on the 40-physical-core Broadwell system and the 'X' in tput-X
roughly correlates to CPU utilization then this seems plausible, as the
improvements start roughly at the ~tput-40 boundary and increase
afterwards.
> netperf is inherently about two tasks. For UDP_STREAM, it shows almost
> no difference and it's within noise. TCP_STREAM was interesting
>
> Hmean 64 1154.23 ( 0.00%) 1162.69 * 0.73%*
> Hmean 128 2194.67 ( 0.00%) 2230.90 * 1.65%*
> Hmean 256 3867.89 ( 0.00%) 3929.99 * 1.61%*
> Hmean 1024 12714.52 ( 0.00%) 12913.81 * 1.57%*
> Hmean 2048 21141.11 ( 0.00%) 21266.89 ( 0.59%)
> Hmean 3312 27945.71 ( 0.00%) 28354.82 ( 1.46%)
> Hmean 4096 30594.24 ( 0.00%) 30666.15 ( 0.24%)
> Hmean 8192 37462.58 ( 0.00%) 36901.45 ( -1.50%)
> Hmean 16384 42947.02 ( 0.00%) 43565.98 * 1.44%*
> Stddev 64 2.21 ( 0.00%) 4.02 ( -81.62%)
> Stddev 128 18.45 ( 0.00%) 11.11 ( 39.79%)
> Stddev 256 30.84 ( 0.00%) 22.10 ( 28.33%)
> Stddev 1024 141.46 ( 0.00%) 56.54 ( 60.03%)
> Stddev 2048 200.39 ( 0.00%) 75.56 ( 62.29%)
> Stddev 3312 411.11 ( 0.00%) 286.97 ( 30.20%)
> Stddev 4096 299.86 ( 0.00%) 322.44 ( -7.53%)
> Stddev 8192 418.80 ( 0.00%) 635.63 ( -51.77%)
> Stddev 16384 661.57 ( 0.00%) 206.73 ( 68.75%)
>
> The performance difference is marginal but variance is much reduced
> by disabling HT. Now, it's important to note that this particular test
> did not control for c-states and it did not bind tasks so there are a
> lot of potential sources of noise. I didn't control for them because
> I don't think many normal users would properly take concerns like that
> into account. MMtests is able to control for those factors so it could
> be independently checked.
Interesting. This too suggests suboptimal scheduling: with just 2 tasks
there might be two major modes of execution: either the two tasks end up
on the same physical core or not. If the scheduler isn't entirely
consistent about this choice then we might see big variations in
execution, depending on whether running the two tasks on different
physical cores is better for performance or not.
This stddev artifact could be narrowed down further by using taskset to
force the benchmark on 2 logical CPUs, and by making those 2 CPUs HT
siblings or not we could see which execution is the more optimal one.
My prediction, which is easily falsifiable is that stddev noise should
reduce dramatically in such a 2-CPU restricted 'taskset' based affinity
jail, *regardless* of whether the two CPUs are actually on the same
physical core or not.
> hackbench is the most obvious loser. This is for processes communicating
> via pipes.
>
> Amean 1 0.7343 ( 0.00%) 1.1377 * -54.93%*
> Amean 4 1.1647 ( 0.00%) 2.1543 * -84.97%*
> Amean 7 1.6770 ( 0.00%) 3.1300 * -86.64%*
> Amean 12 2.4500 ( 0.00%) 4.6447 * -89.58%*
> Amean 21 3.9927 ( 0.00%) 6.8250 * -70.94%*
> Amean 30 5.5320 ( 0.00%) 8.6433 * -56.24%*
> Amean 48 8.4723 ( 0.00%) 12.1890 * -43.87%*
> Amean 79 12.3760 ( 0.00%) 17.8347 * -44.11%*
> Amean 110 16.0257 ( 0.00%) 23.1373 * -44.38%*
> Amean 141 20.7070 ( 0.00%) 29.8537 * -44.17%*
> Amean 172 25.1507 ( 0.00%) 37.4830 * -49.03%*
> Amean 203 28.5303 ( 0.00%) 43.5220 * -52.55%*
> Amean 234 33.8233 ( 0.00%) 51.5403 * -52.38%*
> Amean 265 37.8703 ( 0.00%) 58.1860 * -53.65%*
> Amean 296 43.8303 ( 0.00%) 64.9223 * -48.12%*
> Stddev 1 0.0040 ( 0.00%) 0.0117 (-189.97%)
> Stddev 4 0.0046 ( 0.00%) 0.0766 (-1557.56%)
> Stddev 7 0.0333 ( 0.00%) 0.0991 (-197.83%)
> Stddev 12 0.0425 ( 0.00%) 0.1303 (-206.90%)
> Stddev 21 0.0337 ( 0.00%) 0.4138 (-1127.60%)
> Stddev 30 0.0295 ( 0.00%) 0.1551 (-424.94%)
> Stddev 48 0.0445 ( 0.00%) 0.2056 (-361.71%)
> Stddev 79 0.0350 ( 0.00%) 0.4118 (-1076.56%)
> Stddev 110 0.0655 ( 0.00%) 0.3685 (-462.72%)
> Stddev 141 0.3670 ( 0.00%) 0.5488 ( -49.55%)
> Stddev 172 0.7375 ( 0.00%) 1.0806 ( -46.52%)
> Stddev 203 0.0817 ( 0.00%) 1.6920 (-1970.11%)
> Stddev 234 0.8210 ( 0.00%) 1.4036 ( -70.97%)
> Stddev 265 0.9337 ( 0.00%) 1.1025 ( -18.08%)
> Stddev 296 1.5688 ( 0.00%) 0.4154 ( 73.52%)
>
> The problem with hackbench is that "1" above doesn't represent 1 task,
> it represents 1 group and so the machine gets saturated relatively
> quickly and it's super sensitive to cores being idle and available to
> make quick progress.
hackbench is also super sensitive to the same group of ~20 tasks being
able to progress at once, and hence is pretty noisy.
The flip-over between hackbench being able to progress effectively and a
half-scheduled group hindering all the others seems to be super
non-deterministic and can be triggered by random events both within
hackbench and by other things happening on the machine.
So while hackbench is somewhat artificial in its intensity and load
levels, it still matches messaging server peak loads, so I still
consider it an important metric of scheduling quality.
I'm wondering whether the scheduler could do anything to reduce the
non-determinism of hackbench.
BTW., note that 'perf bench scheduling' is a hackbench work-alike:
dagon:~/tip> perf bench sched messaging
# Running 'sched/messaging' benchmark:
# 20 sender and receiver processes per group
# 10 groups == 400 processes run
Total time: 0.158 [sec]
It also has a threaded variant (which is a hackbench-pthread work-alike):
dagon:~/tip> perf bench sched messaging --thread --group 20
# Running 'sched/messaging' benchmark:
# 20 sender and receiver threads per group
# 20 groups == 800 threads run
Total time: 0.265 [sec]
I'm trying to distill the most important scheduler micro-benchmarks into
'perf bench':
dagon:~/tip> perf bench sched
# List of available benchmarks for collection 'sched':
messaging: Benchmark for scheduling and IPC
pipe: Benchmark for pipe() between two processes
all: Run all scheduler benchmarks
which is still stuck at a very low count of 2 benchmarks currently.
:-)
> Kernel building which is all anyone ever cares about is a mixed bag
>
> 1-socket
> Amean elsp-2 420.45 ( 0.00%) 240.80 * 42.73%*
> Amean elsp-4 363.54 ( 0.00%) 135.09 * 62.84%*
> Amean elsp-8 105.40 ( 0.00%) 131.46 * -24.73%*
> Amean elsp-16 106.61 ( 0.00%) 133.57 * -25.29%*
>
> 2-socket
> Amean elsp-2 406.76 ( 0.00%) 448.57 ( -10.28%)
> Amean elsp-4 235.22 ( 0.00%) 289.48 ( -23.07%)
> Amean elsp-8 152.36 ( 0.00%) 116.76 ( 23.37%)
> Amean elsp-16 64.50 ( 0.00%) 52.12 * 19.20%*
> Amean elsp-32 30.28 ( 0.00%) 28.24 * 6.74%*
> Amean elsp-64 21.67 ( 0.00%) 23.00 * -6.13%*
> Amean elsp-128 20.57 ( 0.00%) 23.57 * -14.60%*
> Amean elsp-160 20.64 ( 0.00%) 23.63 * -14.50%*
> Stddev elsp-2 75.35 ( 0.00%) 35.00 ( 53.55%)
> Stddev elsp-4 71.12 ( 0.00%) 86.09 ( -21.05%)
> Stddev elsp-8 43.05 ( 0.00%) 10.67 ( 75.22%)
> Stddev elsp-16 4.08 ( 0.00%) 2.31 ( 43.41%)
> Stddev elsp-32 0.51 ( 0.00%) 0.76 ( -48.60%)
> Stddev elsp-64 0.38 ( 0.00%) 0.61 ( -60.72%)
> Stddev elsp-128 0.13 ( 0.00%) 0.41 (-207.53%)
> Stddev elsp-160 0.08 ( 0.00%) 0.20 (-147.93%)
>
> 1-socket matches other patterns, the 2-socket was weird. Variability was
> nuts for low number of jobs. It's also not universal. I had tested in a
> 2-socket Haswell machine and it showed different results
>
> Amean elsp-2 447.91 ( 0.00%) 467.43 ( -4.36%)
> Amean elsp-4 284.47 ( 0.00%) 248.37 ( 12.69%)
> Amean elsp-8 166.20 ( 0.00%) 129.23 ( 22.24%)
> Amean elsp-16 63.89 ( 0.00%) 55.63 * 12.93%*
> Amean elsp-32 36.80 ( 0.00%) 35.87 * 2.54%*
> Amean elsp-64 30.97 ( 0.00%) 36.94 * -19.28%*
> Amean elsp-96 31.66 ( 0.00%) 37.32 * -17.89%*
> Stddev elsp-2 58.08 ( 0.00%) 57.93 ( 0.25%)
> Stddev elsp-4 65.31 ( 0.00%) 41.56 ( 36.36%)
> Stddev elsp-8 68.32 ( 0.00%) 15.61 ( 77.15%)
> Stddev elsp-16 3.68 ( 0.00%) 2.43 ( 33.87%)
> Stddev elsp-32 0.29 ( 0.00%) 0.97 (-239.75%)
> Stddev elsp-64 0.36 ( 0.00%) 0.24 ( 32.10%)
> Stddev elsp-96 0.30 ( 0.00%) 0.31 ( -5.11%)
>
> Still not a perfect match to the general pattern for 2 build jobs and a
> bit variable but otherwise the pattern holds -- performs better until the
> machine is saturated. Kernel builds (or compilation builds) are always a
> bit off as a benchmark as it has a mix of parallel and serialised tasks
> that are non-deterministic.
Interesting.
Here too I'm wondering whether the scheduler could do something to
improve the saturated case: which *is* an important workload, as kernel
hackers tend to over-load their systems a bit when building kernel, to
make sure the system is at least 100% utilized. ;-)
Probably not though, without injecting too much policy. We could perhaps
repurpose SCHED_BATCH to be even more batch scheduling, i.e. to reduce
the non-determinism of the over-loaded machines in an even more assertive
manner? As long as there's enough RAM and no serious async IO that the
kbuild gets perturbed by (which should be true these days), this should be
possible to do. We could then stick SCHED_BATCH into the kbuild process,
to make more than 0.01% of kernel developers use it. Win-win. :-)
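As a rough illustration, a build job can opt into SCHED_BATCH with a plain
sched_setscheduler() call (the static priority must be 0 for SCHED_BATCH);
'chrt -b 0 make -jN' does the same thing from the shell. A minimal sketch:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

/*
 * Minimal sketch: switch the calling process (e.g. the top-level build
 * job) to SCHED_BATCH, then exec the real workload so it inherits the
 * policy.
 */
int main(void)
{
	struct sched_param param = { .sched_priority = 0 };

	if (sched_setscheduler(0 /* self */, SCHED_BATCH, &param) == -1) {
		perror("sched_setscheduler(SCHED_BATCH)");
		return 1;
	}

	/* ... exec make -j$(nproc) (or similar) here ... */
	return 0;
}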
> With the NASA Parallel Benchmark (NPB, aka NAS) it's trickier to do a
> valid comparison. Over-saturating NAS decimates performance but there
> are limits on the exact thread counts that can be used for MPI. OpenMP
> is less restrictive but here is an MPI comparison anyway comparing a
> fully loaded HT On with fully loaded HT Off -- this is crucial, HT Off
> has half the level of parallelisation
>
> Amean bt 771.15 ( 0.00%) 926.98 * -20.21%*
> Amean cg 445.92 ( 0.00%) 465.65 * -4.42%*
> Amean ep 70.01 ( 0.00%) 97.15 * -38.76%*
> Amean is 16.75 ( 0.00%) 19.08 * -13.95%*
> Amean lu 882.84 ( 0.00%) 902.60 * -2.24%*
> Amean mg 84.10 ( 0.00%) 95.95 * -14.10%*
> Amean sp 1353.88 ( 0.00%) 1372.23 * -1.36%*
>
> ep is the embarrassingly parallel problem and it shows with half the cores
> with HT off, we take a 38.76% performance hit. However, even that is not
> universally true as cg for example did not parallelise as well and only
> performed 4.42% worse even with HT off.
Very interesting. I'm wondering what kind of workload 'ep' is exactly,
and would love to have a work-alike in 'perf sched bench'.
Do these benchmarks over-saturate by default, and is this really
representative of how all the large compute cluster folks are *using*
MPI?
I thought the more common pattern was to closely tailor MPI parallelism
to available (logical) cores parallelism, to minimize shared cache
trashing in an oversubscribed scenario, but I could be wrong.
> I can show a comparison with equal levels of parallelisation but with
> HT off, it is a completely broken configuration and I do not think a
> comparison like that makes any sense.
I would still be interested in that comparison, because I'd like
to learn whether there's any true *inherent* performance advantage to
HyperThreading for that particular workload, for exactly tuned
parallelism.
Even if nobody is going to run the NPB/NAS benchmark that way.
> I didn't do any comparison that could represent Cloud. However, I think
> it's worth noting that HT may be popular there for packing lots of virtual
> machines onto a single host and over-subscribing. HT would intuitively
> have an advantage there *but* it depends heavily on the utilisation and
> whether there is sustained VCPU activity where the number of active VCPUs
> exceeds physical CPUs when HT is off. There is also the question whether
> performance even matters on such configurations but anything cloud related
> will be "how long is a piece of string" and "it depends".
Intuitively I'd guess that because all the cloud providers are pushing
for core-sched HT is probably a win in cloud benchmarks, if not for the
pesky security problems. ;-)
> So there you have it, HT Off is not a guaranteed loss and can be a gain
> so it should be considered as an alternative to core scheduling. The case
> where HT makes a big difference is when a workload is CPU or memory bound
> and the number of active tasks exceeds the number of CPUs on a socket
> and again when number of active tasks exceeds the number of CPUs in the
> whole machine.
Fascinating measurements, thanks a lot Mel for doing these!
This is super useful.
Thanks,
Ingo
On Thu, 25 Apr 2019, Ingo Molnar wrote:
> * Mel Gorman <[email protected]> wrote:
> > I don't have the data in a format that can present everything in a clear
> > format but here is an attempt anyway. This is long but the central point
> > is that when a machine is lightly loaded, HT Off generally performs
> > better than HT On and even when heavily utilised, it's still not a
> > guaranteed loss. I only suggest reading after this if you have coffee
> > and time. Ideally all this would be updated with a comparison to core
> > scheduling but I may not get it queued on my test grid before I leave
> > for LSF/MM and besides, the authors pushing this feature should be able
> > to provide supporting data justifying the complexity of the series.
>
> BTW., a side note: I'd suggest introducing a runtime toggle 'nosmt'
> facility, i.e. turn a system between SMT and non-SMT execution runtime,
> with full reversability between these states and no restrictions.
It exists already: /sys/devices/system/cpu/smt/control
Setting it to off will offline all siblings, on will online them again.
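A minimal sketch of driving that knob from C (needs root, and is just the
programmatic equivalent of 'echo off > /sys/devices/system/cpu/smt/control'):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Minimal sketch: flip SMT at runtime via the sysfs control file. */
static int set_smt(const char *state /* "on" or "off" */)
{
	int fd = open("/sys/devices/system/cpu/smt/control", O_WRONLY);

	if (fd < 0) {
		perror("open smt/control");
		return -1;
	}
	if (write(fd, state, strlen(state)) != (ssize_t)strlen(state)) {
		perror("write smt/control");
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

As with the shell version, CPU enumeration changes when siblings go away,
so anything that sets up cpumasks needs to re-read the topology afterwards.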
Thanks,
tglx
On Thu, Apr 25, 2019 at 08:53:43PM +0200, Ingo Molnar wrote:
> > I don't have the data in a format that can present everything in a clear
> > format but here is an attempt anyway. This is long but the central point
> > is that when a machine is lightly loaded, HT Off generally performs
> > better than HT On and even when heavily utilised, it's still not a
> > guaranteed loss. I only suggest reading after this if you have coffee
> > and time. Ideally all this would be updated with a comparison to core
> > scheduling but I may not get it queued on my test grid before I leave
> > for LSF/MM and besides, the authors pushing this feature should be able
> > to provide supporting data justifying the complexity of the series.
>
> BTW., a side note: I'd suggest introducing a runtime toggle 'nosmt'
> facility, i.e. turn a system between SMT and non-SMT execution runtime,
> with full reversability between these states and no restrictions.
>
> That should make both benchmarking more convenient (no kernel reboots and
> kernel parameters to check), and it would also make it easier for system
> administrators to experiment with how SMT and no-SMT affects their
> typical workloads.
>
Noted, I wasn't aware of the option Thomas laid out but even if I was, I
probably would have used the boot parameter anyway. The grid automation
reboots between tests and it knows how to add/remove kernel command
lines so it's trivial for me to setup. There is definite value for live
experimentation as long as they know to keep an eye on the CPU enumeration
when setting up cpumasks.
> > Here is a tbench comparison scaling from a low thread count to a high
> > thread count. I picked tbench because it's relatively uncomplicated and
> > tends to be reasonable at spotting scheduler regressions. The kernel
> > version is old but for the purposes of this discussion, it doesn't matter
> >
> > 1-socket Skylake (8 logical CPUs HT On, 4 logical CPUs HT Off)
>
> Side question: while obviously most of the core-sched interest is
> concentrated around Intel's HyperThreading SMT, I'm wondering whether you
> have any data regarding AMD systems - in particular Ryzen based CPUs
> appear to have a pretty robust SMT implementation.
>
Unfortunately not. Such machines are available internally but they are
heavily used for functional enablement. This might change in the future
and if so, I'll queue the test.
> > 2-socket Broadwell (80 logical CPUs HT On, 40 logical CPUs HT Off)
> >
> > smt nosmt
> > Hmean 1 514.28 ( 0.00%) 540.90 * 5.18%*
> > Hmean 2 982.19 ( 0.00%) 1042.98 * 6.19%*
> > Hmean 4 1820.02 ( 0.00%) 1943.38 * 6.78%*
> > Hmean 8 3356.73 ( 0.00%) 3655.92 * 8.91%*
> > Hmean 16 6240.53 ( 0.00%) 7057.57 * 13.09%*
> > Hmean 32 10584.60 ( 0.00%) 15934.82 * 50.55%*
> > Hmean 64 24967.92 ( 0.00%) 21103.79 * -15.48%*
> > Hmean 128 27106.28 ( 0.00%) 20822.46 * -23.18%*
> > Hmean 256 28345.15 ( 0.00%) 21625.67 * -23.71%*
> > Hmean 320 28358.54 ( 0.00%) 21768.70 * -23.24%*
> > Stddev 1 2.10 ( 0.00%) 3.44 ( -63.59%)
> > Stddev 2 2.46 ( 0.00%) 4.83 ( -95.91%)
> > Stddev 4 7.57 ( 0.00%) 6.14 ( 18.86%)
> > Stddev 8 6.53 ( 0.00%) 11.80 ( -80.79%)
> > Stddev 16 11.23 ( 0.00%) 16.03 ( -42.74%)
> > Stddev 32 18.99 ( 0.00%) 22.04 ( -16.10%)
> > Stddev 64 10.86 ( 0.00%) 14.31 ( -31.71%)
> > Stddev 128 25.10 ( 0.00%) 16.08 ( 35.93%)
> > Stddev 256 29.95 ( 0.00%) 71.39 (-138.36%)
> >
> > Same -- performance is better until the machine gets saturated and
> > disabling HT hits scaling limits earlier.
>
> Interesting. This strongly suggests sub-optimal SMT-scheduling in the
> non-saturated HT case, i.e. a scheduler balancing bug.
>
Yeah, it does but mpstat didn't appear to indicate that SMT siblings are
being used prematurely so it's a bit of a curiosity.
> As long as loads are clearly below the physical cores count (which they
> are in the early phases of your table) the scheduler should spread tasks
> without overlapping two tasks on the same core.
>
It should, but it's not perfect. For example, wake_affine_idle does not
take sibling activity into account even though select_idle_sibling *may*
take it into account. Even select_idle_sibling in its fast path may use
an SMT sibling instead of searching.
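As a rough sketch of the kind of check that is missing (not the actual
wake_affine_idle()/select_idle_sibling() code; cpu_smt_mask() and
available_idle_cpu() are existing helpers), a strict wakeup path would only
treat a CPU as a good target when its whole core is idle:

/*
 * Rough sketch only, not the current fair.c logic: a CPU is a good
 * wakeup target only if every SMT sibling on its core is idle.
 */
static bool whole_core_idle(int cpu)
{
	int sibling;

	for_each_cpu(sibling, cpu_smt_mask(cpu)) {
		if (!available_idle_cpu(sibling))
			return false;
	}
	return true;
}

The catch, as discussed further down for select_idle_core(), is how much
such a check costs when it has to run on every wakeup.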
There are also potential side-effects with cpuidle. Some workloads
migrate around the socket as they are communicating because of how the
search for an idle CPU works. With SMT on, there is potentially a longer
opportunity for a core to reach a deep c-state and incur a bigger wakeup
latency. This is a very weak theory but I've seen cases where latency
sensitive workloads with only two communicating tasks are affected by
CPUs reaching low c-states due to migrations.
> Clearly it doesn't.
>
It's more that it's best effort to wakeup quickly instead of being perfect
by using an expensive search every time.
> > SpecJBB 2005 is ancient but it does lend itself to easily scaling the
> > number of active tasks so here is a sample of the performance as
> > utilisation ramped up to saturation
> >
> > 2-socket
> > Hmean tput-1 48655.00 ( 0.00%) 48762.00 * 0.22%*
> > Hmean tput-8 387341.00 ( 0.00%) 390062.00 * 0.70%*
> > Hmean tput-15 660993.00 ( 0.00%) 659832.00 * -0.18%*
> > Hmean tput-22 916898.00 ( 0.00%) 913570.00 * -0.36%*
> > Hmean tput-29 1178601.00 ( 0.00%) 1169843.00 * -0.74%*
> > Hmean tput-36 1292377.00 ( 0.00%) 1387003.00 * 7.32%*
> > Hmean tput-43 1458913.00 ( 0.00%) 1508172.00 * 3.38%*
> > Hmean tput-50 1411975.00 ( 0.00%) 1513536.00 * 7.19%*
> > Hmean tput-57 1417937.00 ( 0.00%) 1495513.00 * 5.47%*
> > Hmean tput-64 1396242.00 ( 0.00%) 1477433.00 * 5.81%*
> > Hmean tput-71 1349055.00 ( 0.00%) 1472856.00 * 9.18%*
> > Hmean tput-78 1265738.00 ( 0.00%) 1453846.00 * 14.86%*
> > Hmean tput-79 1307367.00 ( 0.00%) 1446572.00 * 10.65%*
> > Hmean tput-80 1309718.00 ( 0.00%) 1449384.00 * 10.66%*
> >
> > This was the most surprising result -- HT off was generally a benefit
> > even when the counts were higher than the available CPUs and I'm not
> > sure why. It's also interesting with HT off that the chances of keeping
> > a workload local to a node are reduced as a socket gets saturated earlier
> > but the load balancer is generally moving tasks around and NUMA Balancing
> > is also in play. Still, it shows that disabling HT is not a universal loss.
>
> Interesting indeed. Could there be some batch execution benefit, i.e. by
> having fewer CPUs to execute on the tasks do not crowd out and trash
> die/socket level caches as badly?
That could be the case. It also could be an example where tasks getting
starved allow others to make more progress and the high-level metric looks
better. That is usually a pattern seen with IO though, not CPU scheduling.
> With no-HT the workload had more
> threads than CPUs to execute on and the tasks were forced into neat
> queues of execution and cache trashing would be limited to the short
> period after a task was scheduled in?
>
> If this was on the 40-physical-core Broadwell system and the 'X' tput-X
> roughly correlates to CPU utilization then this seems plausible, as the
> improvements start roughly at the ~tput-40 boundary and increase
> afterwards.
>
Indeed, it's very plausible.
> > netperf is inherently about two tasks. For UDP_STREAM, it shows almost
> > no difference and it's within noise. TCP_STREAM was interesting
> >
> > Hmean 64 1154.23 ( 0.00%) 1162.69 * 0.73%*
> > Hmean 128 2194.67 ( 0.00%) 2230.90 * 1.65%*
> > Hmean 256 3867.89 ( 0.00%) 3929.99 * 1.61%*
> > Hmean 1024 12714.52 ( 0.00%) 12913.81 * 1.57%*
> > Hmean 2048 21141.11 ( 0.00%) 21266.89 ( 0.59%)
> > Hmean 3312 27945.71 ( 0.00%) 28354.82 ( 1.46%)
> > Hmean 4096 30594.24 ( 0.00%) 30666.15 ( 0.24%)
> > Hmean 8192 37462.58 ( 0.00%) 36901.45 ( -1.50%)
> > Hmean 16384 42947.02 ( 0.00%) 43565.98 * 1.44%*
> > Stddev 64 2.21 ( 0.00%) 4.02 ( -81.62%)
> > Stddev 128 18.45 ( 0.00%) 11.11 ( 39.79%)
> > Stddev 256 30.84 ( 0.00%) 22.10 ( 28.33%)
> > Stddev 1024 141.46 ( 0.00%) 56.54 ( 60.03%)
> > Stddev 2048 200.39 ( 0.00%) 75.56 ( 62.29%)
> > Stddev 3312 411.11 ( 0.00%) 286.97 ( 30.20%)
> > Stddev 4096 299.86 ( 0.00%) 322.44 ( -7.53%)
> > Stddev 8192 418.80 ( 0.00%) 635.63 ( -51.77%)
> > Stddev 16384 661.57 ( 0.00%) 206.73 ( 68.75%)
> >
> > The performance difference is marginal but variance is much reduced
> > by disabling HT. Now, it's important to note that this particular test
> > did not control for c-states and it did not bind tasks so there are a
> > lot of potential sources of noise. I didn't control for them because
> > I don't think many normal users would properly take concerns like that
> > into account. MMtests is able to control for those factors so it could
> > be independently checked.
>
> Interesting. This too suggests suboptimal scheduling: with just 2 tasks
> there might be two major modes of execution: either the two tasks end up
> on the same physical core or not. If the scheduler isn't entirely
> consistent about this choice then we might see big variations in
> execution, depending on whether running the two tasks on different
> physical cores is better to performance or not.
>
netperf is interesting because ksoftirqd is also involved so it's actually
three tasks that are communicating. Typically SMT siblings are not used
by the communicating tasks but they get intermittently migrated to new
cores even though the machine is mostly idle.
> This stddev artifact could be narrowed down further by using taskset to
> force the benchmark on 2 logical CPUs, and by making those 2 CPUs HT
> siblings or not we could see which execution is the more optimal one.
So this test was based on config-global-dhp__network-netperf-unbound from
mmtests. There are also config-global-dhp__network-netperf-cross-socket.
What that configuration does is pin the server and client to two CPUs
that are on the same socket but not HT siblings (HT siblings is done by
config-global-dhp__network-netperf-cross-ht). The two cross-* configs also
set c-state to 1 because the tasks do not always have equal utilisation,
allowing c-state exit latency to cause variance. That said, the effect is
much more visible on sockperf than it is on netperf.
Assuming I do another round, I'll add the configs that pin tasks and
control for c-states.
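For reproducing the 2-CPU affinity jail outside mmtests, a minimal sketch
(pick the two CPUs from /sys/devices/system/cpu/cpuN/topology/thread_siblings_list
to make them SMT siblings, or from different cores for the contrasting run):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Minimal sketch: confine this process (and anything it execs, e.g. a
 * netperf or sockperf client/server pair) to exactly two logical CPUs.
 * Whether the two CPUs are SMT siblings or sit on different cores is up
 * to the caller; that is the whole point of the comparison.
 */
int main(int argc, char **argv)
{
	cpu_set_t set;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <cpu_a> <cpu_b>\n", argv[0]);
		return 1;
	}

	CPU_ZERO(&set);
	CPU_SET(atoi(argv[1]), &set);
	CPU_SET(atoi(argv[2]), &set);

	if (sched_setaffinity(0 /* self */, sizeof(set), &set) == -1) {
		perror("sched_setaffinity");
		return 1;
	}

	/* ... exec the benchmark here so it inherits the 2-CPU mask ... */
	return 0;
}

'taskset -c A,B <benchmark>' does the same from the shell; the point is only
that both ends of the connection inherit the same two-CPU mask.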
>
> My prediction, which is easily falsifiable is that stddev noise should
> reduce dramatically in such a 2-CPU restricted 'taskset' based affinity
> jail, *regardless* of whether the two CPUs are actually on the same
> physical core or not.
>
I can confirm that you are right when sockperf is used. That reports
per-packet latencies so variance is easier to spot. Every time I've
optimised for hackbench though, something else fell down a hole that was
more realistic so I usually give up and try again later.
> > hackbench is the most obvious loser. This is for processes communicating
> > via pipes.
> >
> > Amean 1 0.7343 ( 0.00%) 1.1377 * -54.93%*
> > Amean 4 1.1647 ( 0.00%) 2.1543 * -84.97%*
> > Amean 7 1.6770 ( 0.00%) 3.1300 * -86.64%*
> > Amean 12 2.4500 ( 0.00%) 4.6447 * -89.58%*
> > Amean 21 3.9927 ( 0.00%) 6.8250 * -70.94%*
> > Amean 30 5.5320 ( 0.00%) 8.6433 * -56.24%*
> > Amean 48 8.4723 ( 0.00%) 12.1890 * -43.87%*
> > Amean 79 12.3760 ( 0.00%) 17.8347 * -44.11%*
> > Amean 110 16.0257 ( 0.00%) 23.1373 * -44.38%*
> > Amean 141 20.7070 ( 0.00%) 29.8537 * -44.17%*
> > Amean 172 25.1507 ( 0.00%) 37.4830 * -49.03%*
> > Amean 203 28.5303 ( 0.00%) 43.5220 * -52.55%*
> > Amean 234 33.8233 ( 0.00%) 51.5403 * -52.38%*
> > Amean 265 37.8703 ( 0.00%) 58.1860 * -53.65%*
> > Amean 296 43.8303 ( 0.00%) 64.9223 * -48.12%*
> > Stddev 1 0.0040 ( 0.00%) 0.0117 (-189.97%)
> > Stddev 4 0.0046 ( 0.00%) 0.0766 (-1557.56%)
> > Stddev 7 0.0333 ( 0.00%) 0.0991 (-197.83%)
> > Stddev 12 0.0425 ( 0.00%) 0.1303 (-206.90%)
> > Stddev 21 0.0337 ( 0.00%) 0.4138 (-1127.60%)
> > Stddev 30 0.0295 ( 0.00%) 0.1551 (-424.94%)
> > Stddev 48 0.0445 ( 0.00%) 0.2056 (-361.71%)
> > Stddev 79 0.0350 ( 0.00%) 0.4118 (-1076.56%)
> > Stddev 110 0.0655 ( 0.00%) 0.3685 (-462.72%)
> > Stddev 141 0.3670 ( 0.00%) 0.5488 ( -49.55%)
> > Stddev 172 0.7375 ( 0.00%) 1.0806 ( -46.52%)
> > Stddev 203 0.0817 ( 0.00%) 1.6920 (-1970.11%)
> > Stddev 234 0.8210 ( 0.00%) 1.4036 ( -70.97%)
> > Stddev 265 0.9337 ( 0.00%) 1.1025 ( -18.08%)
> > Stddev 296 1.5688 ( 0.00%) 0.4154 ( 73.52%)
> >
> > The problem with hackbench is that "1" above doesn't represent 1 task,
> > it represents 1 group and so the machine gets saturated relatively
> > quickly and it's super sensitive to cores being idle and available to
> > make quick progress.
>
> hackbench is also super sensitive to the same group of ~20 tasks being
> able to progress at once, and hence is pretty noisy.
>
> The flip-over between hackbench being able to progress effectively and a
> half-scheduled group hindering all the others seems to be super
> non-deterministic and can be triggered by random events both within
> hackbench, and other things happening on the machine.
>
Indeed.
> So while hackbench is somewhat artificial in its intensity and load
> levels, it still matches messaging server peak loads, so I still
> consider it an important metric of scheduling quality.
>
Typically I end up using hackbench as a canary. It can detect when
something is wrong, not necessarily that real workloads care.
> I'm wondering whether the scheduler could do anything to reduce the
> non-determinism of hackbench.
>
> BTW., note that 'perf bench scheduling' is a hackbench work-alike:
>
> dagon:~/tip> perf bench sched messaging
> # Running 'sched/messaging' benchmark:
> # 20 sender and receiver processes per group
> # 10 groups == 400 processes run
>
> Total time: 0.158 [sec]
>
> It also has a threaded variant (which is a hackbench-pthread work-alike):
>
> dagon:~/tip> perf bench sched messaging --thread --group 20
> # Running 'sched/messaging' benchmark:
> # 20 sender and receiver threads per group
> # 20 groups == 800 threads run
>
> Total time: 0.265 [sec]
>
> I'm trying to distill the most important scheduler micro-benchmarks into
> 'perf bench':
>
> dagon:~/tip> perf bench sched
>
> # List of available benchmarks for collection 'sched':
>
> messaging: Benchmark for scheduling and IPC
> pipe: Benchmark for pipe() between two processes
> all: Run all scheduler benchmarks
>
> which is still stuck at a very low count of 2 benchmarks currently.
> :-)
>
FWIW, mmtests does have support for running perf bench for some loads. I
just never converted "hackbench" over to the perf variant because I
didn't want to discard old data. Poor justification I know.
> > Kernel building which is all anyone ever cares about is a mixed bag
> >
> > 1-socket
> > Amean elsp-2 420.45 ( 0.00%) 240.80 * 42.73%*
> > Amean elsp-4 363.54 ( 0.00%) 135.09 * 62.84%*
> > Amean elsp-8 105.40 ( 0.00%) 131.46 * -24.73%*
> > Amean elsp-16 106.61 ( 0.00%) 133.57 * -25.29%*
> >
> > 2-socket
> > Amean elsp-2 406.76 ( 0.00%) 448.57 ( -10.28%)
> > Amean elsp-4 235.22 ( 0.00%) 289.48 ( -23.07%)
> > Amean elsp-8 152.36 ( 0.00%) 116.76 ( 23.37%)
> > Amean elsp-16 64.50 ( 0.00%) 52.12 * 19.20%*
> > Amean elsp-32 30.28 ( 0.00%) 28.24 * 6.74%*
> > Amean elsp-64 21.67 ( 0.00%) 23.00 * -6.13%*
> > Amean elsp-128 20.57 ( 0.00%) 23.57 * -14.60%*
> > Amean elsp-160 20.64 ( 0.00%) 23.63 * -14.50%*
> > Stddev elsp-2 75.35 ( 0.00%) 35.00 ( 53.55%)
> > Stddev elsp-4 71.12 ( 0.00%) 86.09 ( -21.05%)
> > Stddev elsp-8 43.05 ( 0.00%) 10.67 ( 75.22%)
> > Stddev elsp-16 4.08 ( 0.00%) 2.31 ( 43.41%)
> > Stddev elsp-32 0.51 ( 0.00%) 0.76 ( -48.60%)
> > Stddev elsp-64 0.38 ( 0.00%) 0.61 ( -60.72%)
> > Stddev elsp-128 0.13 ( 0.00%) 0.41 (-207.53%)
> > Stddev elsp-160 0.08 ( 0.00%) 0.20 (-147.93%)
> >
> > 1-socket matches other patterns, the 2-socket was weird. Variability was
> > nuts for low number of jobs. It's also not universal. I had tested in a
> > 2-socket Haswell machine and it showed different results
> >
> > Amean elsp-2 447.91 ( 0.00%) 467.43 ( -4.36%)
> > Amean elsp-4 284.47 ( 0.00%) 248.37 ( 12.69%)
> > Amean elsp-8 166.20 ( 0.00%) 129.23 ( 22.24%)
> > Amean elsp-16 63.89 ( 0.00%) 55.63 * 12.93%*
> > Amean elsp-32 36.80 ( 0.00%) 35.87 * 2.54%*
> > Amean elsp-64 30.97 ( 0.00%) 36.94 * -19.28%*
> > Amean elsp-96 31.66 ( 0.00%) 37.32 * -17.89%*
> > Stddev elsp-2 58.08 ( 0.00%) 57.93 ( 0.25%)
> > Stddev elsp-4 65.31 ( 0.00%) 41.56 ( 36.36%)
> > Stddev elsp-8 68.32 ( 0.00%) 15.61 ( 77.15%)
> > Stddev elsp-16 3.68 ( 0.00%) 2.43 ( 33.87%)
> > Stddev elsp-32 0.29 ( 0.00%) 0.97 (-239.75%)
> > Stddev elsp-64 0.36 ( 0.00%) 0.24 ( 32.10%)
> > Stddev elsp-96 0.30 ( 0.00%) 0.31 ( -5.11%)
> >
> > Still not a perfect match to the general pattern for 2 build jobs and a
> > bit variable but otherwise the pattern holds -- performs better until the
> > machine is saturated. Kernel builds (or compilation builds) are always a
> > bit off as a benchmark as it has a mix of parallel and serialised tasks
> > that are non-deterministic.
>
> Interesting.
>
> Here too I'm wondering whether the scheduler could do something to
> improve the saturated case: which *is* an important workload, as kernel
> hackers tend to over-load their systems a bit when building kernel, to
> make sure the system is at least 100% utilized. ;-)
>
Every so often I try but I never managed to settle on a heuristic that
helped this case without breaking others. The biggest hurdle is that
typically things are better if migrations are low but it's hard to do
that in a way that does not also stack tasks on the same CPUs prematurely.
> > ep is the embarrassingly parallel problem and it shows with half the cores
> > with HT off, we take a 38.76% performance hit. However, even that is not
> > universally true as cg for example did not parallelise as well and only
> > performed 4.42% worse even with HT off.
>
> Very interesting. I'm wondering what kind of workload 'ep' is exactly,
> and would love to have a work-alike in 'perf sched bench'.
>
I never looked too closely. It's characterised in the paper "THE NAS
PARALLEL BENCHMARKS" as follows:
An embarrassingly parallel kernel. It provides an estimate of
the upper achievable limits for floating point performance, i.e.,
the performance without significant interprocessor communication.
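A 'perf bench' work-alike for it would be little more than per-thread
independent floating-point loops with no communication until the final join.
A minimal sketch (the real 'ep' kernel generates Gaussian random deviates;
this just burns independent FLOPs):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define ITERS 100000000UL

static volatile double sink;

/* Each thread does independent FP work; no communication until join. */
static void *worker(void *arg)
{
	double x = (double)(long)arg + 1.0, sum = 0.0;
	unsigned long i;

	for (i = 0; i < ITERS; i++)
		sum += x / (double)(i + 1);

	sink = sum;	/* keep the compiler from eliding the loop */
	return NULL;
}

int main(int argc, char **argv)
{
	int i, nthreads = argc > 1 ? atoi(argv[1]) : 4;
	pthread_t *tids = calloc(nthreads, sizeof(*tids));

	for (i = 0; i < nthreads; i++)
		pthread_create(&tids[i], NULL, worker, (void *)(long)i);
	for (i = 0; i < nthreads; i++)
		pthread_join(tids[i], NULL);

	free(tids);
	return 0;
}

Built with 'cc -O2 -pthread', run with nthreads equal to logical CPUs (SMT
on) versus physical cores (nosmt); that is essentially the exactly-tuned
comparison discussed further down.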
> Do these benchmarks over-saturate by default, and is this really
> representative of how all the large compute cluster folks are *using*
> MPI?
>
No, they don't. They are configured with a thread count with some
limitations if MPI is used (some problems require the degree of
parallelisation to be a power-of-two for example). In this case I compared
a "full" configuration for HT Off against a "half" configuration for HT
On so that both configurations used the same number of cores.
> I thought the more common pattern was to closely tailor MPI parallelism
> to available (logical) cores parallelism, to minimize shared cache
> trashing in an oversubscribed scenario, but I could be wrong.
>
It is although it depends on the exact application, but in this test I
didn't do a setup like that.
> > I can show a comparison with equal levels of parallelisation but with
> > HT off, it is a completely broken configuration and I do not think a
> > comparison like that makes any sense.
>
> I would still be interested in that comparison, because I'd like
> to learn whether there's any true *inherent* performance advantage to
> HyperThreading for that particular workload, for exactly tuned
> parallelism.
>
It really isn't a fair comparison. MPI seems to behave very differently
when a machine is saturated. It's documented as changing its behaviour
as it tries to avoid the worst consequences of saturation.
Curiously, the results on the 2-socket machine were not as bad as I
feared when the HT configuration is running with twice the number of
threads as there are CPUs
Amean bt 771.15 ( 0.00%) 1086.74 * -40.93%*
Amean cg 445.92 ( 0.00%) 543.41 * -21.86%*
Amean ep 70.01 ( 0.00%) 96.29 * -37.53%*
Amean is 16.75 ( 0.00%) 21.19 * -26.51%*
Amean lu 882.84 ( 0.00%) 595.14 * 32.59%*
Amean mg 84.10 ( 0.00%) 80.02 * 4.84%*
Amean sp 1353.88 ( 0.00%) 1384.10 * -2.23%*
> Even if nobody is going to run the NPB/NAS benchmark that way.
>
> > I didn't do any comparison that could represent Cloud. However, I think
> > it's worth noting that HT may be popular there for packing lots of virtual
> > machines onto a single host and over-subscribing. HT would intuitively
> > have an advantage there *but* it depends heavily on the utilisation and
> > whether there is sustained VCPU activity where the number of active VCPUs
> > exceeds physical CPUs when HT is off. There is also the question whether
> > performance even matters on such configurations but anything cloud related
> > will be "how long is a piece of string" and "it depends".
>
> Intuitively I'd guess that because all the cloud providers are pushing
> for core-sched HT is probably a win in cloud benchmarks, if not for the
> pesky security problems. ;-)
>
Indeed. When it gets down to it, I expect they have better data on what
average utilisation of physical cores are as a ratio to vcpus.
> > So there you have it, HT Off is not a guaranteed loss and can be a gain
> > so it should be considered as an alternative to core scheduling. The case
> > where HT makes a big difference is when a workload is CPU or memory bound
> > and the number of active tasks exceeds the number of CPUs on a socket
> > and again when number of active tasks exceeds the number of CPUs in the
> > whole machine.
>
> Fascinating measurements, thanks a lot Mel for doing these!
>
My pleasure!
--
Mel Gorman
SUSE Labs
On Thu, Apr 25, 2019 at 5:55 PM Ingo Molnar <[email protected]> wrote:
>
>
> * Aubrey Li <[email protected]> wrote:
>
> > On Wed, Apr 24, 2019 at 10:00 PM Julien Desfossez
> > <[email protected]> wrote:
> > >
> > > On 24-Apr-2019 09:13:10 PM, Aubrey Li wrote:
> > > > On Wed, Apr 24, 2019 at 12:18 AM Vineeth Remanan Pillai
> > > > <[email protected]> wrote:
> > > > >
> > > > > Second iteration of the core-scheduling feature.
> > > > >
> > > > > This version fixes apparent bugs and performance issues in v1. This
> > > > > doesn't fully address the issue of core sharing between processes
> > > > > with different tags. Core sharing still happens 1% to 5% of the time
> > > > > based on the nature of workload and timing of the runnable processes.
> > > > >
> > > > > Changes in v2
> > > > > -------------
> > > > > - rebased on mainline commit: 6d906f99817951e2257d577656899da02bb33105
> > > >
> > > > Thanks to post v2, based on this version, here is my benchmarks result.
> > > >
> > > > Environment setup
> > > > --------------------------
> > > > Skylake server, 2 numa nodes, 104 CPUs (HT on)
> > > > cgroup1 workload, sysbench (CPU intensive non AVX workload)
> > > > cgroup2 workload, gemmbench (AVX512 workload)
> > > >
> > > > Case 1: task number < CPU num
> > > > --------------------------------------------
> > > > 36 sysbench threads in cgroup1
> > > > 36 gemmbench threads in cgroup2
> > > >
> > > > core sched off:
> > > > - sysbench 95th percentile latency(ms): avg = 4.952, stddev = 0.55342
> > > > core sched on:
> > > > - sysbench 95th percentile latency(ms): avg = 3.549, stddev = 0.04449
> > > >
> > > > Due to core cookie matching, sysbench tasks won't be affected by AVX512
> > > > tasks, latency has ~28% improvement!!!
> > > >
> > > > Case 2: task number > CPU number
> > > > -------------------------------------------------
> > > > 72 sysbench threads in cgroup1
> > > > 72 gemmbench threads in cgroup2
> > > >
> > > > core sched off:
> > > > - sysbench 95th percentile latency(ms): avg = 11.914, stddev = 3.259
> > > > core sched on:
> > > > - sysbench 95th percentile latency(ms): avg = 13.289, stddev = 4.863
> > > >
> > > > So not only power, now security and performance is a pair of contradictions.
> > > > Due to core cookie not matching and forced idle introduced, latency has ~12%
> > > > regression.
> > > >
> > > > Any comments?
> > >
> > > Would it be possible to post the results with HT off as well ?
> >
> > What's the point here to turn HT off? The latency is sensitive to the
> > relationship
> > between the task number and CPU number. Usually less CPU number, more run
> > queue wait time, and worse result.
>
> HT-off numbers are mandatory: turning HT off is by far the simplest way
> to solve the security bugs in these CPUs.
>
> Any core-scheduling solution *must* perform better than HT-off for all
> relevant workloads, otherwise what's the point?
>
Got it, I'll measure HT-off cases soon.
Thanks,
-Aubrey
* Mel Gorman <[email protected]> wrote:
> > > Same -- performance is better until the machine gets saturated and
> > > disabling HT hits scaling limits earlier.
> >
> > Interesting. This strongly suggests sub-optimal SMT-scheduling in the
> > non-saturated HT case, i.e. a scheduler balancing bug.
> >
>
> Yeah, it does but mpstat didn't appear to indicate that SMT siblings are
> being used prematurely so it's a bit of a curiosity.
>
> > As long as loads are clearly below the physical cores count (which they
> > are in the early phases of your table) the scheduler should spread tasks
> > without overlapping two tasks on the same core.
> >
>
> It should, but it's not perfect. For example, wake_affine_idle does not
> take sibling activity into account even though select_idle_sibling *may*
> take it into account. Even select_idle_sibling in its fast path may use
> an SMT sibling instead of searching.
>
> There are also potential side-effects with cpuidle. Some workloads
> migrate around the socket as they are communicating because of how the
> search for an idle CPU works. With SMT on, there is potentially a longer
> opportunity for a core to reach a deep c-state and incur a bigger wakeup
> latency. This is a very weak theory but I've seen cases where latency
> sensitive workloads with only two communicating tasks are affected by
> CPUs reaching low c-states due to migrations.
>
> > Clearly it doesn't.
> >
>
> It's more that it's best effort to wakeup quickly instead of being perfect
> by using an expensive search every time.
Yeah, but your numbers suggest that for *most* not heavily interacting
under-utilized CPU bound workloads we hurt in the 5-10% range compared to
no-SMT - more in some cases.
So we avoid a maybe 0.1% scheduler placement overhead but inflict 5-10%
harm on the workload, and also blow up stddev by randomly co-scheduling
two tasks on the same physical core? Not a good trade-off.
I really think we should implement a relatively strict physical core
placement policy in the under-utilized case, and resist any attempts to
weaken this for special workloads that ping-pong quickly and benefit from
sharing the same physical core.
I.e. as long as load is kept below ~50% the SMT and !SMT benchmark
results and stddev numbers should match up. (With a bit of leeway if the
workload gets near to 50% or occasionally goes above it.)
There's absolutely no excuse for these numbers at 30-40% load levels I
think.
Thanks,
Ingo
* Mel Gorman <[email protected]> wrote:
> > > I can show a comparison with equal levels of parallelisation but with
> > > HT off, it is a completely broken configuration and I do not think a
> > > comparison like that makes any sense.
> >
> > I would still be interested in that comparison, because I'd like
> > to learn whether there's any true *inherent* performance advantage to
> > HyperThreading for that particular workload, for exactly tuned
> > parallelism.
> >
>
> It really isn't a fair comparison. MPI seems to behave very differently
> when a machine is saturated. It's documented as changing its behaviour
> as it tries to avoid the worst consequences of saturation.
>
> Curiously, the results on the 2-socket machine were not as bad as I
> feared when the HT configuration is running with twice the number of
> threads as there are CPUs
>
> Amean bt 771.15 ( 0.00%) 1086.74 * -40.93%*
> Amean cg 445.92 ( 0.00%) 543.41 * -21.86%*
> Amean ep 70.01 ( 0.00%) 96.29 * -37.53%*
> Amean is 16.75 ( 0.00%) 21.19 * -26.51%*
> Amean lu 882.84 ( 0.00%) 595.14 * 32.59%*
> Amean mg 84.10 ( 0.00%) 80.02 * 4.84%*
> Amean sp 1353.88 ( 0.00%) 1384.10 * -2.23%*
Yeah, so what I wanted to suggest is a parallel numeric throughput test
with few inter-process data dependencies, and see whether HT actually
improves total throughput versus the no-HT case.
No over-saturation - but exactly as many threads as logical CPUs.
I.e. with 20 physical cores and 40 logical CPUs the numbers to compare
would be a 'nosmt' benchmark running 20 threads, versus a SMT test
running 40 threads.
I.e. how much does SMT improve total throughput when the workload's
parallelism is tuned to utilize 100% of the available CPUs?
Does this make sense?
Thanks,
Ingo
* Aubrey Li <[email protected]> wrote:
> On Thu, Apr 25, 2019 at 5:55 PM Ingo Molnar <[email protected]> wrote:
> >
> >
> > * Aubrey Li <[email protected]> wrote:
> >
> > > On Wed, Apr 24, 2019 at 10:00 PM Julien Desfossez
> > > <[email protected]> wrote:
> > > >
> > > > On 24-Apr-2019 09:13:10 PM, Aubrey Li wrote:
> > > > > On Wed, Apr 24, 2019 at 12:18 AM Vineeth Remanan Pillai
> > > > > <[email protected]> wrote:
> > > > > >
> > > > > > Second iteration of the core-scheduling feature.
> > > > > >
> > > > > > This version fixes apparent bugs and performance issues in v1. This
> > > > > > doesn't fully address the issue of core sharing between processes
> > > > > > with different tags. Core sharing still happens 1% to 5% of the time
> > > > > > based on the nature of workload and timing of the runnable processes.
> > > > > >
> > > > > > Changes in v2
> > > > > > -------------
> > > > > > - rebased on mainline commit: 6d906f99817951e2257d577656899da02bb33105
> > > > >
> > > > > Thanks to post v2, based on this version, here is my benchmarks result.
> > > > >
> > > > > Environment setup
> > > > > --------------------------
> > > > > Skylake server, 2 numa nodes, 104 CPUs (HT on)
> > > > > cgroup1 workload, sysbench (CPU intensive non AVX workload)
> > > > > cgroup2 workload, gemmbench (AVX512 workload)
> > > > >
> > > > > Case 1: task number < CPU num
> > > > > --------------------------------------------
> > > > > 36 sysbench threads in cgroup1
> > > > > 36 gemmbench threads in cgroup2
> > > > >
> > > > > core sched off:
> > > > > - sysbench 95th percentile latency(ms): avg = 4.952, stddev = 0.55342
> > > > > core sched on:
> > > > > - sysbench 95th percentile latency(ms): avg = 3.549, stddev = 0.04449
> > > > >
> > > > > Due to core cookie matching, sysbench tasks won't be affected by AVX512
> > > > > tasks, latency has ~28% improvement!!!
> > > > >
> > > > > Case 2: task number > CPU number
> > > > > -------------------------------------------------
> > > > > 72 sysbench threads in cgroup1
> > > > > 72 gemmbench threads in cgroup2
> > > > >
> > > > > core sched off:
> > > > > - sysbench 95th percentile latency(ms): avg = 11.914, stddev = 3.259
> > > > > core sched on:
> > > > > - sysbench 95th percentile latency(ms): avg = 13.289, stddev = 4.863
> > > > >
> > > > > So not only power, now security and performance is a pair of contradictions.
> > > > > Due to core cookie not matching and forced idle introduced, latency has ~12%
> > > > > regression.
> > > > >
> > > > > Any comments?
> > > >
> > > > Would it be possible to post the results with HT off as well ?
> > >
> > > What's the point here to turn HT off? The latency is sensitive to the
> > > relationship
> > > between the task number and CPU number. Usually less CPU number, more run
> > > queue wait time, and worse result.
> >
> > HT-off numbers are mandatory: turning HT off is by far the simplest way
> > to solve the security bugs in these CPUs.
> >
> > Any core-scheduling solution *must* perform better than HT-off for all
> > relevant workloads, otherwise what's the point?
> >
> Got it, I'll measure HT-off cases soon.
Thanks!
Ingo
* Mel Gorman <[email protected]> wrote:
> > Interesting.
> >
> > Here too I'm wondering whether the scheduler could do something to
> > improve the saturated case: which *is* an important workload, as kernel
> > hackers tend to over-load their systems a bit when building kernel, to
> > make sure the system is at least 100% utilized. ;-)
> >
>
> Every so often I try but I never managed to settle on a heuristic that
> helped this case without breaking others. The biggest hurdle is that
> typically things are better if migrations are low but it's hard to do
> that in a way that does not also stack tasks on the same CPUs
> prematurely.
So instead of using a heuristic (which are fragile and most of them are
also annoyingly non-deterministic and increase overall noise and make
measurements harder) I'd suggest using SCHED_BATCH just as a hardcore
toggle to maximize for CPU-bound throughput.
It's not used very much, but the kernel build could use it by default
(i.e. we could use a "chrt -b" call within the main Makefile), so it
would be the perfect guinea pig and wouldn't affect anything else.
I.e. we could use SCHED_BATCH to maximize kernel build speed, with no
regard to latency (within SCHED_BATCH workloads). I suspect this will
also maximize bandwidth of a lot of other real-world, highly parallel but
interacting processing workloads.
[ I'd even be willing to rename it to SCHED_KBUILD, internally. ;-) ]
Thanks,
Ingo
On Fri, Apr 26, 2019 at 11:45:45AM +0200, Ingo Molnar wrote:
>
> * Mel Gorman <[email protected]> wrote:
>
> > > > I can show a comparison with equal levels of parallelisation but with
> > > > HT off, it is a completely broken configuration and I do not think a
> > > > comparison like that makes any sense.
> > >
> > > I would still be interested in that comparison, because I'd like
> > > to learn whether there's any true *inherent* performance advantage to
> > > HyperThreading for that particular workload, for exactly tuned
> > > parallelism.
> > >
> >
> > It really isn't a fair comparison. MPI seems to behave very differently
> > when a machine is saturated. It's documented as changing its behaviour
> > as it tries to avoid the worst consequences of saturation.
> >
> > Curiously, the results on the 2-socket machine were not as bad as I
> > feared when the HT configuration is running with twice the number of
> > threads as there are CPUs
> >
> > Amean bt 771.15 ( 0.00%) 1086.74 * -40.93%*
> > Amean cg 445.92 ( 0.00%) 543.41 * -21.86%*
> > Amean ep 70.01 ( 0.00%) 96.29 * -37.53%*
> > Amean is 16.75 ( 0.00%) 21.19 * -26.51%*
> > Amean lu 882.84 ( 0.00%) 595.14 * 32.59%*
> > Amean mg 84.10 ( 0.00%) 80.02 * 4.84%*
> > Amean sp 1353.88 ( 0.00%) 1384.10 * -2.23%*
>
> Yeah, so what I wanted to suggest is a parallel numeric throughput test
> with few inter-process data dependencies, and see whether HT actually
> improves total throughput versus the no-HT case.
>
> No over-saturation - but exactly as many threads as logical CPUs.
>
> I.e. with 20 physical cores and 40 logical CPUs the numbers to compare
> would be a 'nosmt' benchmark running 20 threads, versus a SMT test
> running 40 threads.
>
> I.e. how much does SMT improve total throughput when the workload's
> parallelism is tuned to utilize 100% of the available CPUs?
>
> Does this make sense?
>
Yes. Here is the comparison.
Amean bt 678.75 ( 0.00%) 789.13 * -16.26%*
Amean cg 261.22 ( 0.00%) 428.82 * -64.16%*
Amean ep 55.36 ( 0.00%) 84.41 * -52.48%*
Amean is 13.25 ( 0.00%) 17.82 * -34.47%*
Amean lu 1065.08 ( 0.00%) 1090.44 ( -2.38%)
Amean mg 89.96 ( 0.00%) 84.28 * 6.31%*
Amean sp 1579.52 ( 0.00%) 1506.16 * 4.64%*
Amean ua 611.87 ( 0.00%) 663.26 * -8.40%*
This is the socket machine and with HT On, there are 80 logical CPUs
versus HT Off with 40 logical CPUs.
--
Mel Gorman
SUSE Labs
On Fri, Apr 26, 2019 at 10:42:22AM +0200, Ingo Molnar wrote:
> > It should, but it's not perfect. For example, wake_affine_idle does not
> > take sibling activity into account even though select_idle_sibling *may*
> > take it into account. Even select_idle_sibling in its fast path may use
> > an SMT sibling instead of searching.
> >
> > There are also potential side-effects with cpuidle. Some workloads
> > migrate around the socket as they are communicating because of how the
> > search for an idle CPU works. With SMT on, there is potentially a longer
> > opportunity for a core to reach a deep c-state and incur a bigger wakeup
> > latency. This is a very weak theory but I've seen cases where latency
> > sensitive workloads with only two communicating tasks are affected by
> > CPUs reaching low c-states due to migrations.
> >
> > > Clearly it doesn't.
> > >
> >
> > It's more that it's best effort to wakeup quickly instead of being perfect
> > by using an expensive search every time.
>
> Yeah, but your numbers suggest that for *most* not heavily interacting
> under-utilized CPU bound workloads we hurt in the 5-10% range compared to
> no-SMT - more in some cases.
>
Indeed, it was higher than expected and we can't even use the excuse that
more resources are available to a single logical CPU as the scheduler is
meant to keep them apart.
> So we avoid a maybe 0.1% scheduler placement overhead but inflict 5-10%
> harm on the workload, and also blow up stddev by randomly co-scheduling
> two tasks on the same physical core? Not a good trade-off.
>
> I really think we should implement a relatively strict physical core
> placement policy in the under-utilized case, and resist any attempts to
> weaken this for special workloads that ping-pong quickly and benefit from
> sharing the same physical core.
>
It's worth a shot at least. Changes should mostly be in the wake_affine
path for most loads of interest.
> I.e. as long as load is kept below ~50% the SMT and !SMT benchmark
> results and stddev numbers should match up. (With a bit of leeway if the
> workload gets near to 50% or occasionally goes above it.)
>
> There's absolutely no excuse for these numbers at 30-40% load levels I
> think.
>
Agreed. I'll put it on the todo list but there is no way I'll get to it
in the short term due to LSF/MM. Minimally I'll put some thought into
tooling to track how often siblings are used with some reporting on when a
sibling was used when there was an idle core available. That'll at least
quantify the problem and verify the hypothesis.
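A rough user-space sketch of that kind of tooling (hypothetical: it samples
/proc/stat rather than scheduler tracepoints, and keys cores on core_id
alone, which conflates sockets):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/*
 * Hypothetical sketch: sample /proc/stat twice, treat a CPU as busy if it
 * accumulated non-idle time in the interval, and count cores that had more
 * than one busy sibling while other cores sat fully idle. A real tool
 * would key on (package_id, core_id) and use scheduler tracepoints instead
 * of 1-second /proc/stat deltas.
 */
#define MAX_CPUS 1024

static int read_busy(unsigned long long *busy)
{
	FILE *f = fopen("/proc/stat", "r");
	char line[512];
	int ncpus = 0;

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f)) {
		int cpu;
		unsigned long long u, n, s, idle, iowait, irq, sirq;

		/* only "cpuN ..." lines, skip the aggregate "cpu ..." line */
		if (strncmp(line, "cpu", 3) || line[3] < '0' || line[3] > '9')
			continue;
		if (sscanf(line + 3, "%d %llu %llu %llu %llu %llu %llu %llu",
			   &cpu, &u, &n, &s, &idle, &iowait, &irq, &sirq) != 8)
			continue;
		if (cpu < MAX_CPUS) {
			busy[cpu] = u + n + s + irq + sirq;
			if (cpu + 1 > ncpus)
				ncpus = cpu + 1;
		}
	}
	fclose(f);
	return ncpus;
}

static int core_of(int cpu)
{
	char path[128];
	int core = -1;
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu%d/topology/core_id", cpu);
	f = fopen(path, "r");
	if (f) {
		if (fscanf(f, "%d", &core) != 1)
			core = -1;
		fclose(f);
	}
	return core;
}

int main(void)
{
	static unsigned long long before[MAX_CPUS], after[MAX_CPUS];
	static int busy_threads[MAX_CPUS], threads[MAX_CPUS];
	int ncpus, cpu, stacked = 0, idle_cores = 0;

	ncpus = read_busy(before);
	sleep(1);
	read_busy(after);

	for (cpu = 0; cpu < ncpus; cpu++) {
		int core = core_of(cpu);

		if (core < 0 || core >= MAX_CPUS)
			continue;
		threads[core]++;
		if (after[cpu] > before[cpu])
			busy_threads[core]++;
	}
	for (cpu = 0; cpu < MAX_CPUS; cpu++) {
		if (!threads[cpu])
			continue;
		if (busy_threads[cpu] > 1)
			stacked++;
		else if (!busy_threads[cpu])
			idle_cores++;
	}
	printf("cores with >1 busy sibling: %d, fully idle cores: %d\n",
	       stacked, idle_cores);
	return 0;
}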
--
Mel Gorman
SUSE Labs
On Tue, Apr 23, 2019 at 04:18:14PM +0000, Vineeth Remanan Pillai wrote:
> +static struct task_struct *
> +pick_task_fair(struct rq *rq)
> +{
> + struct cfs_rq *cfs_rq = &rq->cfs;
> + struct sched_entity *se;
> +
> + if (!cfs_rq->nr_running)
> + return NULL;
> +
> + do {
> + struct sched_entity *curr = cfs_rq->curr;
> +
> + se = pick_next_entity(cfs_rq, NULL);
> +
> + if (!(se || curr))
> + return NULL;
I didn't get around to reading the original discussion here, but how can
that possibly be?
I can see !se, in that case curr is still selected.
I can also see !curr, in that case curr is put.
But I cannot see !se && !curr, per the above check we know
cfs_rq->nr_running != 0, so there must be a cfs task to find. This means
either curr or se must exist.
> +
> + if (curr) {
> + if (se && curr->on_rq)
> + update_curr(cfs_rq);
> +
> + if (!se || entity_before(curr, se))
> + se = curr;
> + }
> +
> + cfs_rq = group_cfs_rq(se);
> + } while (cfs_rq);
> +
> + return task_of(se);
> +}
On Thu, Apr 25, 2019 at 08:53:43PM +0200 Ingo Molnar wrote:
> Interesting. This strongly suggests sub-optimal SMT-scheduling in the
> non-saturated HT case, i.e. a scheduler balancing bug.
>
> As long as loads are clearly below the physical cores count (which they
> are in the early phases of your table) the scheduler should spread tasks
> without overlapping two tasks on the same core.
>
> Clearly it doesn't.
>
That's especially true if there are cgroups with different numbers of
tasks in them involved.
Here's an example showing the average number of tasks on each of the 4 numa
nodes during a test run. 20 cpus per node. There are 78 threads total, 76
for lu and 2 stress cpu hogs. So fewer than the 80 CPUs on the box. The GROUP
test has the two stresses and lu in distinct cgroups. The NORMAL test has them
all in one. This is from 5.0-rc3+, but the version doesn't matter. It's
reproducible on any kernel. SMT is on, but that also doesn't matter here.
The first two lines show where the stress jobs ran and the last two show
where the 76 threads of lu ran.
GROUP_1.stress.ps.numa.hist Average 1.00 1.00
NORMAL_1.stress.ps.numa.hist Average 0.00 1.10 0.90
lu.C.x_76_GROUP_1.ps.numa.hist Average 10.97 11.78 26.28 26.97
lu.C.x_76_NORMAL_1.ps.numa.hist Average 19.70 18.70 17.80 19.80
The NORMAL test is evenly balanced across the 20 cpus per numa node. There
is between a 4x and 10x performance hit to the lu benchmark between group
and normal in any of these test runs. In this particular case it was 10x:
============76_GROUP========Mop/s===================================
min q1 median q3 max
3776.51 3776.51 3776.51 3776.51 3776.51
============76_GROUP========time====================================
min q1 median q3 max
539.92 539.92 539.92 539.92 539.92
============76_NORMAL========Mop/s===================================
min q1 median q3 max
39386 39386 39386 39386 39386
============76_NORMAL========time====================================
min q1 median q3 max
51.77 51.77 51.77 51.77 51.77
This is a bit off topic, but since balancing bugs were mentioned and I've been
trying to track this down for a while (and learning the scheduler code in
the process) I figured I'd just throw it out there :)
Cheers,
Phil
--
On Thu, Apr 25, 2019 at 10:26:53AM -0400, Phil Auld wrote:
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index e8e5f26db052..b312ea1e28a4 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -7541,6 +7541,9 @@ static int cpu_core_tag_write_u64(struct cgroup_subsys_state *css, struct cftype
> if (val > 1)
> return -ERANGE;
>
> + if (num_online_cpus() <= 1)
> + return -EINVAL;
We actually know if there is SMT on the system or not, which is a much
better indication still:
if (!static_branch_likely(&sched_smt_present))
return -EINVAL;
> if (tg->tagged == !!val)
> return 0;
>
>
>
>
> --
On Fri, Apr 26, 2019 at 04:13:07PM +0200 Peter Zijlstra wrote:
> On Thu, Apr 25, 2019 at 10:26:53AM -0400, Phil Auld wrote:
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index e8e5f26db052..b312ea1e28a4 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -7541,6 +7541,9 @@ static int cpu_core_tag_write_u64(struct cgroup_subsys_state *css, struct cftype
> > if (val > 1)
> > return -ERANGE;
> >
> > + if (num_online_cpus() <= 1)
> > + return -EINVAL;
>
> We actually know if there is SMT on the system or not, which is a much
> better indication still:
>
> if (!static_branch_likely(&sched_smt_present))
> return -EINVAL;
>
> > if (tg->tagged == !!val)
> > return 0;
> >
> >
> >
> >
> > --
Yeah, I thought there was probably a better check.
Thanks!
--
On Tue, Apr 23, 2019 at 04:18:21PM +0000, Vineeth Remanan Pillai wrote:
(you lost From: Julien)
> During core scheduling, it can happen that the current rq selects a
> non-tagged process while the sibling might be idling even though it
> had something to run (because the sibling selected idle to match the
> tagged process in previous tag matching iteration). We need to wake up
> the sibling if such a situation arise.
>
> Signed-off-by: Vineeth Remanan Pillai <[email protected]>
> Signed-off-by: Julien Desfossez <[email protected]>
> ---
> kernel/sched/core.c | 15 +++++++++++++++
> 1 file changed, 15 insertions(+)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index e8f5ec641d0a..0e3c51a1b54a 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3775,6 +3775,21 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> */
> if (i == cpu && !rq->core->core_cookie && !p->core_cookie) {
> next = p;
> + rq->core_pick = NULL;
> +
> + /*
> + * If the sibling is idling, we might want to wake it
> + * so that it can check for any runnable tasks that did
> + * not get a chance to run due to previous task matching.
> + */
> + for_each_cpu(j, smt_mask) {
> + struct rq *rq_j = cpu_rq(j);
> + rq_j->core_pick = NULL;
> + if (j != cpu &&
> + is_idle_task(rq_j->curr) && rq_j->nr_running) {
> + resched_curr(rq_j);
> + }
> + }
> goto done;
> }
Anyway, as written here:
https://lkml.kernel.org/r/[email protected]
I think this isn't quite right. Does the below patch (which actually
removes lines) also work?
As written before; the intent was to not allow that optimization if the
last pick had a cookie; thereby doing a (last) core wide selection when
we go to a 0-cookie, and this then includes kicking forced-idle cores.
---
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3574,18 +3574,7 @@ static struct task_struct *
pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max)
{
struct task_struct *class_pick, *cookie_pick;
- unsigned long cookie = 0UL;
-
- /*
- * We must not rely on rq->core->core_cookie here, because we fail to reset
- * rq->core->core_cookie on new picks, such that we can detect if we need
- * to do single vs multi rq task selection.
- */
-
- if (max && max->core_cookie) {
- WARN_ON_ONCE(rq->core->core_cookie != max->core_cookie);
- cookie = max->core_cookie;
- }
+ unsigned long cookie = rq->core->core_cookie;
class_pick = class->pick_task(rq);
if (!cookie)
@@ -3612,6 +3601,7 @@ pick_next_task(struct rq *rq, struct tas
struct task_struct *next, *max = NULL;
const struct sched_class *class;
const struct cpumask *smt_mask;
+ unsigned long prev_cookie;
int i, j, cpu;
if (!sched_core_enabled(rq))
@@ -3653,7 +3643,10 @@ pick_next_task(struct rq *rq, struct tas
*/
rq->core->core_task_seq++;
+ prev_cookie = rq->core->core_cookie;
+
/* reset state */
+ rq->core->core_cookie = 0UL;
for_each_cpu(i, smt_mask) {
struct rq *rq_i = cpu_rq(i);
@@ -3688,7 +3681,7 @@ pick_next_task(struct rq *rq, struct tas
* If there weren't no cookies; we don't need
* to bother with the other siblings.
*/
- if (i == cpu && !rq->core->core_cookie)
+ if (i == cpu && !prev_cookie)
goto next_class;
continue;
@@ -3698,7 +3691,7 @@ pick_next_task(struct rq *rq, struct tas
* Optimize the 'normal' case where there aren't any
* cookies and we don't need to sync up.
*/
- if (i == cpu && !rq->core->core_cookie && !p->core_cookie) {
+ if (i == cpu && !prev_cookie && !p->core_cookie) {
next = p;
goto done;
}
>
> I didn't get around to reading the original discussion here, but how can
> that possibly be?
>
> I can see !se, in that case curr is still selected.
>
> I can also see !curr, in that case curr is put.
>
> But I cannot see !se && !curr, per the above check we know
> cfs_rq->nr_running != 0, so there must be a cfs task to find. This means
> either curr or se must exist.
This fix was suggested as a quick fix for a crash seen in v1. But
I agree with you that this should be a bug if it happens and should
be investigated. I have tried in v2 and can no longer reproduce the
crash. Will remove the check in v3.
Thanks
On 4/26/19 3:43 AM, Mel Gorman wrote:
> On Fri, Apr 26, 2019 at 10:42:22AM +0200, Ingo Molnar wrote:
>>> It should, but it's not perfect. For example, wake_affine_idle does not
>>> take sibling activity into account even though select_idle_sibling *may*
>>> take it into account. Even select_idle_sibling in its fast path may use
>>> an SMT sibling instead of searching.
>>>
>>> There are also potential side-effects with cpuidle. Some workloads
>>> migrate around the socket as they are communicating because of how the
>>> search for an idle CPU works. With SMT on, there is potentially a longer
>>> opportunity for a core to reach a deep c-state and incur a bigger wakeup
>>> latency. This is a very weak theory but I've seen cases where latency
>>> sensitive workloads with only two communicating tasks are affected by
>>> CPUs reaching low c-states due to migrations.
>>>
>>>> Clearly it doesn't.
>>>>
>>> It's more that it's best effort to wakeup quickly instead of being perfect
>>> by using an expensive search every time.
>> Yeah, but your numbers suggest that for *most* not heavily interacting
>> under-utilized CPU bound workloads we hurt in the 5-10% range compared to
>> no-SMT - more in some cases.
>>
> Indeed, it was higher than expected and we can't even use the excuse that
> more resources are available to a single logical CPU as the scheduler is
> meant to keep them apart.
>
>> So we avoid a maybe 0.1% scheduler placement overhead but inflict 5-10%
>> harm on the workload, and also blow up stddev by randomly co-scheduling
>> two tasks on the same physical core? Not a good trade-off.
>>
>> I really think we should implement a relatively strict physical core
>> placement policy in the under-utilized case, and resist any attempts to
>> weaken this for special workloads that ping-pong quickly and benefit from
>> sharing the same physical core.
>>
> It's worth a shot at least. Changes should mostly be in the wake_affine
> path for most loads of interest.
Doesn't select_idle_sibling already try to do that by calling
select_idle_core? For our OLTP workload we infact found the cost of
select_idle_core was actually hurting more than it helped to find a fully
idle core, so a net negative.
On Fri, Apr 26, 2019 at 11:37:11AM -0700, Subhra Mazumdar wrote:
> > > So we avoid a maybe 0.1% scheduler placement overhead but inflict 5-10%
> > > harm on the workload, and also blow up stddev by randomly co-scheduling
> > > two tasks on the same physical core? Not a good trade-off.
> > >
> > > I really think we should implement a relatively strict physical core
> > > placement policy in the under-utilized case, and resist any attempts to
> > > weaken this for special workloads that ping-pong quickly and benefit from
> > > sharing the same physical core.
> > >
> > It's worth a shot at least. Changes should mostly be in the wake_affine
> > path for most loads of interest.
>
> Doesn't select_idle_sibling already try to do that by calling
> select_idle_core? For our OLTP workload we in fact found the cost of
> select_idle_core was actually hurting more than it helped to find a fully
> idle core, so a net negative.
>
select_idle_sibling is not guaranteed to call select_idle_core or to avoid
selecting an HT sibling whose other sibling is !idle but yes, in that path,
the search cost is a general concern which is why any change there is
tricky at best.
--
Mel Gorman
SUSE Labs
On Thu, Apr 25, 2019 at 5:55 PM Ingo Molnar <[email protected]> wrote:
> * Aubrey Li <[email protected]> wrote:
> > On Wed, Apr 24, 2019 at 10:00 PM Julien Desfossez
> > <[email protected]> wrote:
> > >
> > > On 24-Apr-2019 09:13:10 PM, Aubrey Li wrote:
> > > > On Wed, Apr 24, 2019 at 12:18 AM Vineeth Remanan Pillai
> > > > <[email protected]> wrote:
> > > > >
> > > > > Second iteration of the core-scheduling feature.
> > > > >
> > > > > This version fixes apparent bugs and performance issues in v1. This
> > > > > doesn't fully address the issue of core sharing between processes
> > > > > with different tags. Core sharing still happens 1% to 5% of the time
> > > > > based on the nature of workload and timing of the runnable processes.
> > > > >
> > > > > Changes in v2
> > > > > -------------
> > > > > - rebased on mainline commit: 6d906f99817951e2257d577656899da02bb33105
> > > >
> > > > Thanks to post v2, based on this version, here is my benchmarks result.
> > > >
> > > > Environment setup
> > > > --------------------------
> > > > Skylake server, 2 numa nodes, 104 CPUs (HT on)
> > > > cgroup1 workload, sysbench (CPU intensive non AVX workload)
> > > > cgroup2 workload, gemmbench (AVX512 workload)
> > > >
> > > > Case 1: task number < CPU num
> > > > --------------------------------------------
> > > > 36 sysbench threads in cgroup1
> > > > 36 gemmbench threads in cgroup2
> > > >
> > > > core sched off:
> > > > - sysbench 95th percentile latency(ms): avg = 4.952, stddev = 0.55342
> > > > core sched on:
> > > > - sysbench 95th percentile latency(ms): avg = 3.549, stddev = 0.04449
> > > >
> > > > Due to core cookie matching, sysbench tasks won't be affected by AVX512
> > > > tasks; latency has a ~28% improvement!!!
> > > >
> > > > Case 2: task number > CPU number
> > > > -------------------------------------------------
> > > > 72 sysbench threads in cgroup1
> > > > 72 gemmbench threads in cgroup2
> > > >
> > > > core sched off:
> > > > - sysbench 95th percentile latency(ms): avg = 11.914, stddev = 3.259
> > > > core sched on:
> > > > - sysbench 95th percentile latency(ms): avg = 13.289, stddev = 4.863
> > > >
> > > > So now it is not only power: security and performance are also a pair of contradictions.
> > > > Due to core cookies not matching and the forced idle this introduces, latency has a ~12%
> > > > regression.
> > > >
> > > > Any comments?
> > >
> > > Would it be possible to post the results with HT off as well ?
> >
> > What's the point of turning HT off here? The latency is sensitive to the
> > relationship between the task number and the CPU number. Usually, fewer
> > CPUs mean more run queue wait time and worse results.
>
> HT-off numbers are mandatory: turning HT off is by far the simplest way
> to solve the security bugs in these CPUs.
>
> Any core-scheduling solution *must* perform better than HT-off for all
> relevant workloads, otherwise what's the point?
>
I have the same environment setup above, for nosmt cases, I used
/sys interface Thomas mentioned, below is the result:
NA/AVX   baseline(std%)   coresched(std%)   +/-       nosmt(std%)      +/-
1/1      1.987( 1.97%)    2.043( 1.76%)     -2.84%    1.985( 1.70%)    0.12%
2/2      2.074( 1.16%)    2.057( 2.09%)     0.81%     2.072( 0.77%)    0.10%
4/4      2.140( 0.00%)    2.138( 0.49%)     0.09%     2.137( 0.89%)    0.12%
8/8      2.140( 0.00%)    2.144( 0.53%)     -0.17%    2.140( 0.00%)    0.00%
16/16    2.361( 2.99%)    2.369( 2.65%)     -0.30%    2.406( 2.53%)    -1.87%
32/32    5.032( 8.68%)    3.485( 0.49%)     30.76%    6.002(27.21%)    -19.27%
64/64    7.577(34.35%)    3.972(23.18%)     47.57%    18.235(14.14%)   -140.68%
128/128  24.639(14.28%)   27.440( 8.24%)    -11.37%   34.746( 6.92%)   -41.02%
256/256  38.797( 8.59%)   44.067(16.20%)    -13.58%   42.536( 7.57%)   -9.64%
Thanks,
-Aubrey
* Mel Gorman <[email protected]> wrote:
> On Fri, Apr 26, 2019 at 11:45:45AM +0200, Ingo Molnar wrote:
> >
> > * Mel Gorman <[email protected]> wrote:
> >
> > > > > I can show a comparison with equal levels of parallelisation but with
> > > > > HT off, it is a completely broken configuration and I do not think a
> > > > > comparison like that makes any sense.
> > > >
> > > > I would still be interested in that comparison, because I'd like
> > > > to learn whether there's any true *inherent* performance advantage to
> > > > HyperThreading for that particular workload, for exactly tuned
> > > > parallelism.
> > > >
> > >
> > > It really isn't a fair comparison. MPI seems to behave very differently
> > > when a machine is saturated. It's documented as changing its behaviour
> > > as it tries to avoid the worst consequences of saturation.
> > >
> > > Curiously, the results on the 2-socket machine were not as bad as I
> > > feared when the HT configuration is running with twice the number of
> > > threads as there are CPUs
> > >
> > > Amean bt 771.15 ( 0.00%) 1086.74 * -40.93%*
> > > Amean cg 445.92 ( 0.00%) 543.41 * -21.86%*
> > > Amean ep 70.01 ( 0.00%) 96.29 * -37.53%*
> > > Amean is 16.75 ( 0.00%) 21.19 * -26.51%*
> > > Amean lu 882.84 ( 0.00%) 595.14 * 32.59%*
> > > Amean mg 84.10 ( 0.00%) 80.02 * 4.84%*
> > > Amean sp 1353.88 ( 0.00%) 1384.10 * -2.23%*
> >
> > Yeah, so what I wanted to suggest is a parallel numeric throughput test
> > with few inter-process data dependencies, and see whether HT actually
> > improves total throughput versus the no-HT case.
> >
> > No over-saturation - but exactly as many threads as logical CPUs.
> >
> > I.e. with 20 physical cores and 40 logical CPUs the numbers to compare
> > would be a 'nosmt' benchmark running 20 threads, versus a SMT test
> > running 40 threads.
> >
> > I.e. how much does SMT improve total throughput when the workload's
> > parallelism is tuned to utilize 100% of the available CPUs?
> >
> > Does this make sense?
> >
>
> Yes. Here is the comparison.
>
> Amean bt 678.75 ( 0.00%) 789.13 * -16.26%*
> Amean cg 261.22 ( 0.00%) 428.82 * -64.16%*
> Amean ep 55.36 ( 0.00%) 84.41 * -52.48%*
> Amean is 13.25 ( 0.00%) 17.82 * -34.47%*
> Amean lu 1065.08 ( 0.00%) 1090.44 ( -2.38%)
> Amean mg 89.96 ( 0.00%) 84.28 * 6.31%*
> Amean sp 1579.52 ( 0.00%) 1506.16 * 4.64%*
> Amean ua 611.87 ( 0.00%) 663.26 * -8.40%*
>
> This is the socket machine and with HT On, there are 80 logical CPUs
> versus HT Off with 40 logical CPUs.
That's very interesting - so for most workloads HyperThreading is a
massive loss, and for 'mg' and 'sp' it's a 5-6% win?
I'm wondering how much of say the 'cg' workload's -64% loss could be task
placement inefficiency - or are these all probable effects of 80 threads
trying to use too many cache and memory resources and thus utilizing them
all far too inefficiently?
Are these relatively simple numeric workloads, with not much scheduling
and good overall pinning of tasks, or is it more complex than that?
Also, the takeaway appears to be: by using HT there's a potential
advantage of +6% on the benefit side, but a potential -50%+ performance
hit on the risk side?
I believe these results also *strongly* support a much stricter task
placement policy in up to 50% saturation of SMT systems - it's almost
always going to be a win for workloads that are actually trying to fill
in some useful role.
Thanks,
Ingo
* Aubrey Li <[email protected]> wrote:
> I have the same environment setup above, for nosmt cases, I used
> /sys interface Thomas mentioned, below is the result:
>
> NA/AVX   baseline(std%)   coresched(std%)   +/-       nosmt(std%)      +/-
> 1/1      1.987( 1.97%)    2.043( 1.76%)     -2.84%    1.985( 1.70%)    0.12%
> 2/2      2.074( 1.16%)    2.057( 2.09%)     0.81%     2.072( 0.77%)    0.10%
> 4/4      2.140( 0.00%)    2.138( 0.49%)     0.09%     2.137( 0.89%)    0.12%
> 8/8      2.140( 0.00%)    2.144( 0.53%)     -0.17%    2.140( 0.00%)    0.00%
> 16/16    2.361( 2.99%)    2.369( 2.65%)     -0.30%    2.406( 2.53%)    -1.87%
> 32/32    5.032( 8.68%)    3.485( 0.49%)     30.76%    6.002(27.21%)    -19.27%
> 64/64    7.577(34.35%)    3.972(23.18%)     47.57%    18.235(14.14%)   -140.68%
> 128/128  24.639(14.28%)   27.440( 8.24%)    -11.37%   34.746( 6.92%)   -41.02%
> 256/256  38.797( 8.59%)   44.067(16.20%)    -13.58%   42.536( 7.57%)   -9.64%
What do these numbers mean? Are these latencies, i.e. lower is better?
Thanks,
Ingo
On Sat, Apr 27, 2019 at 5:17 PM Ingo Molnar <[email protected]> wrote:
>
>
> * Aubrey Li <[email protected]> wrote:
>
> > I have the same environment setup above, for nosmt cases, I used
> > /sys interface Thomas mentioned, below is the result:
> >
> > NA/AVX   baseline(std%)   coresched(std%)   +/-       nosmt(std%)      +/-
> > 1/1      1.987( 1.97%)    2.043( 1.76%)     -2.84%    1.985( 1.70%)    0.12%
> > 2/2      2.074( 1.16%)    2.057( 2.09%)     0.81%     2.072( 0.77%)    0.10%
> > 4/4      2.140( 0.00%)    2.138( 0.49%)     0.09%     2.137( 0.89%)    0.12%
> > 8/8      2.140( 0.00%)    2.144( 0.53%)     -0.17%    2.140( 0.00%)    0.00%
> > 16/16    2.361( 2.99%)    2.369( 2.65%)     -0.30%    2.406( 2.53%)    -1.87%
> > 32/32    5.032( 8.68%)    3.485( 0.49%)     30.76%    6.002(27.21%)    -19.27%
> > 64/64    7.577(34.35%)    3.972(23.18%)     47.57%    18.235(14.14%)   -140.68%
> > 128/128  24.639(14.28%)   27.440( 8.24%)    -11.37%   34.746( 6.92%)   -41.02%
> > 256/256  38.797( 8.59%)   44.067(16.20%)    -13.58%   42.536( 7.57%)   -9.64%
>
> What do these numbers mean? Are these latencies, i.e. lower is better?
Yeah, with the setup above, I run sysbench (non-AVX task, NA) and gemmbench
(AVX512 task, AVX) at different utilization levels. The machine has 104 CPUs, so
nosmt has 52 CPUs. These numbers are the 95th percentile latency of sysbench;
lower is better.
Thanks,
-Aubrey
* Aubrey Li <[email protected]> wrote:
> On Sat, Apr 27, 2019 at 5:17 PM Ingo Molnar <[email protected]> wrote:
> >
> >
> > * Aubrey Li <[email protected]> wrote:
> >
> > > I have the same environment setup above, for nosmt cases, I used
> > > /sys interface Thomas mentioned, below is the result:
> > >
> > > NA/AVX   baseline(std%)   coresched(std%)   +/-       nosmt(std%)      +/-
> > > 1/1      1.987( 1.97%)    2.043( 1.76%)     -2.84%    1.985( 1.70%)    0.12%
> > > 2/2      2.074( 1.16%)    2.057( 2.09%)     0.81%     2.072( 0.77%)    0.10%
> > > 4/4      2.140( 0.00%)    2.138( 0.49%)     0.09%     2.137( 0.89%)    0.12%
> > > 8/8      2.140( 0.00%)    2.144( 0.53%)     -0.17%    2.140( 0.00%)    0.00%
> > > 16/16    2.361( 2.99%)    2.369( 2.65%)     -0.30%    2.406( 2.53%)    -1.87%
> > > 32/32    5.032( 8.68%)    3.485( 0.49%)     30.76%    6.002(27.21%)    -19.27%
> > > 64/64    7.577(34.35%)    3.972(23.18%)     47.57%    18.235(14.14%)   -140.68%
> > > 128/128  24.639(14.28%)   27.440( 8.24%)    -11.37%   34.746( 6.92%)   -41.02%
> > > 256/256  38.797( 8.59%)   44.067(16.20%)    -13.58%   42.536( 7.57%)   -9.64%
> >
> > What do these numbers mean? Are these latencies, i.e. lower is better?
>
> Yeah, with the setup above, I run sysbench (non-AVX task, NA) and gemmbench
> (AVX512 task, AVX) at different utilization levels. The machine has 104 CPUs, so
> nosmt has 52 CPUs. These numbers are the 95th percentile latency of sysbench;
> lower is better.
But what we are really interested in are throughput numbers under these
three kernel variants, right?
Thanks,
Ingo
On Sat, Apr 27, 2019 at 10:21 PM Ingo Molnar <[email protected]> wrote:
>
> * Aubrey Li <[email protected]> wrote:
>
> > On Sat, Apr 27, 2019 at 5:17 PM Ingo Molnar <[email protected]> wrote:
> > >
> > >
> > > * Aubrey Li <[email protected]> wrote:
> > >
> > > > I have the same environment setup above, for nosmt cases, I used
> > > > /sys interface Thomas mentioned, below is the result:
> > > >
> > > > NA/AVX   baseline(std%)   coresched(std%)   +/-       nosmt(std%)      +/-
> > > > 1/1      1.987( 1.97%)    2.043( 1.76%)     -2.84%    1.985( 1.70%)    0.12%
> > > > 2/2      2.074( 1.16%)    2.057( 2.09%)     0.81%     2.072( 0.77%)    0.10%
> > > > 4/4      2.140( 0.00%)    2.138( 0.49%)     0.09%     2.137( 0.89%)    0.12%
> > > > 8/8      2.140( 0.00%)    2.144( 0.53%)     -0.17%    2.140( 0.00%)    0.00%
> > > > 16/16    2.361( 2.99%)    2.369( 2.65%)     -0.30%    2.406( 2.53%)    -1.87%
> > > > 32/32    5.032( 8.68%)    3.485( 0.49%)     30.76%    6.002(27.21%)    -19.27%
> > > > 64/64    7.577(34.35%)    3.972(23.18%)     47.57%    18.235(14.14%)   -140.68%
> > > > 128/128  24.639(14.28%)   27.440( 8.24%)    -11.37%   34.746( 6.92%)   -41.02%
> > > > 256/256  38.797( 8.59%)   44.067(16.20%)    -13.58%   42.536( 7.57%)   -9.64%
> > >
> > > What do these numbers mean? Are these latencies, i.e. lower is better?
> >
> > Yeah, with the setup above, I run sysbench (non-AVX task, NA) and gemmbench
> > (AVX512 task, AVX) at different utilization levels. The machine has 104 CPUs, so
> > nosmt has 52 CPUs. These numbers are the 95th percentile latency of sysbench;
> > lower is better.
>
> But what we are really interested in are throughput numbers under these
> three kernel variants, right?
>
These are the sysbench events-per-second numbers; higher is better.
NA/AVX   baseline(std%)   coresched(std%)   +/-       nosmt(std%)      +/-
1/1      508.5( 0.2%)     504.7( 1.1%)      -0.8%     509.0( 0.2%)     0.1%
2/2      1000.2( 1.4%)    1004.1( 1.6%)     0.4%      997.6( 1.2%)     -0.3%
4/4      1912.1( 1.0%)    1904.2( 1.1%)     -0.4%     1914.9( 1.3%)    0.1%
8/8      3753.5( 0.3%)    3748.2( 0.3%)     -0.1%     3751.3( 0.4%)    -0.1%
16/16    7139.3( 2.4%)    7137.9( 1.8%)     -0.0%     7049.2( 2.4%)    -1.3%
32/32    10899.0( 4.2%)   10780.3( 4.4%)    -1.1%     10339.2( 9.6%)   -5.1%
64/64    15086.1(11.5%)   14262.0( 8.2%)    -5.5%     11168.7(22.2%)   -26.0%
128/128  15371.9(22.0%)   14675.8(14.4%)    -4.5%     10963.9(18.5%)   -28.7%
256/256  15990.8(22.0%)   12227.9(10.3%)    -23.5%    10469.9(19.6%)   -34.5%
* Aubrey Li <[email protected]> wrote:
> > But what we are really interested in are throughput numbers under
> > these three kernel variants, right?
>
> These are the sysbench events-per-second numbers; higher is better.
>
> NA/AVX   baseline(std%)   coresched(std%)   +/-       nosmt(std%)      +/-
> 1/1      508.5( 0.2%)     504.7( 1.1%)      -0.8%     509.0( 0.2%)     0.1%
> 2/2      1000.2( 1.4%)    1004.1( 1.6%)     0.4%      997.6( 1.2%)     -0.3%
> 4/4      1912.1( 1.0%)    1904.2( 1.1%)     -0.4%     1914.9( 1.3%)    0.1%
> 8/8      3753.5( 0.3%)    3748.2( 0.3%)     -0.1%     3751.3( 0.4%)    -0.1%
> 16/16    7139.3( 2.4%)    7137.9( 1.8%)     -0.0%     7049.2( 2.4%)    -1.3%
> 32/32    10899.0( 4.2%)   10780.3( 4.4%)    -1.1%     10339.2( 9.6%)   -5.1%
> 64/64    15086.1(11.5%)   14262.0( 8.2%)    -5.5%     11168.7(22.2%)   -26.0%
> 128/128  15371.9(22.0%)   14675.8(14.4%)    -4.5%     10963.9(18.5%)   -28.7%
> 256/256  15990.8(22.0%)   12227.9(10.3%)    -23.5%    10469.9(19.6%)   -34.5%
So because I'm a big fan of presenting data in a readable fashion, here
are your results, tabulated:
#
# Sysbench throughput comparison of 3 different kernels at different
# load levels, higher numbers are better:
#
.--------------------------------------|----------------------------------------------------------------.
| NA/AVX vanilla-SMT [stddev%] |coresched-SMT [stddev%] +/- | no-SMT [stddev%] +/- |
|--------------------------------------|----------------------------------------------------------------|
| 1/1 508.5 [ 0.2% ] | 504.7 [ 1.1% ] 0.8% | 509.0 [ 0.2% ] 0.1% |
| 2/2 1000.2 [ 1.4% ] | 1004.1 [ 1.6% ] 0.4% | 997.6 [ 1.2% ] 0.3% |
| 4/4 1912.1 [ 1.0% ] | 1904.2 [ 1.1% ] 0.4% | 1914.9 [ 1.3% ] 0.1% |
| 8/8 3753.5 [ 0.3% ] | 3748.2 [ 0.3% ] 0.1% | 3751.3 [ 0.4% ] 0.1% |
| 16/16 7139.3 [ 2.4% ] | 7137.9 [ 1.8% ] 0.0% | 7049.2 [ 2.4% ] 1.3% |
| 32/32 10899.0 [ 4.2% ] | 10780.3 [ 4.4% ] -1.1% | 10339.2 [ 9.6% ] -5.1% |
| 64/64 15086.1 [ 11.5% ] | 14262.0 [ 8.2% ] -5.5% | 11168.7 [ 22.2% ] -26.0% |
| 128/128 15371.9 [ 22.0% ] | 14675.8 [ 14.4% ] -4.5% | 10963.9 [ 18.5% ] -28.7% |
| 256/256 15990.8 [ 22.0% ] | 12227.9 [ 10.3% ] -23.5% | 10469.9 [ 19.6% ] -34.5% |
'--------------------------------------|----------------------------------------------------------------'
One major thing that sticks out is that if we compare the stddev numbers
to the +/- comparisons then it's pretty clear that the benchmarks are
very noisy: in all but the last row stddev is actually higher than the
measured effect.
So what does 'stddev' mean here, exactly? The stddev of multiple runs,
i.e. measured run-to-run variance? Or is it some internal metric of the
benchmark?
Thanks,
Ingo
On Sun, Apr 28, 2019 at 5:33 PM Ingo Molnar <[email protected]> wrote:
> So because I'm a big fan of presenting data in a readable fashion, here
> are your results, tabulated:
I thought I tried my best to make it readable, but this one looks much better,
thanks, ;-)
>
> #
> # Sysbench throughput comparison of 3 different kernels at different
> # load levels, higher numbers are better:
> #
>
> .--------------------------------------|----------------------------------------------------------------.
> | NA/AVX vanilla-SMT [stddev%] |coresched-SMT [stddev%] +/- | no-SMT [stddev%] +/- |
> |--------------------------------------|----------------------------------------------------------------|
> | 1/1 508.5 [ 0.2% ] | 504.7 [ 1.1% ] 0.8% | 509.0 [ 0.2% ] 0.1% |
> | 2/2 1000.2 [ 1.4% ] | 1004.1 [ 1.6% ] 0.4% | 997.6 [ 1.2% ] 0.3% |
> | 4/4 1912.1 [ 1.0% ] | 1904.2 [ 1.1% ] 0.4% | 1914.9 [ 1.3% ] 0.1% |
> | 8/8 3753.5 [ 0.3% ] | 3748.2 [ 0.3% ] 0.1% | 3751.3 [ 0.4% ] 0.1% |
> | 16/16 7139.3 [ 2.4% ] | 7137.9 [ 1.8% ] 0.0% | 7049.2 [ 2.4% ] 1.3% |
> | 32/32 10899.0 [ 4.2% ] | 10780.3 [ 4.4% ] -1.1% | 10339.2 [ 9.6% ] -5.1% |
> | 64/64 15086.1 [ 11.5% ] | 14262.0 [ 8.2% ] -5.5% | 11168.7 [ 22.2% ] -26.0% |
> | 128/128 15371.9 [ 22.0% ] | 14675.8 [ 14.4% ] -4.5% | 10963.9 [ 18.5% ] -28.7% |
> | 256/256 15990.8 [ 22.0% ] | 12227.9 [ 10.3% ] -23.5% | 10469.9 [ 19.6% ] -34.5% |
> '--------------------------------------|----------------------------------------------------------------'
>
> One major thing that sticks out is that if we compare the stddev numbers
> to the +/- comparisons then it's pretty clear that the benchmarks are
> very noisy: in all but the last row stddev is actually higher than the
> measured effect.
>
> So what does 'stddev' mean here, exactly? The stddev of multiple runs,
> i.e. measured run-to-run variance? Or is it some internal metric of the
> benchmark?
>
The benchmark reports intermediate statistics every second;
the raw log looks like this:
[ 11s ] thds: 256 eps: 14346.72 lat (ms,95%): 44.17
[ 12s ] thds: 256 eps: 14328.45 lat (ms,95%): 44.17
[ 13s ] thds: 256 eps: 13773.06 lat (ms,95%): 43.39
[ 14s ] thds: 256 eps: 13752.31 lat (ms,95%): 43.39
[ 15s ] thds: 256 eps: 15362.79 lat (ms,95%): 43.39
[ 16s ] thds: 256 eps: 26580.65 lat (ms,95%): 35.59
[ 17s ] thds: 256 eps: 15011.78 lat (ms,95%): 36.89
[ 18s ] thds: 256 eps: 15025.78 lat (ms,95%): 39.65
[ 19s ] thds: 256 eps: 15350.87 lat (ms,95%): 39.65
[ 20s ] thds: 256 eps: 15491.70 lat (ms,95%): 36.89
I have a python script to parse eps (events per second) and lat (latency)
out and compute the average and stddev. (And I can draw a curve locally.)
It is indeed noisy when the task number is greater than the CPU number.
That is probably caused by frequent load balancing and context switches.
Do you have any suggestions? Or any other information I can provide?
Thanks,
-Aubrey
* Aubrey Li <[email protected]> wrote:
> On Sun, Apr 28, 2019 at 5:33 PM Ingo Molnar <[email protected]> wrote:
> > So because I'm a big fan of presenting data in a readable fashion, here
> > are your results, tabulated:
>
> I thought I tried my best to make it readable, but this one looks much better,
> thanks, ;-)
> >
> > #
> > # Sysbench throughput comparison of 3 different kernels at different
> > # load levels, higher numbers are better:
> > #
> >
> > .--------------------------------------|----------------------------------------------------------------.
> > | NA/AVX vanilla-SMT [stddev%] |coresched-SMT [stddev%] +/- | no-SMT [stddev%] +/- |
> > |--------------------------------------|----------------------------------------------------------------|
> > | 1/1 508.5 [ 0.2% ] | 504.7 [ 1.1% ] 0.8% | 509.0 [ 0.2% ] 0.1% |
> > | 2/2 1000.2 [ 1.4% ] | 1004.1 [ 1.6% ] 0.4% | 997.6 [ 1.2% ] 0.3% |
> > | 4/4 1912.1 [ 1.0% ] | 1904.2 [ 1.1% ] 0.4% | 1914.9 [ 1.3% ] 0.1% |
> > | 8/8 3753.5 [ 0.3% ] | 3748.2 [ 0.3% ] 0.1% | 3751.3 [ 0.4% ] 0.1% |
> > | 16/16 7139.3 [ 2.4% ] | 7137.9 [ 1.8% ] 0.0% | 7049.2 [ 2.4% ] 1.3% |
> > | 32/32 10899.0 [ 4.2% ] | 10780.3 [ 4.4% ] -1.1% | 10339.2 [ 9.6% ] -5.1% |
> > | 64/64 15086.1 [ 11.5% ] | 14262.0 [ 8.2% ] -5.5% | 11168.7 [ 22.2% ] -26.0% |
> > | 128/128 15371.9 [ 22.0% ] | 14675.8 [ 14.4% ] -4.5% | 10963.9 [ 18.5% ] -28.7% |
> > | 256/256 15990.8 [ 22.0% ] | 12227.9 [ 10.3% ] -23.5% | 10469.9 [ 19.6% ] -34.5% |
> > '--------------------------------------|----------------------------------------------------------------'
> >
> > One major thing that sticks out is that if we compare the stddev numbers
> > to the +/- comparisons then it's pretty clear that the benchmarks are
> > very noisy: in all but the last row stddev is actually higher than the
> > measured effect.
> >
> > So what does 'stddev' mean here, exactly? The stddev of multiple runs,
> > i.e. measured run-to-run variance? Or is it some internal metric of the
> > benchmark?
> >
>
> The benchmark reports intermediate statistics every second;
> the raw log looks like this:
> [ 11s ] thds: 256 eps: 14346.72 lat (ms,95%): 44.17
> [ 12s ] thds: 256 eps: 14328.45 lat (ms,95%): 44.17
> [ 13s ] thds: 256 eps: 13773.06 lat (ms,95%): 43.39
> [ 14s ] thds: 256 eps: 13752.31 lat (ms,95%): 43.39
> [ 15s ] thds: 256 eps: 15362.79 lat (ms,95%): 43.39
> [ 16s ] thds: 256 eps: 26580.65 lat (ms,95%): 35.59
> [ 17s ] thds: 256 eps: 15011.78 lat (ms,95%): 36.89
> [ 18s ] thds: 256 eps: 15025.78 lat (ms,95%): 39.65
> [ 19s ] thds: 256 eps: 15350.87 lat (ms,95%): 39.65
> [ 20s ] thds: 256 eps: 15491.70 lat (ms,95%): 36.89
>
> I have a python script to parse eps (events per second) and lat (latency)
> out and compute the average and stddev. (And I can draw a curve locally.)
>
> It is indeed noisy when the task number is greater than the CPU number.
> That is probably caused by frequent load balancing and context switches.
Ok, so it's basically an internal workload noise metric, it doesn't
represent the run-to-run noise.
So it's the real stddev of the workload - but we don't know whether the
measured performance figure is exactly in the middle of the runtime
probability distribution.
> Do you have any suggestions? Or any other information I can provide?
Yeah, so we don't just want to know the "standard deviation" of the
measured throughput values, but also the "standard error of the mean".
I suspect it's pretty low, below 1% for all rows?
Thanks,
Ingo
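For reference, a small C sketch of the statistics being asked about, using the ten per-second eps samples from the raw log quoted above: std% is the sample standard deviation relative to the mean, and sem% divides that by sqrt(n). Whether Aubrey's python script uses the n or n-1 form is not stated, so the n-1 choice below is an assumption.

/* build: cc stats.c -o stats -lm */
#include <stdio.h>
#include <math.h>

int main(void)
{
        /* per-second eps samples taken from the raw sysbench log above */
        double eps[] = { 14346.72, 14328.45, 13773.06, 13752.31, 15362.79,
                         26580.65, 15011.78, 15025.78, 15350.87, 15491.70 };
        int n = sizeof(eps) / sizeof(eps[0]);
        double sum = 0.0, var = 0.0;

        for (int i = 0; i < n; i++)
                sum += eps[i];
        double mean = sum / n;

        for (int i = 0; i < n; i++)
                var += (eps[i] - mean) * (eps[i] - mean);
        double std = sqrt(var / (n - 1));       /* sample standard deviation */
        double sem = std / sqrt(n);             /* standard error of the mean */

        printf("mean: %.1f  std%%: %.1f%%  sem%%: %.1f%%\n",
               mean, 100.0 * std / mean, 100.0 * sem / mean);
        return 0;
}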
On 2019/4/28 20:17, Ingo Molnar wrote:
>
> * Aubrey Li <[email protected]> wrote:
>
>> On Sun, Apr 28, 2019 at 5:33 PM Ingo Molnar <[email protected]> wrote:
>>> So because I'm a big fan of presenting data in a readable fashion, here
>>> are your results, tabulated:
>>
>> I thought I tried my best to make it readable, but this one looks much better,
>> thanks, ;-)
>>>
>>> #
>>> # Sysbench throughput comparison of 3 different kernels at different
>>> # load levels, higher numbers are better:
>>> #
>>>
>>> .--------------------------------------|----------------------------------------------------------------.
>>> | NA/AVX vanilla-SMT [stddev%] |coresched-SMT [stddev%] +/- | no-SMT [stddev%] +/- |
>>> |--------------------------------------|----------------------------------------------------------------|
>>> | 1/1 508.5 [ 0.2% ] | 504.7 [ 1.1% ] 0.8% | 509.0 [ 0.2% ] 0.1% |
>>> | 2/2 1000.2 [ 1.4% ] | 1004.1 [ 1.6% ] 0.4% | 997.6 [ 1.2% ] 0.3% |
>>> | 4/4 1912.1 [ 1.0% ] | 1904.2 [ 1.1% ] 0.4% | 1914.9 [ 1.3% ] 0.1% |
>>> | 8/8 3753.5 [ 0.3% ] | 3748.2 [ 0.3% ] 0.1% | 3751.3 [ 0.4% ] 0.1% |
>>> | 16/16 7139.3 [ 2.4% ] | 7137.9 [ 1.8% ] 0.0% | 7049.2 [ 2.4% ] 1.3% |
>>> | 32/32 10899.0 [ 4.2% ] | 10780.3 [ 4.4% ] -1.1% | 10339.2 [ 9.6% ] -5.1% |
>>> | 64/64 15086.1 [ 11.5% ] | 14262.0 [ 8.2% ] -5.5% | 11168.7 [ 22.2% ] -26.0% |
>>> | 128/128 15371.9 [ 22.0% ] | 14675.8 [ 14.4% ] -4.5% | 10963.9 [ 18.5% ] -28.7% |
>>> | 256/256 15990.8 [ 22.0% ] | 12227.9 [ 10.3% ] -23.5% | 10469.9 [ 19.6% ] -34.5% |
>>> '--------------------------------------|----------------------------------------------------------------'
>>>
>>> One major thing that sticks out is that if we compare the stddev numbers
>>> to the +/- comparisons then it's pretty clear that the benchmarks are
>>> very noisy: in all but the last row stddev is actually higher than the
>>> measured effect.
>>>
>>> So what does 'stddev' mean here, exactly? The stddev of multiple runs,
>>> i.e. measured run-to-run variance? Or is it some internal metric of the
>>> benchmark?
>>>
>>
>> The benchmark reports intermediate statistics every second;
>> the raw log looks like this:
>> [ 11s ] thds: 256 eps: 14346.72 lat (ms,95%): 44.17
>> [ 12s ] thds: 256 eps: 14328.45 lat (ms,95%): 44.17
>> [ 13s ] thds: 256 eps: 13773.06 lat (ms,95%): 43.39
>> [ 14s ] thds: 256 eps: 13752.31 lat (ms,95%): 43.39
>> [ 15s ] thds: 256 eps: 15362.79 lat (ms,95%): 43.39
>> [ 16s ] thds: 256 eps: 26580.65 lat (ms,95%): 35.59
>> [ 17s ] thds: 256 eps: 15011.78 lat (ms,95%): 36.89
>> [ 18s ] thds: 256 eps: 15025.78 lat (ms,95%): 39.65
>> [ 19s ] thds: 256 eps: 15350.87 lat (ms,95%): 39.65
>> [ 20s ] thds: 256 eps: 15491.70 lat (ms,95%): 36.89
>>
>> I have a python script to parse eps (events per second) and lat (latency)
>> out and compute the average and stddev. (And I can draw a curve locally.)
>>
>> It is indeed noisy when the task number is greater than the CPU number.
>> That is probably caused by frequent load balancing and context switches.
>
> Ok, so it's basically an internal workload noise metric, it doesn't
> represent the run-to-run noise.
>
> So it's the real stddev of the workload - but we don't know whether the
> measured performance figure is exactly in the middle of the runtime
> probability distribution.
>
>> Do you have any suggestions? Or any other information I can provide?
>
> Yeah, so we don't just want to know the "standard deviation" of the
> measured throughput values, but also the "standard error of the mean".
>
> I suspect it's pretty low, below 1% for all rows?
Hope this mail box works for this...
.-------------------------------------------------------------------------------------------------------------.
|NA/AVX vanilla-SMT [std% / sem%] | coresched-SMT [std% / sem%] +/- | no-SMT [std% / sem%] +/- |
|-------------------------------------------------------------------------------------------------------------|
| 1/1 508.5 [ 0.2%/ 0.0%] | 504.7 [ 1.1%/ 0.1%] -0.8%| 509.0 [ 0.2%/ 0.0%] 0.1% |
| 2/2 1000.2 [ 1.4%/ 0.1%] | 1004.1 [ 1.6%/ 0.2%] 0.4%| 997.6 [ 1.2%/ 0.1%] -0.3% |
| 4/4 1912.1 [ 1.0%/ 0.1%] | 1904.2 [ 1.1%/ 0.1%] -0.4%| 1914.9 [ 1.3%/ 0.1%] 0.1% |
| 8/8 3753.5 [ 0.3%/ 0.0%] | 3748.2 [ 0.3%/ 0.0%] -0.1%| 3751.3 [ 0.4%/ 0.0%] -0.1% |
| 16/16 7139.3 [ 2.4%/ 0.2%] | 7137.9 [ 1.8%/ 0.2%] -0.0%| 7049.2 [ 2.4%/ 0.2%] -1.3% |
| 32/32 10899.0 [ 4.2%/ 0.4%] | 10780.3 [ 4.4%/ 0.4%] -1.1%| 10339.2 [ 9.6%/ 0.9%] -5.1% |
| 64/64 15086.1 [11.5%/ 1.2%] | 14262.0 [ 8.2%/ 0.8%] -5.5%| 11168.7 [22.2%/ 1.7%] -26.0% |
|128/128 15371.9 [22.0%/ 2.2%] | 14675.8 [14.4%/ 1.4%] -4.5%| 10963.9 [18.5%/ 1.4%] -28.7% |
|256/256 15990.8 [22.0%/ 2.2%] | 12227.9 [10.3%/ 1.0%] -23.5%| 10469.9 [19.6%/ 1.7%] -34.5% |
'-------------------------------------------------------------------------------------------------------------'
Thanks,
-Aubrey
On Tue, Apr 23, 2019 at 04:18:16PM +0000, Vineeth Remanan Pillai wrote:
> +/*
> + * l(a,b)
> + * le(a,b) := !l(b,a)
> + * g(a,b) := l(b,a)
> + * ge(a,b) := !l(a,b)
> + */
> +
> +/* real prio, less is less */
> +static inline bool __prio_less(struct task_struct *a, struct task_struct *b, bool core_cmp)
> +{
> + u64 vruntime;
> +
> + int pa = __task_prio(a), pb = __task_prio(b);
> +
> + if (-pa < -pb)
> + return true;
> +
> + if (-pb < -pa)
> + return false;
> +
> + if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
> + return !dl_time_before(a->dl.deadline, b->dl.deadline);
> +
> + vruntime = b->se.vruntime;
> + if (core_cmp) {
> + vruntime -= task_cfs_rq(b)->min_vruntime;
> + vruntime += task_cfs_rq(a)->min_vruntime;
> + }
> + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
> + return !((s64)(a->se.vruntime - vruntime) <= 0);
> +
> + return false;
> +}
This unfortunately still doesn't work.
Consider the following task layout on two sibling CPUs (cpu0 and cpu1):
rq0.cfs_rq rq1.cfs_rq
| |
se_bash se_hog
se_hog is the sched_entity for a cpu intensive task and se_bash is the
sched_entity for bash.
There are two problems:
1 SCHED_DEBIT
when the user executes some command through bash, say ls, bash will fork.
The newly forked ls' vruntime is set in the future due to SCHED_DEBIT.
This makes 'ls' lose in __prio_less() when compared with hog, whose
vruntime is very likely the same as its cfs_rq's min_vruntime.
This is OK since we do not want a forked process to starve already running
ones. The problem is that, since hog keeps running, its vruntime will always
sync with its cfs_rq's min_vruntime. OTOH, 'ls' cannot run, so its
cfs_rq's min_vruntime doesn't advance, making 'ls' always lose to hog.
2 who schedules, who wins
so I disabled SCHED_DEBIT for testing purposes. When cpu0 schedules,
ls could win when both sched_entities' vruntime is the same as their
cfs_rqs' min_vruntime. So can hog: when cpu1 schedules, hog can preempt
ls in the same way. The end result is that the interactive task can lose to
the cpu intensive task and ls can feel "dead".
I haven't figured out a way to solve this yet. A core wide cfs_rq's
min_vruntime can probably solve this. Your suggestions are appreciated.
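A toy illustration of the suggestion in the last paragraph may make it concrete. The structs and the core_min_vruntime value below are simplified stand-ins, not the posted patches: the point is only that measuring both siblings' tasks against one shared, still-advancing baseline lets the starved task's lag become visible.

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

struct toy_cfs_rq {
        uint64_t min_vruntime;          /* per-runqueue baseline */
};

struct toy_task {
        const char *comm;
        uint64_t vruntime;
        struct toy_cfs_rq *cfs_rq;
};

/* true when a has the smaller effective vruntime, i.e. a would be picked */
static bool a_before_b_per_rq(struct toy_task *a, struct toy_task *b)
{
        uint64_t da = a->vruntime - a->cfs_rq->min_vruntime;
        uint64_t db = b->vruntime - b->cfs_rq->min_vruntime;

        return (int64_t)(da - db) < 0;
}

/* same comparison, but against one shared core-wide baseline */
static bool a_before_b_core_wide(struct toy_task *a, struct toy_task *b,
                                 uint64_t core_min_vruntime)
{
        uint64_t da = a->vruntime - core_min_vruntime;
        uint64_t db = b->vruntime - core_min_vruntime;

        return (int64_t)(da - db) < 0;
}

int main(void)
{
        struct toy_cfs_rq rq0 = { .min_vruntime = 1000 };  /* starved rq: stuck   */
        struct toy_cfs_rq rq1 = { .min_vruntime = 9000 };  /* hog keeps advancing */

        struct toy_task ls  = { "ls",  1500, &rq0 };
        struct toy_task hog = { "hog", 9000, &rq1 };

        /* per-rq view: ls looks 500 ahead of hog's 0 and keeps losing */
        printf("per-rq:    pick ls over hog? %d\n", a_before_b_per_rq(&ls, &hog));

        /* core-wide view: ls's real lag is visible, so it wins the pick */
        printf("core-wide: pick ls over hog? %d\n",
               a_before_b_core_wide(&ls, &hog, rq0.min_vruntime));
        return 0;
}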
On Tue, Apr 23, 2019 at 06:45:27PM +0000, Vineeth Remanan Pillai wrote:
> >> - Processes with different tags can still share the core
>
> > I may have missed something... Could you explain this statement?
>
> > This, to me, is the whole point of the patch series. If it's not
> > doing this then ... what?
>
> What I meant was, the patch needs some more work to be accurate.
> There are some race conditions where the core violation can still
> happen. In our testing, we saw around 1 to 5% of the time being
> shared with incompatible processes. One example of this happening
> is as follows(let cpu 0 and 1 be siblings):
> - cpu 0 selects a process with a cookie
> - cpu 1 selects a higher priority process without cookie
> - Selection process restarts for cpu 0 and it might select a
> process with cookie but with lesser priority.
> - Since it is lesser priority, the logic in pick_next_task
> doesn't compare again for the cookie(trusts pick_task) and
> proceeds.
>
> This is one of the scenarios that we saw from traces, but there
> might be other race conditions as well. The fix seems a little
> involved and we are working on that.
This is what I have used to make sure no two unmatched tasks are
scheduled on the same core (on top of v1; I think it's easier to just
show the diff instead of commenting on various places of the patches :-):
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cb24a0141e57..0cdb1c6a00a4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -186,6 +186,10 @@ struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
*/
match = idle_sched_class.pick_task(rq);
+ /* TODO: untagged tasks are not in the core tree */
+ if (!cookie)
+ goto out;
+
while (node) {
node_task = container_of(node, struct task_struct, core_node);
@@ -199,6 +203,7 @@ struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
}
}
+out:
return match;
}
@@ -3634,6 +3639,8 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
}
// XXX fairness/fwd progress conditions
+// when max is unset, return class_pick;
+// when max is set, return cookie_pick unless class_pick has higher priority.
static struct task_struct *
pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max)
{
@@ -3652,7 +3659,19 @@ pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *ma
}
class_pick = class->pick_task(rq);
- if (!cookie)
+ /*
+ * we can only return class_pick here when max is not set.
+ *
+ * when max is set and cookie is 0, we still have to check if
+ * class_pick's cookie matches with max, or we can end up picking
+ * an unmatched task, e.g. max is untagged and class_pick here
+ * is tagged.
+ */
+ if (!cookie && !max)
+ return class_pick;
+
+ /* in case class_pick matches with max, no need to check priority */
+ if (class_pick && cookie_match(class_pick, max))
return class_pick;
cookie_pick = sched_core_find(rq, cookie);
@@ -3663,8 +3682,11 @@ pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *ma
* If class > max && class > cookie, it is the highest priority task on
* the core (so far) and it must be selected, otherwise we must go with
* the cookie pick in order to satisfy the constraint.
+ *
+ * class_pick and cookie_pick are on the same cpu so use cpu_prio_less()
+ * max and class_pick are on different cpus so use core_prio_less()
*/
- if (cpu_prio_less(cookie_pick, class_pick) && cpu_prio_less(max, class_pick))
+ if (cpu_prio_less(cookie_pick, class_pick) && core_prio_less(max, class_pick))
return class_pick;
return cookie_pick;
@@ -3731,8 +3753,17 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
rq_i->core_pick = NULL;
- if (i != cpu)
+ if (i != cpu) {
update_rq_clock(rq_i);
+ /*
+ * we are going to pick tasks for both cpus, if our
+ * sibling is idle and we have core_cookie set, now
+ * is the time to clear/reset it so that we can do
+ * an unconstrained pick.
+ */
+ if (is_idle_task(rq_i->curr) && rq_i->core->core_cookie)
+ rq_i->core->core_cookie = 0;
+ }
}
/*
@@ -3794,20 +3825,42 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
*
* NOTE: this is a linear max-filter and is thus bounded
* in execution time.
+ *
+ * The fact that pick_task() returns p with a different
+ * cookie means p has higher priority and we need to
+ * replace max with p.
*/
- if (!max || core_prio_less(max, p)) {
+ if (!max || !cookie_match(max, p)) {
struct task_struct *old_max = max;
rq->core->core_cookie = p->core_cookie;
max = p;
trace_printk("max: %s/%d %lx\n", max->comm, max->pid, max->core_cookie);
- if (old_max && !cookie_match(old_max, p)) {
+ if (old_max) {
for_each_cpu(j, smt_mask) {
if (j == i)
continue;
cpu_rq(j)->core_pick = NULL;
+
+ /*
+ * if max is untagged, then core_cookie
+ * is zero and the sibling can do a wrongly
+ * unconstrained pick. avoid that by doing
+ * the pick directly here. since there are no
+ * untagged tasks in the core tree, just
+ * use idle for our sibling.
+ * TODO: sibling may pick an untagged task.
+ */
+ if (max->core_cookie)
+ cpu_rq(j)->core_pick = NULL;
+ else {
+ cpu_rq(j)->core_pick = idle_sched_class.pick_task(cpu_rq(j));
+ occ = 1;
+ goto out;
+ }
+
}
occ = 1;
goto again;
@@ -3817,6 +3870,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
next_class:;
}
+out:
rq->core->core_pick_seq = rq->core->core_task_seq;
/*
@@ -3834,6 +3888,17 @@ next_class:;
rq_i->core_pick->core_occupation = occ;
+ /* make sure we didn't break L1TF */
+ if (!is_idle_task(rq_i->core_pick) &&
+ rq_i->core_pick->core_cookie != rq_i->core->core_cookie) {
+ trace_printk("cpu%d: cookie mismatch. %s/%d/0x%lx/0x%lx\n",
+ rq_i->cpu, rq_i->core_pick->comm,
+ rq_i->core_pick->pid,
+ rq_i->core_pick->core_cookie,
+ rq_i->core->core_cookie);
+ WARN_ON_ONCE(1);
+ }
+
if (i == cpu)
continue;
On Tue, Apr 23, 2019 at 04:18:14PM +0000, Vineeth Remanan Pillai wrote:
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index c055bad249a9..45d86b862750 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4132,7 +4132,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
> * Avoid running the skip buddy, if running something else can
> * be done without getting too unfair.
> */
> - if (cfs_rq->skip == se) {
> + if (cfs_rq->skip && cfs_rq->skip == se) {
> struct sched_entity *second;
>
> if (se == curr) {
> @@ -4150,13 +4150,13 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
> /*
> * Prefer last buddy, try to return the CPU to a preempted task.
> */
> - if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
> + if (left && cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
> se = cfs_rq->last;
>
> /*
> * Someone really wants this to run. If it's not unfair, run it.
> */
> - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
> + if (left && cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
> se = cfs_rq->next;
>
> clear_buddies(cfs_rq, se);
> @@ -6937,6 +6937,37 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
> set_last_buddy(se);
> }
>
> +static struct task_struct *
> +pick_task_fair(struct rq *rq)
> +{
> + struct cfs_rq *cfs_rq = &rq->cfs;
> + struct sched_entity *se;
> +
> + if (!cfs_rq->nr_running)
> + return NULL;
> +
> + do {
> + struct sched_entity *curr = cfs_rq->curr;
> +
> + se = pick_next_entity(cfs_rq, NULL);
> +
> + if (!(se || curr))
> + return NULL;
I think you have already avoided the null pointer access bug in
the above pick_next_entity() by doing multiple checks for null pointers:
cfs_rq->skip and left.
An alternative way to fix the null pointer access bug: if curr is the
only runnable entity in this cfs_rq, there is no need to call
pick_next_entity(cfs_rq, NULL) since the rbtree is empty. This way
pick_next_entity() doesn't need to change. Something like:
do {
struct sched_entity *curr = cfs_rq->curr;
if (curr && curr->on_rq && cfs_rq->nr_running == 1)
se = NULL;
else
se = pick_next_entity(cfs_rq, NULL);
/* the following code doesn't change */
> +
> + if (curr) {
> + if (se && curr->on_rq)
> + update_curr(cfs_rq);
> +
> + if (!se || entity_before(curr, se))
> + se = curr;
> + }
> +
> + cfs_rq = group_cfs_rq(se);
> + } while (cfs_rq);
> +
> + return task_of(se);
> +}
There is another problem I'm thinking about: suppose cpu0 and cpu1 are
siblings, tasks A and B are runnable on cpu0, and curr is A. When cpu1
schedules, pick_task_fair() will also be called for cpu0 to decide
which CPU's task preempts the other.
When pick_task_fair() is called for cpu0 because cpu1 schedules:
curr (i.e. A) may have run for only a few nanoseconds, and thus can have a
higher vruntime than B. So we choose B to compete with the task chosen from
cpu1. If B wins, we will schedule B on cpu0. If B loses, we will probably
schedule idle on cpu0 (if the cookies don't match). Either way, A didn't get
its share. We probably want to make sure a task has run for at least some
time before it is considered for preemption.
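A sketch of the guard hinted at in the last sentence, assuming a simplified sched_entity and a made-up 4ms threshold (neither is from the posted patches): before a sibling-driven pick is allowed to consider preempting curr, require that curr has accumulated at least a minimum slice since it was last picked.

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define MIN_RUN_NS (4ULL * 1000 * 1000)         /* 4ms, illustrative only */

struct toy_se {
        uint64_t sum_exec_runtime;              /* total ns executed          */
        uint64_t prev_sum_exec_runtime;         /* snapshot taken when picked */
};

/* true if curr has run long enough to be fair game for preemption */
static bool ran_minimum_slice(const struct toy_se *curr)
{
        return curr->sum_exec_runtime - curr->prev_sum_exec_runtime >= MIN_RUN_NS;
}

int main(void)
{
        struct toy_se just_picked = { .sum_exec_runtime = 1000500,
                                      .prev_sum_exec_runtime = 1000000 };
        struct toy_se long_runner = { .sum_exec_runtime = 9000000,
                                      .prev_sum_exec_runtime = 1000000 };

        printf("preempt just-picked task? %d\n", ran_minimum_slice(&just_picked)); /* 0 */
        printf("preempt long-running task? %d\n", ran_minimum_slice(&long_runner)); /* 1 */
        return 0;
}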
* Li, Aubrey <[email protected]> wrote:
> > I suspect it's pretty low, below 1% for all rows?
>
> Hope this mail box works for this...
>
> .-------------------------------------------------------------------------------------------------------------.
> |NA/AVX vanilla-SMT [std% / sem%] | coresched-SMT [std% / sem%] +/- | no-SMT [std% / sem%] +/- |
> |-------------------------------------------------------------------------------------------------------------|
> | 1/1 508.5 [ 0.2%/ 0.0%] | 504.7 [ 1.1%/ 0.1%] -0.8%| 509.0 [ 0.2%/ 0.0%] 0.1% |
> | 2/2 1000.2 [ 1.4%/ 0.1%] | 1004.1 [ 1.6%/ 0.2%] 0.4%| 997.6 [ 1.2%/ 0.1%] -0.3% |
> | 4/4 1912.1 [ 1.0%/ 0.1%] | 1904.2 [ 1.1%/ 0.1%] -0.4%| 1914.9 [ 1.3%/ 0.1%] 0.1% |
> | 8/8 3753.5 [ 0.3%/ 0.0%] | 3748.2 [ 0.3%/ 0.0%] -0.1%| 3751.3 [ 0.4%/ 0.0%] -0.1% |
> | 16/16 7139.3 [ 2.4%/ 0.2%] | 7137.9 [ 1.8%/ 0.2%] -0.0%| 7049.2 [ 2.4%/ 0.2%] -1.3% |
> | 32/32 10899.0 [ 4.2%/ 0.4%] | 10780.3 [ 4.4%/ 0.4%] -1.1%| 10339.2 [ 9.6%/ 0.9%] -5.1% |
> | 64/64 15086.1 [11.5%/ 1.2%] | 14262.0 [ 8.2%/ 0.8%] -5.5%| 11168.7 [22.2%/ 1.7%] -26.0% |
> |128/128 15371.9 [22.0%/ 2.2%] | 14675.8 [14.4%/ 1.4%] -4.5%| 10963.9 [18.5%/ 1.4%] -28.7% |
> |256/256 15990.8 [22.0%/ 2.2%] | 12227.9 [10.3%/ 1.0%] -23.5%| 10469.9 [19.6%/ 1.7%] -34.5% |
> '-------------------------------------------------------------------------------------------------------------'
Perfectly presented, thank you very much!
My final questin would be about the environment:
> Skylake server, 2 numa nodes, 104 CPUs (HT on)
Is the typical nr_running value the sum of 'NA+AVX', i.e. is it ~256
threads for the 128/128 row for example - or is it 128 parallel tasks?
I.e. showing the approximate CPU thread-load figure column would be very
useful too, where '50%' shows half-loaded, '100%' fully-loaded, '200%'
over-saturated, etc. - for each row?
Thanks,
Ingo
On Tue, Apr 23, 2019 at 04:18:16PM +0000, Vineeth Remanan Pillai wrote:
> +/*
> + * Find left-most (aka, highest priority) task matching @cookie.
> + */
> +struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
> +{
> + struct rb_node *node = rq->core_tree.rb_node;
> + struct task_struct *node_task, *match;
> +
> + /*
> + * The idle task always matches any cookie!
> + */
> + match = idle_sched_class.pick_task(rq);
> +
> + while (node) {
> + node_task = container_of(node, struct task_struct, core_node);
> +
> + if (node_task->core_cookie < cookie) {
> + node = node->rb_left;
Should go right here?
> + } else if (node_task->core_cookie > cookie) {
> + node = node->rb_right;
And left here?
> + } else {
> + match = node_task;
> + node = node->rb_left;
> + }
> + }
> +
> + return match;
> +}
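To make the questions above concrete, here is a self-contained sketch of the intended search, assuming the tree is ordered with smaller cookies to the left (the conventional ordering, and the one the two questions imply). With that ordering the walk has to go right when the node's cookie is smaller than the target and left when it is larger, while still drifting left on matches to return the leftmost one. The plain struct node below is only an illustration, not the kernel's rbtree code.

#include <stdio.h>

struct node {
        unsigned long cookie;
        int pid;                        /* stand-in for the task */
        struct node *left, *right;
};

/* leftmost (highest priority position) node whose cookie matches */
static struct node *cookie_find(struct node *root, unsigned long cookie)
{
        struct node *match = NULL;

        while (root) {
                if (root->cookie < cookie) {
                        root = root->right;     /* everything here is too small */
                } else if (root->cookie > cookie) {
                        root = root->left;      /* everything here is too big */
                } else {
                        match = root;           /* remember it, keep going left */
                        root = root->left;
                }
        }
        return match;
}

int main(void)
{
        struct node t1   = { 1, 101, NULL, NULL };
        struct node t2a  = { 2, 202, &t1, NULL };
        struct node t3   = { 3, 303, NULL, NULL };
        struct node root = { 2, 204, &t2a, &t3 };

        struct node *m = cookie_find(&root, 2);
        printf("cookie 2 -> pid %d\n", m ? m->pid : -1);   /* leftmost match: 202 */
        printf("cookie 7 -> %s\n", cookie_find(&root, 7) ? "found" : "no match");
        return 0;
}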
On Tue, Apr 23, 2019 at 04:18:18PM +0000, Vineeth Remanan Pillai wrote:
> +// XXX fairness/fwd progress conditions
> +static struct task_struct *
> +pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max)
> +{
> + struct task_struct *class_pick, *cookie_pick;
> + unsigned long cookie = 0UL;
> +
> + /*
> + * We must not rely on rq->core->core_cookie here, because we fail to reset
> + * rq->core->core_cookie on new picks, such that we can detect if we need
> + * to do single vs multi rq task selection.
> + */
> +
> + if (max && max->core_cookie) {
> + WARN_ON_ONCE(rq->core->core_cookie != max->core_cookie);
> + cookie = max->core_cookie;
> + }
> +
> + class_pick = class->pick_task(rq);
> + if (!cookie)
> + return class_pick;
> +
> + cookie_pick = sched_core_find(rq, cookie);
> + if (!class_pick)
> + return cookie_pick;
> +
> + /*
> + * If class > max && class > cookie, it is the highest priority task on
> + * the core (so far) and it must be selected, otherwise we must go with
> + * the cookie pick in order to satisfy the constraint.
> + */
> + if (cpu_prio_less(cookie_pick, class_pick) && core_prio_less(max, class_pick))
It appears to me that the cpu_prio_less(cookie_pick, class_pick) check isn't
needed.
If cookie_pick is the idle task, then cpu_prio_less(cookie_pick, class_pick)
is always true;
If cookie_pick is not the idle task and has the same sched class as
class_pick, then class_pick is the best candidate to run according to
their sched class. In this case, cpu_prio_less(cookie_pick, class_pick)
shouldn't return false, or it feels like a bug;
If cookie_pick is not the idle task and has a different sched class from
class_pick:
- if cookie_pick's sched class has higher priority than class_pick's
sched class, then cookie_pick should have been selected in previous
sched class iteration; and since its cookie matches with max,
everything should have been finished already;
- if cookie_pick's sched class has lower priority than class_pick's
sched class, then cpu_prio_less(cookie_pick, class_pick) will still
return true.
So looks like cpu_prio_less(cookie_pick, class_pick) should always
return true and thus not needed.
> + return class_pick;
> +
> + return cookie_pick;
> +}
On 26-Apr-2019 05:03:37 PM, Peter Zijlstra wrote:
> On Tue, Apr 23, 2019 at 04:18:21PM +0000, Vineeth Remanan Pillai wrote:
>
> (you lost From: Julien)
>
> > During core scheduling, it can happen that the current rq selects a
> > non-tagged process while the sibling might be idling even though it
> > had something to run (because the sibling selected idle to match the
> > tagged process in previous tag matching iteration). We need to wake up
> > the sibling if such a situation arise.
> >
> > Signed-off-by: Vineeth Remanan Pillai <[email protected]>
> > Signed-off-by: Julien Desfossez <[email protected]>
> > ---
> > kernel/sched/core.c | 15 +++++++++++++++
> > 1 file changed, 15 insertions(+)
> >
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index e8f5ec641d0a..0e3c51a1b54a 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -3775,6 +3775,21 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > */
> > if (i == cpu && !rq->core->core_cookie && !p->core_cookie) {
> > next = p;
> > + rq->core_pick = NULL;
> > +
> > + /*
> > + * If the sibling is idling, we might want to wake it
> > + * so that it can check for any runnable tasks that did
> > + * not get a chance to run due to previous task matching.
> > + */
> > + for_each_cpu(j, smt_mask) {
> > + struct rq *rq_j = cpu_rq(j);
> > + rq_j->core_pick = NULL;
> > + if (j != cpu &&
> > + is_idle_task(rq_j->curr) && rq_j->nr_running) {
> > + resched_curr(rq_j);
> > + }
> > + }
> > goto done;
> > }
>
> Anyway, as written here:
>
> https://lkml.kernel.org/r/[email protected]
>
> I think this isn't quite right. Does the below patch (which actually
> removes lines) also work?
>
> As written before; the intent was to not allow that optimization if the
> last pick had a cookie; thereby doing a (last) core wide selection when
> we go to a 0-cookie, and this then includes kicking forced-idle cores.
It works and the performance is similar to our previous solution :-)
Thanks,
Julien
On 2019/4/29 14:14, Ingo Molnar wrote:
>
> * Li, Aubrey <[email protected]> wrote:
>
>>> I suspect it's pretty low, below 1% for all rows?
>>
>> Hope this mail box works for this...
>>
>> .-------------------------------------------------------------------------------------------------------------.
>> |NA/AVX vanilla-SMT [std% / sem%] | coresched-SMT [std% / sem%] +/- | no-SMT [std% / sem%] +/- |
>> |-------------------------------------------------------------------------------------------------------------|
>> | 1/1 508.5 [ 0.2%/ 0.0%] | 504.7 [ 1.1%/ 0.1%] -0.8%| 509.0 [ 0.2%/ 0.0%] 0.1% |
>> | 2/2 1000.2 [ 1.4%/ 0.1%] | 1004.1 [ 1.6%/ 0.2%] 0.4%| 997.6 [ 1.2%/ 0.1%] -0.3% |
>> | 4/4 1912.1 [ 1.0%/ 0.1%] | 1904.2 [ 1.1%/ 0.1%] -0.4%| 1914.9 [ 1.3%/ 0.1%] 0.1% |
>> | 8/8 3753.5 [ 0.3%/ 0.0%] | 3748.2 [ 0.3%/ 0.0%] -0.1%| 3751.3 [ 0.4%/ 0.0%] -0.1% |
>> | 16/16 7139.3 [ 2.4%/ 0.2%] | 7137.9 [ 1.8%/ 0.2%] -0.0%| 7049.2 [ 2.4%/ 0.2%] -1.3% |
>> | 32/32 10899.0 [ 4.2%/ 0.4%] | 10780.3 [ 4.4%/ 0.4%] -1.1%| 10339.2 [ 9.6%/ 0.9%] -5.1% |
>> | 64/64 15086.1 [11.5%/ 1.2%] | 14262.0 [ 8.2%/ 0.8%] -5.5%| 11168.7 [22.2%/ 1.7%] -26.0% |
>> |128/128 15371.9 [22.0%/ 2.2%] | 14675.8 [14.4%/ 1.4%] -4.5%| 10963.9 [18.5%/ 1.4%] -28.7% |
>> |256/256 15990.8 [22.0%/ 2.2%] | 12227.9 [10.3%/ 1.0%] -23.5%| 10469.9 [19.6%/ 1.7%] -34.5% |
>> '-------------------------------------------------------------------------------------------------------------'
>
> Perfectly presented, thank you very much!
My pleasure! ;-)
>
> My final questin would be about the environment:
>
>> Skylake server, 2 numa nodes, 104 CPUs (HT on)
>
> Is the typical nr_running value the sum of 'NA+AVX', i.e. is it ~256
> threads for the 128/128 row for example - or is it 128 parallel tasks?
That means 128 sysbench threads and 128 gemmbench tasks, so 256 threads in sum.
>
> I.e. showing the approximate CPU thread-load figure column would be very
> useful too, where '50%' shows half-loaded, '100%' fully-loaded, '200%'
> over-saturated, etc. - for each row?
See below, hope this helps.
.--------------------------------------------------------------------------------------------------------------------------------------.
|NA/AVX vanilla-SMT [std% / sem%] cpu% |coresched-SMT [std% / sem%] +/- cpu% | no-SMT [std% / sem%] +/- cpu% |
|--------------------------------------------------------------------------------------------------------------------------------------|
| 1/1 508.5 [ 0.2%/ 0.0%] 2.1% | 504.7 [ 1.1%/ 0.1%] -0.8% 2.1% | 509.0 [ 0.2%/ 0.0%] 0.1% 4.3% |
| 2/2 1000.2 [ 1.4%/ 0.1%] 4.1% | 1004.1 [ 1.6%/ 0.2%] 0.4% 4.1% | 997.6 [ 1.2%/ 0.1%] -0.3% 8.1% |
| 4/4 1912.1 [ 1.0%/ 0.1%] 7.9% | 1904.2 [ 1.1%/ 0.1%] -0.4% 7.9% | 1914.9 [ 1.3%/ 0.1%] 0.1% 15.1% |
| 8/8 3753.5 [ 0.3%/ 0.0%] 14.9% | 3748.2 [ 0.3%/ 0.0%] -0.1% 14.9% | 3751.3 [ 0.4%/ 0.0%] -0.1% 30.5% |
| 16/16 7139.3 [ 2.4%/ 0.2%] 30.3% | 7137.9 [ 1.8%/ 0.2%] -0.0% 30.3% | 7049.2 [ 2.4%/ 0.2%] -1.3% 60.4% |
| 32/32 10899.0 [ 4.2%/ 0.4%] 60.3% | 10780.3 [ 4.4%/ 0.4%] -1.1% 55.9% | 10339.2 [ 9.6%/ 0.9%] -5.1% 97.7% |
| 64/64 15086.1 [11.5%/ 1.2%] 97.7% | 14262.0 [ 8.2%/ 0.8%] -5.5% 82.0% | 11168.7 [22.2%/ 1.7%] -26.0% 100.0% |
|128/128 15371.9 [22.0%/ 2.2%] 100.0% | 14675.8 [14.4%/ 1.4%] -4.5% 82.8% | 10963.9 [18.5%/ 1.4%] -28.7% 100.0% |
|256/256 15990.8 [22.0%/ 2.2%] 100.0% | 12227.9 [10.3%/ 1.0%] -23.5% 73.2% | 10469.9 [19.6%/ 1.7%] -34.5% 100.0% |
'--------------------------------------------------------------------------------------------------------------------------------------'
Thanks,
-Aubrey
On Mon, Apr 29, 2019 at 09:25:35PM +0800 Li, Aubrey wrote:
> On 2019/4/29 14:14, Ingo Molnar wrote:
> >
> > * Li, Aubrey <[email protected]> wrote:
> >
> >>> I suspect it's pretty low, below 1% for all rows?
> >>
> >> Hope this mail box works for this...
> >>
> >> .-------------------------------------------------------------------------------------------------------------.
> >> |NA/AVX vanilla-SMT [std% / sem%] | coresched-SMT [std% / sem%] +/- | no-SMT [std% / sem%] +/- |
> >> |-------------------------------------------------------------------------------------------------------------|
> >> | 1/1 508.5 [ 0.2%/ 0.0%] | 504.7 [ 1.1%/ 0.1%] -0.8%| 509.0 [ 0.2%/ 0.0%] 0.1% |
> >> | 2/2 1000.2 [ 1.4%/ 0.1%] | 1004.1 [ 1.6%/ 0.2%] 0.4%| 997.6 [ 1.2%/ 0.1%] -0.3% |
> >> | 4/4 1912.1 [ 1.0%/ 0.1%] | 1904.2 [ 1.1%/ 0.1%] -0.4%| 1914.9 [ 1.3%/ 0.1%] 0.1% |
> >> | 8/8 3753.5 [ 0.3%/ 0.0%] | 3748.2 [ 0.3%/ 0.0%] -0.1%| 3751.3 [ 0.4%/ 0.0%] -0.1% |
> >> | 16/16 7139.3 [ 2.4%/ 0.2%] | 7137.9 [ 1.8%/ 0.2%] -0.0%| 7049.2 [ 2.4%/ 0.2%] -1.3% |
> >> | 32/32 10899.0 [ 4.2%/ 0.4%] | 10780.3 [ 4.4%/ 0.4%] -1.1%| 10339.2 [ 9.6%/ 0.9%] -5.1% |
> >> | 64/64 15086.1 [11.5%/ 1.2%] | 14262.0 [ 8.2%/ 0.8%] -5.5%| 11168.7 [22.2%/ 1.7%] -26.0% |
> >> |128/128 15371.9 [22.0%/ 2.2%] | 14675.8 [14.4%/ 1.4%] -4.5%| 10963.9 [18.5%/ 1.4%] -28.7% |
> >> |256/256 15990.8 [22.0%/ 2.2%] | 12227.9 [10.3%/ 1.0%] -23.5%| 10469.9 [19.6%/ 1.7%] -34.5% |
> >> '-------------------------------------------------------------------------------------------------------------'
> >
> > Perfectly presented, thank you very much!
>
> My pleasure! ;-)
>
> >
> > My final questin would be about the environment:
> >
> >> Skylake server, 2 numa nodes, 104 CPUs (HT on)
> >
> > Is the typical nr_running value the sum of 'NA+AVX', i.e. is it ~256
> > threads for the 128/128 row for example - or is it 128 parallel tasks?
>
> That means 128 sysbench threads and 128 gemmbench tasks, so 256 threads in sum.
> >
> > I.e. showing the approximate CPU thread-load figure column would be very
> > useful too, where '50%' shows half-loaded, '100%' fully-loaded, '200%'
> > over-saturated, etc. - for each row?
>
> See below, hope this helps.
> .--------------------------------------------------------------------------------------------------------------------------------------.
> |NA/AVX vanilla-SMT [std% / sem%] cpu% |coresched-SMT [std% / sem%] +/- cpu% | no-SMT [std% / sem%] +/- cpu% |
> |--------------------------------------------------------------------------------------------------------------------------------------|
> | 1/1 508.5 [ 0.2%/ 0.0%] 2.1% | 504.7 [ 1.1%/ 0.1%] -0.8% 2.1% | 509.0 [ 0.2%/ 0.0%] 0.1% 4.3% |
> | 2/2 1000.2 [ 1.4%/ 0.1%] 4.1% | 1004.1 [ 1.6%/ 0.2%] 0.4% 4.1% | 997.6 [ 1.2%/ 0.1%] -0.3% 8.1% |
> | 4/4 1912.1 [ 1.0%/ 0.1%] 7.9% | 1904.2 [ 1.1%/ 0.1%] -0.4% 7.9% | 1914.9 [ 1.3%/ 0.1%] 0.1% 15.1% |
> | 8/8 3753.5 [ 0.3%/ 0.0%] 14.9% | 3748.2 [ 0.3%/ 0.0%] -0.1% 14.9% | 3751.3 [ 0.4%/ 0.0%] -0.1% 30.5% |
> | 16/16 7139.3 [ 2.4%/ 0.2%] 30.3% | 7137.9 [ 1.8%/ 0.2%] -0.0% 30.3% | 7049.2 [ 2.4%/ 0.2%] -1.3% 60.4% |
> | 32/32 10899.0 [ 4.2%/ 0.4%] 60.3% | 10780.3 [ 4.4%/ 0.4%] -1.1% 55.9% | 10339.2 [ 9.6%/ 0.9%] -5.1% 97.7% |
> | 64/64 15086.1 [11.5%/ 1.2%] 97.7% | 14262.0 [ 8.2%/ 0.8%] -5.5% 82.0% | 11168.7 [22.2%/ 1.7%] -26.0% 100.0% |
> |128/128 15371.9 [22.0%/ 2.2%] 100.0% | 14675.8 [14.4%/ 1.4%] -4.5% 82.8% | 10963.9 [18.5%/ 1.4%] -28.7% 100.0% |
> |256/256 15990.8 [22.0%/ 2.2%] 100.0% | 12227.9 [10.3%/ 1.0%] -23.5% 73.2% | 10469.9 [19.6%/ 1.7%] -34.5% 100.0% |
> '--------------------------------------------------------------------------------------------------------------------------------------'
>
That's really nice and clear.
We start to see the penalty for the coresched at 32/32, leaving some cpus more idle than otherwise.
But it's pretty good overall, for this benchmark at least.
Is this with stock v2 or with any of the fixes posted after? I wonder how much the fixes for
the race that violates the rule affect this, for example.
Cheers,
Phil
> Thanks,
> -Aubrey
--
* Li, Aubrey <[email protected]> wrote:
> > I.e. showing the approximate CPU thread-load figure column would be
> > very useful too, where '50%' shows half-loaded, '100%' fully-loaded,
> > '200%' over-saturated, etc. - for each row?
>
> See below, hope this helps.
> .--------------------------------------------------------------------------------------------------------------------------------------.
> |NA/AVX vanilla-SMT [std% / sem%] cpu% |coresched-SMT [std% / sem%] +/- cpu% | no-SMT [std% / sem%] +/- cpu% |
> |--------------------------------------------------------------------------------------------------------------------------------------|
> | 1/1 508.5 [ 0.2%/ 0.0%] 2.1% | 504.7 [ 1.1%/ 0.1%] -0.8% 2.1% | 509.0 [ 0.2%/ 0.0%] 0.1% 4.3% |
> | 2/2 1000.2 [ 1.4%/ 0.1%] 4.1% | 1004.1 [ 1.6%/ 0.2%] 0.4% 4.1% | 997.6 [ 1.2%/ 0.1%] -0.3% 8.1% |
> | 4/4 1912.1 [ 1.0%/ 0.1%] 7.9% | 1904.2 [ 1.1%/ 0.1%] -0.4% 7.9% | 1914.9 [ 1.3%/ 0.1%] 0.1% 15.1% |
> | 8/8 3753.5 [ 0.3%/ 0.0%] 14.9% | 3748.2 [ 0.3%/ 0.0%] -0.1% 14.9% | 3751.3 [ 0.4%/ 0.0%] -0.1% 30.5% |
> | 16/16 7139.3 [ 2.4%/ 0.2%] 30.3% | 7137.9 [ 1.8%/ 0.2%] -0.0% 30.3% | 7049.2 [ 2.4%/ 0.2%] -1.3% 60.4% |
> | 32/32 10899.0 [ 4.2%/ 0.4%] 60.3% | 10780.3 [ 4.4%/ 0.4%] -1.1% 55.9% | 10339.2 [ 9.6%/ 0.9%] -5.1% 97.7% |
> | 64/64 15086.1 [11.5%/ 1.2%] 97.7% | 14262.0 [ 8.2%/ 0.8%] -5.5% 82.0% | 11168.7 [22.2%/ 1.7%] -26.0% 100.0% |
> |128/128 15371.9 [22.0%/ 2.2%] 100.0% | 14675.8 [14.4%/ 1.4%] -4.5% 82.8% | 10963.9 [18.5%/ 1.4%] -28.7% 100.0% |
> |256/256 15990.8 [22.0%/ 2.2%] 100.0% | 12227.9 [10.3%/ 1.0%] -23.5% 73.2% | 10469.9 [19.6%/ 1.7%] -34.5% 100.0% |
> '--------------------------------------------------------------------------------------------------------------------------------------'
Very nice, thank you!
What's interesting is how in the over-saturated case (the last three
rows: 128, 256 and 512 total threads) coresched-SMT leaves 20-30% CPU
performance on the floor according to the load figures.
Is this true idle time (which shows up as 'id' during 'top'), or some
load average artifact?
Ingo
On Mon, Apr 29, 2019 at 11:39 PM Phil Auld <[email protected]> wrote:
>
> On Mon, Apr 29, 2019 at 09:25:35PM +0800 Li, Aubrey wrote:
> > .--------------------------------------------------------------------------------------------------------------------------------------.
> > |NA/AVX vanilla-SMT [std% / sem%] cpu% |coresched-SMT [std% / sem%] +/- cpu% | no-SMT [std% / sem%] +/- cpu% |
> > |--------------------------------------------------------------------------------------------------------------------------------------|
> > | 1/1 508.5 [ 0.2%/ 0.0%] 2.1% | 504.7 [ 1.1%/ 0.1%] -0.8% 2.1% | 509.0 [ 0.2%/ 0.0%] 0.1% 4.3% |
> > | 2/2 1000.2 [ 1.4%/ 0.1%] 4.1% | 1004.1 [ 1.6%/ 0.2%] 0.4% 4.1% | 997.6 [ 1.2%/ 0.1%] -0.3% 8.1% |
> > | 4/4 1912.1 [ 1.0%/ 0.1%] 7.9% | 1904.2 [ 1.1%/ 0.1%] -0.4% 7.9% | 1914.9 [ 1.3%/ 0.1%] 0.1% 15.1% |
> > | 8/8 3753.5 [ 0.3%/ 0.0%] 14.9% | 3748.2 [ 0.3%/ 0.0%] -0.1% 14.9% | 3751.3 [ 0.4%/ 0.0%] -0.1% 30.5% |
> > | 16/16 7139.3 [ 2.4%/ 0.2%] 30.3% | 7137.9 [ 1.8%/ 0.2%] -0.0% 30.3% | 7049.2 [ 2.4%/ 0.2%] -1.3% 60.4% |
> > | 32/32 10899.0 [ 4.2%/ 0.4%] 60.3% | 10780.3 [ 4.4%/ 0.4%] -1.1% 55.9% | 10339.2 [ 9.6%/ 0.9%] -5.1% 97.7% |
> > | 64/64 15086.1 [11.5%/ 1.2%] 97.7% | 14262.0 [ 8.2%/ 0.8%] -5.5% 82.0% | 11168.7 [22.2%/ 1.7%] -26.0% 100.0% |
> > |128/128 15371.9 [22.0%/ 2.2%] 100.0% | 14675.8 [14.4%/ 1.4%] -4.5% 82.8% | 10963.9 [18.5%/ 1.4%] -28.7% 100.0% |
> > |256/256 15990.8 [22.0%/ 2.2%] 100.0% | 12227.9 [10.3%/ 1.0%] -23.5% 73.2% | 10469.9 [19.6%/ 1.7%] -34.5% 100.0% |
> > '--------------------------------------------------------------------------------------------------------------------------------------'
> >
>
> That's really nice and clear.
>
> We start to see the penalty for the coresched at 32/32, leaving some cpus more idle than otherwise.
> But it's pretty good overall, for this benchmark at least.
>
> Is this with stock v2 or with any of the fixes posted after? I wonder how much the fixes for
> the race that violates the rule affects this, for example.
>
Yeah, this data is based on v2 without any of the fixes posted after.
I also tried some fixes with potential performance impact, but no luck so far.
Please let me know if there is anything I missed.
Thanks,
-Aubrey
On Tue, Apr 30, 2019 at 12:01 AM Ingo Molnar <[email protected]> wrote:
> * Li, Aubrey <[email protected]> wrote:
>
> > > I.e. showing the approximate CPU thread-load figure column would be
> > > very useful too, where '50%' shows half-loaded, '100%' fully-loaded,
> > > '200%' over-saturated, etc. - for each row?
> >
> > See below, hope this helps.
> > .--------------------------------------------------------------------------------------------------------------------------------------.
> > |NA/AVX vanilla-SMT [std% / sem%] cpu% |coresched-SMT [std% / sem%] +/- cpu% | no-SMT [std% / sem%] +/- cpu% |
> > |--------------------------------------------------------------------------------------------------------------------------------------|
> > | 1/1 508.5 [ 0.2%/ 0.0%] 2.1% | 504.7 [ 1.1%/ 0.1%] -0.8% 2.1% | 509.0 [ 0.2%/ 0.0%] 0.1% 4.3% |
> > | 2/2 1000.2 [ 1.4%/ 0.1%] 4.1% | 1004.1 [ 1.6%/ 0.2%] 0.4% 4.1% | 997.6 [ 1.2%/ 0.1%] -0.3% 8.1% |
> > | 4/4 1912.1 [ 1.0%/ 0.1%] 7.9% | 1904.2 [ 1.1%/ 0.1%] -0.4% 7.9% | 1914.9 [ 1.3%/ 0.1%] 0.1% 15.1% |
> > | 8/8 3753.5 [ 0.3%/ 0.0%] 14.9% | 3748.2 [ 0.3%/ 0.0%] -0.1% 14.9% | 3751.3 [ 0.4%/ 0.0%] -0.1% 30.5% |
> > | 16/16 7139.3 [ 2.4%/ 0.2%] 30.3% | 7137.9 [ 1.8%/ 0.2%] -0.0% 30.3% | 7049.2 [ 2.4%/ 0.2%] -1.3% 60.4% |
> > | 32/32 10899.0 [ 4.2%/ 0.4%] 60.3% | 10780.3 [ 4.4%/ 0.4%] -1.1% 55.9% | 10339.2 [ 9.6%/ 0.9%] -5.1% 97.7% |
> > | 64/64 15086.1 [11.5%/ 1.2%] 97.7% | 14262.0 [ 8.2%/ 0.8%] -5.5% 82.0% | 11168.7 [22.2%/ 1.7%] -26.0% 100.0% |
> > |128/128 15371.9 [22.0%/ 2.2%] 100.0% | 14675.8 [14.4%/ 1.4%] -4.5% 82.8% | 10963.9 [18.5%/ 1.4%] -28.7% 100.0% |
> > |256/256 15990.8 [22.0%/ 2.2%] 100.0% | 12227.9 [10.3%/ 1.0%] -23.5% 73.2% | 10469.9 [19.6%/ 1.7%] -34.5% 100.0% |
> > '--------------------------------------------------------------------------------------------------------------------------------------'
>
> Very nice, thank you!
>
> What's interesting is how in the over-saturated case (the last three
> rows: 128, 256 and 512 total threads) coresched-SMT leaves 20-30% CPU
> performance on the floor according to the load figures.
Yeah, I found the next focus.
>
> Is this true idle time (which shows up as 'id' during 'top'), or some
> load average artifact?
>
vmstat reported intermediate CPU utilization at one-second intervals; it was
running simultaneously while the benchmarks ran. The cpu% is computed as
the average of the (100 - idle) series.
Thanks,
-Aubrey
* Aubrey Li <[email protected]> wrote:
> On Tue, Apr 30, 2019 at 12:01 AM Ingo Molnar <[email protected]> wrote:
> > * Li, Aubrey <[email protected]> wrote:
> >
> > > > I.e. showing the approximate CPU thread-load figure column would be
> > > > very useful too, where '50%' shows half-loaded, '100%' fully-loaded,
> > > > '200%' over-saturated, etc. - for each row?
> > >
> > > See below, hope this helps.
> > > .--------------------------------------------------------------------------------------------------------------------------------------.
> > > |NA/AVX vanilla-SMT [std% / sem%] cpu% |coresched-SMT [std% / sem%] +/- cpu% | no-SMT [std% / sem%] +/- cpu% |
> > > |--------------------------------------------------------------------------------------------------------------------------------------|
> > > | 1/1 508.5 [ 0.2%/ 0.0%] 2.1% | 504.7 [ 1.1%/ 0.1%] -0.8% 2.1% | 509.0 [ 0.2%/ 0.0%] 0.1% 4.3% |
> > > | 2/2 1000.2 [ 1.4%/ 0.1%] 4.1% | 1004.1 [ 1.6%/ 0.2%] 0.4% 4.1% | 997.6 [ 1.2%/ 0.1%] -0.3% 8.1% |
> > > | 4/4 1912.1 [ 1.0%/ 0.1%] 7.9% | 1904.2 [ 1.1%/ 0.1%] -0.4% 7.9% | 1914.9 [ 1.3%/ 0.1%] 0.1% 15.1% |
> > > | 8/8 3753.5 [ 0.3%/ 0.0%] 14.9% | 3748.2 [ 0.3%/ 0.0%] -0.1% 14.9% | 3751.3 [ 0.4%/ 0.0%] -0.1% 30.5% |
> > > | 16/16 7139.3 [ 2.4%/ 0.2%] 30.3% | 7137.9 [ 1.8%/ 0.2%] -0.0% 30.3% | 7049.2 [ 2.4%/ 0.2%] -1.3% 60.4% |
> > > | 32/32 10899.0 [ 4.2%/ 0.4%] 60.3% | 10780.3 [ 4.4%/ 0.4%] -1.1% 55.9% | 10339.2 [ 9.6%/ 0.9%] -5.1% 97.7% |
> > > | 64/64 15086.1 [11.5%/ 1.2%] 97.7% | 14262.0 [ 8.2%/ 0.8%] -5.5% 82.0% | 11168.7 [22.2%/ 1.7%] -26.0% 100.0% |
> > > |128/128 15371.9 [22.0%/ 2.2%] 100.0% | 14675.8 [14.4%/ 1.4%] -4.5% 82.8% | 10963.9 [18.5%/ 1.4%] -28.7% 100.0% |
> > > |256/256 15990.8 [22.0%/ 2.2%] 100.0% | 12227.9 [10.3%/ 1.0%] -23.5% 73.2% | 10469.9 [19.6%/ 1.7%] -34.5% 100.0% |
> > > '--------------------------------------------------------------------------------------------------------------------------------------'
> >
> > Very nice, thank you!
> >
> > What's interesting is how in the over-saturated case (the last three
> > rows: 128, 256 and 512 total threads) coresched-SMT leaves 20-30% CPU
> > performance on the floor according to the load figures.
>
> Yeah, I found the next focus.
>
> > Is this true idle time (which shows up as 'id' during 'top'), or some
> > load average artifact?
>
> vmstat periodically reported intermediate CPU utilization in one
> second, it was running simultaneously when the benchmarks run. The cpu%
> is computed by the average of (100-idle) series.
Ok - so 'vmstat' uses /proc/stat, which uses cpustat[CPUTIME_IDLE] (or
its NOHZ work-alike), so this should be true idle time - to the extent
the HZ process clock's sampling is accurate.
So I guess the answer to my question is "yes". ;-)
BTW., for robustness sake you might want to add iowait to idle time (it's
the 'wa' field of vmstat) - it shouldn't matter for this particular
benchmark which doesn't do much IO, but it might for others.
Both CPUTIME_IDLE and CPUTIME_IOWAIT are idle states when a CPU is not
utilized.
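As a reference point, here is a minimal user-space sketch of that accounting,
sampling /proc/stat directly and counting both idle and iowait as unused CPU
time. The one-second sampling interval and the standard /proc/stat field order
are the only assumptions:

/* Sketch: sample /proc/stat twice and report cpu%, counting iowait as idle. */
#include <stdio.h>
#include <unistd.h>

struct cpu_sample { unsigned long long total, unused; };

static int read_cpu(struct cpu_sample *s)
{
        unsigned long long usr, nic, sys, idl, iow, irq, sirq, stl;
        FILE *f = fopen("/proc/stat", "r");

        if (!f)
                return -1;
        /* Aggregate line: cpu user nice system idle iowait irq softirq steal ... */
        if (fscanf(f, "cpu %llu %llu %llu %llu %llu %llu %llu %llu",
                   &usr, &nic, &sys, &idl, &iow, &irq, &sirq, &stl) != 8) {
                fclose(f);
                return -1;
        }
        fclose(f);
        s->unused = idl + iow;  /* both mean "CPU not utilized" */
        s->total  = usr + nic + sys + idl + iow + irq + sirq + stl;
        return 0;
}

int main(void)
{
        struct cpu_sample a, b;

        if (read_cpu(&a))
                return 1;
        sleep(1);
        if (read_cpu(&b))
                return 1;
        if (b.total == a.total)
                return 1;       /* no ticks elapsed, nothing to report */
        printf("cpu%%: %.1f\n",
               100.0 * (double)((b.total - b.unused) - (a.total - a.unused)) /
               (double)(b.total - a.total));
        return 0;
}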
[ Side note: we should really implement precise idle time accounting when
CONFIG_IRQ_TIME_ACCOUNTING=y is enabled. We pay all the costs of the
timestamps, but AFAICS we don't propagate that into the idle cputime
metrics. ]
Thanks,
Ingo
On 4/28/19 11:15 PM, Aaron Lu wrote:
> On Tue, Apr 23, 2019 at 04:18:16PM +0000, Vineeth Remanan Pillai wrote:
>> +/*
>> + * Find left-most (aka, highest priority) task matching @cookie.
>> + */
>> +struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
>> +{
>> + struct rb_node *node = rq->core_tree.rb_node;
>> + struct task_struct *node_task, *match;
>> +
>> + /*
>> + * The idle task always matches any cookie!
>> + */
>> + match = idle_sched_class.pick_task(rq);
>> +
>> + while (node) {
>> + node_task = container_of(node, struct task_struct, core_node);
>> +
>> + if (node_task->core_cookie < cookie) {
>> + node = node->rb_left;
>
> Should go right here?
>
I think Aaron is correct. We order the rb tree where tasks with smaller core cookies
go to the left part of the tree.
In this case, the cookie we are looking for is larger than the current node's cookie.
It seems like we should move to the right to look for a node with matching cookie.
At least making the following change still allows us to run the system stably for sysbench.
Need to gather more data to see how performance changes.
Tim
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 25638a47c408..ed4cfa49e3f2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -208,9 +208,9 @@ static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
while (node) {
node_task = container_of(node, struct task_struct, core_node);
- if (node_task->core_cookie < cookie) {
+ if (cookie < node_task->core_cookie) {
node = node->rb_left;
- } else if (node_task->core_cookie > cookie) {
+ } else if (cookie > node_task->core_cookie) {
node = node->rb_right;
} else {
match = node_task;
On 5/1/19 4:27 PM, Tim Chen wrote:
> On 4/28/19 11:15 PM, Aaron Lu wrote:
>> On Tue, Apr 23, 2019 at 04:18:16PM +0000, Vineeth Remanan Pillai wrote:
>>> +/*
>>> + * Find left-most (aka, highest priority) task matching @cookie.
>>> + */
>>> +struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
>>> +{
>>> + struct rb_node *node = rq->core_tree.rb_node;
>>> + struct task_struct *node_task, *match;
>>> +
>>> + /*
>>> + * The idle task always matches any cookie!
>>> + */
>>> + match = idle_sched_class.pick_task(rq);
>>> +
>>> + while (node) {
>>> + node_task = container_of(node, struct task_struct, core_node);
>>> +
>>> + if (node_task->core_cookie < cookie) {
>>> + node = node->rb_left;
>>
>> Should go right here?
>>
>
> I think Aaron is correct. We order the rb tree where tasks with smaller core cookies
> go to the left part of the tree.
>
> In this case, the cookie we are looking for is larger than the current node's cookie.
> It seems like we should move to the right to look for a node with matching cookie.
>
> At least making the following change still allow us to run the system stably for sysbench.
> Need to gather more data to see how performance changes.
Pawan ran an experiment setting up 2 VMs, with one VM doing a parallel kernel build and one VM doing sysbench,
limiting both VMs to run on 16 cpu threads (8 physical cores), with 8 vcpu for each VM.
Making the fix did improve kernel build time by 7%.
Tim
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 25638a47c408..ed4cfa49e3f2 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -208,9 +208,9 @@ static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
> while (node) {
> node_task = container_of(node, struct task_struct, core_node);
>
> - if (node_task->core_cookie < cookie) {
> + if (cookie < node_task->core_cookie) {
> node = node->rb_left;
> - } else if (node_task->core_cookie > cookie) {
> + } else if (cookie > node_task->core_cookie) {
> node = node->rb_right;
> } else {
> match = node_task;
>
>
On 29-Apr-2019 11:53:21 AM, Aaron Lu wrote:
> On Tue, Apr 23, 2019 at 06:45:27PM +0000, Vineeth Remanan Pillai wrote:
> > >> - Processes with different tags can still share the core
> >
> > > I may have missed something... Could you explain this statement?
> >
> > > This, to me, is the whole point of the patch series. If it's not
> > > doing this then ... what?
> >
> > What I meant was, the patch needs some more work to be accurate.
> > There are some race conditions where the core violation can still
> > happen. In our testing, we saw around 1 to 5% of the time being
> > shared with incompatible processes. One example of this happening
> > is as follows(let cpu 0 and 1 be siblings):
> > - cpu 0 selects a process with a cookie
> > - cpu 1 selects a higher priority process without cookie
> > - Selection process restarts for cpu 0 and it might select a
> > process with cookie but with lesser priority.
> > - Since it is lesser priority, the logic in pick_next_task
> > doesn't compare again for the cookie(trusts pick_task) and
> > proceeds.
> >
> > This is one of the scenarios that we saw from traces, but there
> > might be other race conditions as well. Fix seems a little
> > involved and We are working on that.
>
> This is what I have used to make sure no two unmatched tasks being
> scheduled on the same core: (on top of v1, I thinks it's easier to just
> show the diff instead of commenting on various places of the patches :-)
We imported this fix in v2 and made some small changes and optimizations
(with and without Peter’s fix from https://lkml.org/lkml/2019/4/26/658)
and in both cases, the performance problem where the core can end up
idle with tasks in its runqueues came back.
This is pretty easy to reproduce with a multi-file disk write benchmark.
Here is the patch based on your changes applied on v2 (on top of Peter’s
fix):
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 07f3f0c..e09fa25 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3653,6 +3653,13 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
}
// XXX fairness/fwd progress conditions
+/*
+ * Returns
+ * - NULL if there is no runnable task for this class.
+ * - the highest priority task for this runqueue if it matches
+ * rq->core->core_cookie or its priority is greater than max.
+ * - Else returns idle_task.
+ */
static struct task_struct *
pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max)
{
@@ -3660,19 +3667,36 @@ pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *ma
unsigned long cookie = rq->core->core_cookie;
class_pick = class->pick_task(rq);
- if (!cookie)
+ if (!class_pick)
+ return NULL;
+
+ if (!cookie) {
+ /*
+ * If class_pick is tagged, return it only if it has
+ * higher priority than max.
+ */
+ if (max && class_pick->core_cookie &&
+ core_prio_less(class_pick, max))
+ return idle_sched_class.pick_task(rq);
+
+ return class_pick;
+ }
+
+ /*
+	 * If there is a cookie match here, return early.
+ */
+ if (class_pick->core_cookie == cookie)
return class_pick;
cookie_pick = sched_core_find(rq, cookie);
- if (!class_pick)
- return cookie_pick;
/*
* If class > max && class > cookie, it is the highest priority task on
* the core (so far) and it must be selected, otherwise we must go with
* the cookie pick in order to satisfy the constraint.
*/
- if (cpu_prio_less(cookie_pick, class_pick) && core_prio_less(max, class_pick))
+ if (cpu_prio_less(cookie_pick, class_pick) &&
+ (!max || core_prio_less(max, class_pick)))
return class_pick;
return cookie_pick;
@@ -3742,8 +3766,16 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
rq_i->core_pick = NULL;
- if (i != cpu)
+ if (i != cpu) {
update_rq_clock(rq_i);
+
+ /*
+ * If a sibling is idle, we can initiate an
+ * unconstrained pick.
+ */
+ if (is_idle_task(rq_i->curr) && prev_cookie)
+ prev_cookie = 0UL;
+ }
}
/*
@@ -3820,12 +3852,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/*
* If this new candidate is of higher priority than the
* previous; and they're incompatible; we need to wipe
- * the slate and start over.
+ * the slate and start over. pick_task makes sure that
+ * p's priority is more than max if it doesn't match
+ * max's cookie.
*
* NOTE: this is a linear max-filter and is thus bounded
* in execution time.
*/
- if (!max || core_prio_less(max, p)) {
+ if (!max || !cookie_match(max, p)) {
struct task_struct *old_max = max;
rq->core->core_cookie = p->core_cookie;
@@ -3833,7 +3867,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
trace_printk("max: %s/%d %lx\n", max->comm, max->pid, max->core_cookie);
- if (old_max && !cookie_match(old_max, p)) {
+ if (old_max) {
for_each_cpu(j, smt_mask) {
if (j == i)
continue;
@@ -3879,6 +3913,23 @@ next_class:;
trace_printk("picked: %s/%d %lx\n", next->comm, next->pid, next->core_cookie);
+ /* make sure we didn't break L1TF */
+ for_each_cpu(i, smt_mask) {
+ struct rq *rq_i = cpu_rq(i);
+ if (i == cpu)
+ continue;
+
+ if (likely(cookie_match(next, rq_i->core_pick)))
+ continue;
+
+ trace_printk("[%d]: cookie mismatch. %s/%d/0x%lx/0x%lx\n",
+ rq_i->cpu, rq_i->core_pick->comm,
+ rq_i->core_pick->pid,
+ rq_i->core_pick->core_cookie,
+ rq_i->core->core_cookie);
+ WARN_ON_ONCE(1);
+ }
+
done:
set_next_task(rq, next);
return next;
On Mon, May 06, 2019 at 03:39:37PM -0400, Julien Desfossez wrote:
> On 29-Apr-2019 11:53:21 AM, Aaron Lu wrote:
> > This is what I have used to make sure no two unmatched tasks being
> > scheduled on the same core: (on top of v1, I thinks it's easier to just
> > show the diff instead of commenting on various places of the patches :-)
>
> We imported this fix in v2 and made some small changes and optimizations
> (with and without Peter’s fix from https://lkml.org/lkml/2019/4/26/658)
> and in both cases, the performance problem where the core can end up
By 'core', do you mean a logical CPU(hyperthread) or the entire core?
> idle with tasks in its runqueues came back.
Assuming you meant a hyperthread, then the question is: when a hyperthread
is idle with tasks sitting in its runqueue, do these tasks match the
other hyperthread's rq->curr? If so, then it is a problem that needs to
be addressed; if not, then this is due to the constraint imposed by the
mitigation of L1TF.
Thanks.
On Fri, May 3, 2019 at 8:06 AM Tim Chen <[email protected]> wrote:
>
> On 5/1/19 4:27 PM, Tim Chen wrote:
> > On 4/28/19 11:15 PM, Aaron Lu wrote:
> >> On Tue, Apr 23, 2019 at 04:18:16PM +0000, Vineeth Remanan Pillai wrote:
> >>> +/*
> >>> + * Find left-most (aka, highest priority) task matching @cookie.
> >>> + */
> >>> +struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
> >>> +{
> >>> + struct rb_node *node = rq->core_tree.rb_node;
> >>> + struct task_struct *node_task, *match;
> >>> +
> >>> + /*
> >>> + * The idle task always matches any cookie!
> >>> + */
> >>> + match = idle_sched_class.pick_task(rq);
> >>> +
> >>> + while (node) {
> >>> + node_task = container_of(node, struct task_struct, core_node);
> >>> +
> >>> + if (node_task->core_cookie < cookie) {
> >>> + node = node->rb_left;
> >>
> >> Should go right here?
> >>
> >
> > I think Aaron is correct. We order the rb tree where tasks with smaller core cookies
> > go to the left part of the tree.
> >
> > In this case, the cookie we are looking for is larger than the current node's cookie.
> > It seems like we should move to the right to look for a node with matching cookie.
> >
> > At least making the following change still allow us to run the system stably for sysbench.
> > Need to gather more data to see how performance changes.
>
> Pawan ran an experiment setting up 2 VMs, with one VM doing a parallel kernel build and one VM doing sysbench,
> limiting both VMs to run on 16 cpu threads (8 physical cores), with 8 vcpu for each VM.
> Making the fix did improve kernel build time by 7%.
I'm gonna agree with the patch below, but I just wonder if the testing
result is consistent, as I didn't see any improvement in my testing
environment.
IIUC, from the code behavior, especially for the 2-VMs case (only 2
different cookies), the per-rq rb tree is unlikely to have nodes with
different cookies, that is, all the nodes on this tree should have the
same cookie, so:
- if the parameter cookie is equal to the rb tree cookie, we meet a
match and go to the third branch
- else, no matter whether we go left or right, we can't find a match,
and we'll return the idle thread in the end.
Please correct me if I was wrong.
Thanks,
-Aubrey
>
> Tim
>
>
> >
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 25638a47c408..ed4cfa49e3f2 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -208,9 +208,9 @@ static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
> > while (node) {
> > node_task = container_of(node, struct task_struct, core_node);
> >
> > - if (node_task->core_cookie < cookie) {
> > + if (cookie < node_task->core_cookie) {
> > node = node->rb_left;
> > - } else if (node_task->core_cookie > cookie) {
> > + } else if (cookie > node_task->core_cookie) {
> > node = node->rb_right;
> > } else {
> > match = node_task;
> >
> >
>
On 5/8/19 11:19 AM, Subhra Mazumdar wrote:
>
> On 5/8/19 8:49 AM, Aubrey Li wrote:
>>> Pawan ran an experiment setting up 2 VMs, with one VM doing a
>>> parallel kernel build and one VM doing sysbench,
>>> limiting both VMs to run on 16 cpu threads (8 physical cores), with
>>> 8 vcpu for each VM.
>>> Making the fix did improve kernel build time by 7%.
>> I'm gonna agree with the patch below, but just wonder if the testing
>> result is consistent,
>> as I didn't see any improvement in my testing environment.
>>
>> IIUC, from the code behavior, especially for 2 VMs case(only 2
>> different cookies), the
>> per-rq rb tree unlikely has nodes with different cookies, that is, all
>> the nodes on this
>> tree should have the same cookie, so:
>> - if the parameter cookie is equal to the rb tree cookie, we meet a
>> match and go the
>> third branch
>> - else, no matter we go left or right, we can't find a match, and
>> we'll return idle thread
>> finally.
>>
>> Please correct me if I was wrong.
>>
>> Thanks,
>> -Aubrey
> This is searching in the per core rb tree (rq->core_tree) which can have
> 2 different cookies. But having said that, even I didn't see any
> improvement with the patch for my DB test case. But logically it is
> correct.
>
Ah, my bad. It is per rq. But it can still have 2 different cookies. Not sure
why you think it is unlikely?
On 5/8/19 8:49 AM, Aubrey Li wrote:
>> Pawan ran an experiment setting up 2 VMs, with one VM doing a parallel kernel build and one VM doing sysbench,
>> limiting both VMs to run on 16 cpu threads (8 physical cores), with 8 vcpu for each VM.
>> Making the fix did improve kernel build time by 7%.
> I'm gonna agree with the patch below, but just wonder if the testing
> result is consistent,
> as I didn't see any improvement in my testing environment.
>
> IIUC, from the code behavior, especially for 2 VMs case(only 2
> different cookies), the
> per-rq rb tree unlikely has nodes with different cookies, that is, all
> the nodes on this
> tree should have the same cookie, so:
> - if the parameter cookie is equal to the rb tree cookie, we meet a
> match and go the
> third branch
> - else, no matter we go left or right, we can't find a match, and
> we'll return idle thread
> finally.
>
> Please correct me if I was wrong.
>
> Thanks,
> -Aubrey
This is searching in the per core rb tree (rq->core_tree) which can have
2 different cookies. But having said that, even I didn't see any
improvement with the patch for my DB test case. But logically it is
correct.
On 08-May-2019 10:30:09 AM, Aaron Lu wrote:
> On Mon, May 06, 2019 at 03:39:37PM -0400, Julien Desfossez wrote:
> > On 29-Apr-2019 11:53:21 AM, Aaron Lu wrote:
> > > This is what I have used to make sure no two unmatched tasks being
> > > scheduled on the same core: (on top of v1, I thinks it's easier to just
> > > show the diff instead of commenting on various places of the patches :-)
> >
> > We imported this fix in v2 and made some small changes and optimizations
> > (with and without Peter’s fix from https://lkml.org/lkml/2019/4/26/658)
> > and in both cases, the performance problem where the core can end up
>
> By 'core', do you mean a logical CPU(hyperthread) or the entire core?
No, I really meant the entire core.
I’m sorry, I should have added a little bit more context. This relates
to a performance issue we saw in v1 and discussed here:
https://lore.kernel.org/lkml/[email protected]/T/#mb9f1f54a99bac468fc5c55b06a9da306ff48e90b
We proposed a fix that solved this, Peter came up with a better one
(https://lkml.org/lkml/2019/4/26/658), but if we add your isolation fix
as posted above, the same problem reappears. Hope this clarifies your
ask.
I hope that we did not miss anything crucial while integrating your fix
on top of v2 + Peter’s fix. The changes are conceptually similar, but we
refactored it slightly to make the logic clear. Please have a look and
let us know.
Thanks,
Julien
On Thu, May 9, 2019 at 2:41 AM Subhra Mazumdar
<[email protected]> wrote:
>
>
> On 5/8/19 11:19 AM, Subhra Mazumdar wrote:
> >
> > On 5/8/19 8:49 AM, Aubrey Li wrote:
> >>> Pawan ran an experiment setting up 2 VMs, with one VM doing a
> >>> parallel kernel build and one VM doing sysbench,
> >>> limiting both VMs to run on 16 cpu threads (8 physical cores), with
> >>> 8 vcpu for each VM.
> >>> Making the fix did improve kernel build time by 7%.
> >> I'm gonna agree with the patch below, but just wonder if the testing
> >> result is consistent,
> >> as I didn't see any improvement in my testing environment.
> >>
> >> IIUC, from the code behavior, especially for 2 VMs case(only 2
> >> different cookies), the
> >> per-rq rb tree unlikely has nodes with different cookies, that is, all
> >> the nodes on this
> >> tree should have the same cookie, so:
> >> - if the parameter cookie is equal to the rb tree cookie, we meet a
> >> match and go the
> >> third branch
> >> - else, no matter we go left or right, we can't find a match, and
> >> we'll return idle thread
> >> finally.
> >>
> >> Please correct me if I was wrong.
> >>
> >> Thanks,
> >> -Aubrey
> > This is searching in the per core rb tree (rq->core_tree) which can have
> > 2 different cookies. But having said that, even I didn't see any
> > improvement with the patch for my DB test case. But logically it is
> > correct.
> >
> Ah, my bad. It is per rq. But still can have 2 different cookies. Not sure
> why you think it is unlikely?
Yeah, I meant 2 different cookies on the system, but it is unlikely to have
2 different cookies on the same rq.
If I read the source correctly, for the sched_core_balance path, when we try
to steal a cookie from another CPU, sched_core_find() uses dst's cookie to
search for a cookie match in src's rq; sched_core_find() returns the idle or
matched task, and the matched task is later put onto dst's rq (activate_task()
in sched_core_find()). At this moment, the nodes on the rq's rb tree should
have the same cookie.
Thanks,
-Aubrey
On 5/8/19 5:01 PM, Aubrey Li wrote:
> On Thu, May 9, 2019 at 2:41 AM Subhra Mazumdar
> <[email protected]> wrote:
>>
>> On 5/8/19 11:19 AM, Subhra Mazumdar wrote:
>>> On 5/8/19 8:49 AM, Aubrey Li wrote:
>>>>> Pawan ran an experiment setting up 2 VMs, with one VM doing a
>>>>> parallel kernel build and one VM doing sysbench,
>>>>> limiting both VMs to run on 16 cpu threads (8 physical cores), with
>>>>> 8 vcpu for each VM.
>>>>> Making the fix did improve kernel build time by 7%.
>>>> I'm gonna agree with the patch below, but just wonder if the testing
>>>> result is consistent,
>>>> as I didn't see any improvement in my testing environment.
>>>>
>>>> IIUC, from the code behavior, especially for 2 VMs case(only 2
>>>> different cookies), the
>>>> per-rq rb tree unlikely has nodes with different cookies, that is, all
>>>> the nodes on this
>>>> tree should have the same cookie, so:
>>>> - if the parameter cookie is equal to the rb tree cookie, we meet a
>>>> match and go the
>>>> third branch
>>>> - else, no matter we go left or right, we can't find a match, and
>>>> we'll return idle thread
>>>> finally.
>>>>
>>>> Please correct me if I was wrong.
>>>>
>>>> Thanks,
>>>> -Aubrey
>>> This is searching in the per core rb tree (rq->core_tree) which can have
>>> 2 different cookies. But having said that, even I didn't see any
>>> improvement with the patch for my DB test case. But logically it is
>>> correct.
>>>
>> Ah, my bad. It is per rq. But still can have 2 different cookies. Not sure
>> why you think it is unlikely?
> Yeah, I meant 2 different cookies on the system, but unlikely 2
> different cookies
> on one same rq.
>
> If I read the source correctly, for the sched_core_balance path, when try to
> steal cookie from another CPU, sched_core_find() uses dst's cookie to search
> if there is a cookie match in src's rq, and sched_core_find() returns idle or
> matched task, and later put this matched task onto dst's rq (activate_task() in
> sched_core_find()). At this moment, the nodes on the rq's rb tree should have
> same cookies.
>
> Thanks,
> -Aubrey
Yes, but sched_core_find is also called from pick_task to find a local
matching task. The enqueue side logic of the scheduler is unchanged with
core scheduling, so it is possible tasks with different cookies are
enqueued on the same rq. So when searching for a matching task locally,
doing it correctly should matter.
On Thu, May 9, 2019 at 8:29 AM Subhra Mazumdar
<[email protected]> wrote:
>
>
> On 5/8/19 5:01 PM, Aubrey Li wrote:
> > On Thu, May 9, 2019 at 2:41 AM Subhra Mazumdar
> > <[email protected]> wrote:
> >>
> >> On 5/8/19 11:19 AM, Subhra Mazumdar wrote:
> >>> On 5/8/19 8:49 AM, Aubrey Li wrote:
> >>>>> Pawan ran an experiment setting up 2 VMs, with one VM doing a
> >>>>> parallel kernel build and one VM doing sysbench,
> >>>>> limiting both VMs to run on 16 cpu threads (8 physical cores), with
> >>>>> 8 vcpu for each VM.
> >>>>> Making the fix did improve kernel build time by 7%.
> >>>> I'm gonna agree with the patch below, but just wonder if the testing
> >>>> result is consistent,
> >>>> as I didn't see any improvement in my testing environment.
> >>>>
> >>>> IIUC, from the code behavior, especially for 2 VMs case(only 2
> >>>> different cookies), the
> >>>> per-rq rb tree unlikely has nodes with different cookies, that is, all
> >>>> the nodes on this
> >>>> tree should have the same cookie, so:
> >>>> - if the parameter cookie is equal to the rb tree cookie, we meet a
> >>>> match and go the
> >>>> third branch
> >>>> - else, no matter we go left or right, we can't find a match, and
> >>>> we'll return idle thread
> >>>> finally.
> >>>>
> >>>> Please correct me if I was wrong.
> >>>>
> >>>> Thanks,
> >>>> -Aubrey
> >>> This is searching in the per core rb tree (rq->core_tree) which can have
> >>> 2 different cookies. But having said that, even I didn't see any
> >>> improvement with the patch for my DB test case. But logically it is
> >>> correct.
> >>>
> >> Ah, my bad. It is per rq. But still can have 2 different cookies. Not sure
> >> why you think it is unlikely?
> > Yeah, I meant 2 different cookies on the system, but unlikely 2
> > different cookies
> > on one same rq.
> >
> > If I read the source correctly, for the sched_core_balance path, when try to
> > steal cookie from another CPU, sched_core_find() uses dst's cookie to search
> > if there is a cookie match in src's rq, and sched_core_find() returns idle or
> > matched task, and later put this matched task onto dst's rq (activate_task() in
> > sched_core_find()). At this moment, the nodes on the rq's rb tree should have
> > same cookies.
> >
> > Thanks,
> > -Aubrey
> Yes, but sched_core_find is also called from pick_task to find a local
> matching task.
Can a local search introduce different cookies? Where would they come from?
> The enqueue side logic of the scheduler is unchanged with
> core scheduling,
But only tasks with cookies are placed onto this rb tree?
> so it is possible tasks with different cookies are
> enqueued on the same rq. So while searching for a matching task locally
> doing it correctly should matter.
May I know how exactly?
Thanks,
-Aubrey
On Wed, May 08, 2019 at 01:49:09PM -0400, Julien Desfossez wrote:
> On 08-May-2019 10:30:09 AM, Aaron Lu wrote:
> > On Mon, May 06, 2019 at 03:39:37PM -0400, Julien Desfossez wrote:
> > > On 29-Apr-2019 11:53:21 AM, Aaron Lu wrote:
> > > > This is what I have used to make sure no two unmatched tasks being
> > > > scheduled on the same core: (on top of v1, I thinks it's easier to just
> > > > show the diff instead of commenting on various places of the patches :-)
> > >
> > > We imported this fix in v2 and made some small changes and optimizations
> > > (with and without Peter’s fix from https://lkml.org/lkml/2019/4/26/658)
> > > and in both cases, the performance problem where the core can end up
> >
> > By 'core', do you mean a logical CPU(hyperthread) or the entire core?
> No I really meant the entire core.
>
> I’m sorry, I should have added a little bit more context. This relates
> to a performance issue we saw in v1 and discussed here:
> https://lore.kernel.org/lkml/[email protected]/T/#mb9f1f54a99bac468fc5c55b06a9da306ff48e90b
>
> We proposed a fix that solved this, Peter came up with a better one
> (https://lkml.org/lkml/2019/4/26/658), but if we add your isolation fix
> as posted above, the same problem reappears. Hope this clarifies your
> ask.
It's clear now, thanks.
I don't immediately see how my isolation fix would make your fix stop
working; I will need to check. But I'm busy with other stuff, so it will
take a while.
>
> I hope that we did not miss anything crucial while integrating your fix
> on top of v2 + Peter’s fix. The changes are conceptually similar, but we
> refactored it slightly to make the logic clear. Please have a look and
> let us know
I suppose you already have a branch that has all the bits there? I
wonder if you can share that branch somewhere so I can start working on
top of it to make sure we are on the same page?
Also, it would be good if you can share the workload, cmdline options,
how many workers need to be started, etc., to reproduce this issue.
Thanks.
On 5/8/19 6:38 PM, Aubrey Li wrote:
> On Thu, May 9, 2019 at 8:29 AM Subhra Mazumdar
> <[email protected]> wrote:
>>
>> On 5/8/19 5:01 PM, Aubrey Li wrote:
>>> On Thu, May 9, 2019 at 2:41 AM Subhra Mazumdar
>>> <[email protected]> wrote:
>>>> On 5/8/19 11:19 AM, Subhra Mazumdar wrote:
>>>>> On 5/8/19 8:49 AM, Aubrey Li wrote:
>>>>>>> Pawan ran an experiment setting up 2 VMs, with one VM doing a
>>>>>>> parallel kernel build and one VM doing sysbench,
>>>>>>> limiting both VMs to run on 16 cpu threads (8 physical cores), with
>>>>>>> 8 vcpu for each VM.
>>>>>>> Making the fix did improve kernel build time by 7%.
>>>>>> I'm gonna agree with the patch below, but just wonder if the testing
>>>>>> result is consistent,
>>>>>> as I didn't see any improvement in my testing environment.
>>>>>>
>>>>>> IIUC, from the code behavior, especially for 2 VMs case(only 2
>>>>>> different cookies), the
>>>>>> per-rq rb tree unlikely has nodes with different cookies, that is, all
>>>>>> the nodes on this
>>>>>> tree should have the same cookie, so:
>>>>>> - if the parameter cookie is equal to the rb tree cookie, we meet a
>>>>>> match and go the
>>>>>> third branch
>>>>>> - else, no matter we go left or right, we can't find a match, and
>>>>>> we'll return idle thread
>>>>>> finally.
>>>>>>
>>>>>> Please correct me if I was wrong.
>>>>>>
>>>>>> Thanks,
>>>>>> -Aubrey
>>>>> This is searching in the per core rb tree (rq->core_tree) which can have
>>>>> 2 different cookies. But having said that, even I didn't see any
>>>>> improvement with the patch for my DB test case. But logically it is
>>>>> correct.
>>>>>
>>>> Ah, my bad. It is per rq. But still can have 2 different cookies. Not sure
>>>> why you think it is unlikely?
>>> Yeah, I meant 2 different cookies on the system, but unlikely 2
>>> different cookies
>>> on one same rq.
>>>
>>> If I read the source correctly, for the sched_core_balance path, when try to
>>> steal cookie from another CPU, sched_core_find() uses dst's cookie to search
>>> if there is a cookie match in src's rq, and sched_core_find() returns idle or
>>> matched task, and later put this matched task onto dst's rq (activate_task() in
>>> sched_core_find()). At this moment, the nodes on the rq's rb tree should have
>>> same cookies.
>>>
>>> Thanks,
>>> -Aubrey
>> Yes, but sched_core_find is also called from pick_task to find a local
>> matching task.
> Can a local searching introduce a different cookies? Where is it from?
No. I meant the local search uses the same binary search as sched_core_find,
so it has to be correct.
>
>> The enqueue side logic of the scheduler is unchanged with
>> core scheduling,
> But only the task with cookies is placed onto this rb tree?
>
>> so it is possible tasks with different cookies are
>> enqueued on the same rq. So while searching for a matching task locally
>> doing it correctly should matter.
> May I know how exactly?
select_task_rq_* seems to be unchanged. So the search logic to find a cpu
to enqueue when a task becomes runnable is the same as before and doesn't do
any kind of cookie matching.
>
> Thanks,
> -Aubrey
On Thu, May 9, 2019 at 10:14 AM Subhra Mazumdar
<[email protected]> wrote:
>
>
> On 5/8/19 6:38 PM, Aubrey Li wrote:
> > On Thu, May 9, 2019 at 8:29 AM Subhra Mazumdar
> > <[email protected]> wrote:
> >>
> >> On 5/8/19 5:01 PM, Aubrey Li wrote:
> >>> On Thu, May 9, 2019 at 2:41 AM Subhra Mazumdar
> >>> <[email protected]> wrote:
> >>>> On 5/8/19 11:19 AM, Subhra Mazumdar wrote:
> >>>>> On 5/8/19 8:49 AM, Aubrey Li wrote:
> >>>>>>> Pawan ran an experiment setting up 2 VMs, with one VM doing a
> >>>>>>> parallel kernel build and one VM doing sysbench,
> >>>>>>> limiting both VMs to run on 16 cpu threads (8 physical cores), with
> >>>>>>> 8 vcpu for each VM.
> >>>>>>> Making the fix did improve kernel build time by 7%.
> >>>>>> I'm gonna agree with the patch below, but just wonder if the testing
> >>>>>> result is consistent,
> >>>>>> as I didn't see any improvement in my testing environment.
> >>>>>>
> >>>>>> IIUC, from the code behavior, especially for 2 VMs case(only 2
> >>>>>> different cookies), the
> >>>>>> per-rq rb tree unlikely has nodes with different cookies, that is, all
> >>>>>> the nodes on this
> >>>>>> tree should have the same cookie, so:
> >>>>>> - if the parameter cookie is equal to the rb tree cookie, we meet a
> >>>>>> match and go the
> >>>>>> third branch
> >>>>>> - else, no matter we go left or right, we can't find a match, and
> >>>>>> we'll return idle thread
> >>>>>> finally.
> >>>>>>
> >>>>>> Please correct me if I was wrong.
> >>>>>>
> >>>>>> Thanks,
> >>>>>> -Aubrey
> >>>>> This is searching in the per core rb tree (rq->core_tree) which can have
> >>>>> 2 different cookies. But having said that, even I didn't see any
> >>>>> improvement with the patch for my DB test case. But logically it is
> >>>>> correct.
> >>>>>
> >>>> Ah, my bad. It is per rq. But still can have 2 different cookies. Not sure
> >>>> why you think it is unlikely?
> >>> Yeah, I meant 2 different cookies on the system, but unlikely 2
> >>> different cookies
> >>> on one same rq.
> >>>
> >>> If I read the source correctly, for the sched_core_balance path, when try to
> >>> steal cookie from another CPU, sched_core_find() uses dst's cookie to search
> >>> if there is a cookie match in src's rq, and sched_core_find() returns idle or
> >>> matched task, and later put this matched task onto dst's rq (activate_task() in
> >>> sched_core_find()). At this moment, the nodes on the rq's rb tree should have
> >>> same cookies.
> >>>
> >>> Thanks,
> >>> -Aubrey
> >> Yes, but sched_core_find is also called from pick_task to find a local
> >> matching task.
> > Can a local searching introduce a different cookies? Where is it from?
> No. I meant the local search uses the same binary search of sched_core_find
> so it has to be correct.
> >
> >> The enqueue side logic of the scheduler is unchanged with
> >> core scheduling,
> > But only the task with cookies is placed onto this rb tree?
> >
> >> so it is possible tasks with different cookies are
> >> enqueued on the same rq. So while searching for a matching task locally
> >> doing it correctly should matter.
> > May I know how exactly?
> select_task_rq_* seems to be unchanged. So the search logic to find a cpu
> to enqueue when a task becomes runnable is same as before and doesn't do
> any kind of cookie matching.
Okay, that's true in the task wakeup path, and load_balance also seems to pull
tasks without checking the cookie. But my system was not overloaded when I
tested this patch, so there was at most one task in the rq and on the rq's rb
tree, and this patch does not make a difference.
The question is, should we do cookie checking when a task selects a CPU and
when load balancing pulls a task onto a CPU?
Thanks,
-Aubrey
>> select_task_rq_* seems to be unchanged. So the search logic to find a cpu
>> to enqueue when a task becomes runnable is same as before and doesn't do
>> any kind of cookie matching.
> Okay, that's true in task wakeup path, and also load_balance seems to pull task
> without checking cookie too. But my system is not over loaded when I tested this
> patch, so there is none or only one task in rq and on the rq's rb
> tree, so this patch
> does not make a difference.
I had same hypothesis for my tests.
>
> The question is, should we do cookie checking for task selecting CPU and load
> balance CPU pulling task?
The basic issue is keeping the CPUs busy. In the case of an overloaded system,
the trivial new idle balancer should be able to find a matching task
in case of forced idle. More problematic is the lower-load scenario, when
there aren't any matching tasks to be found but there are runnable tasks of
other groups. Also, the wakeup code path tries to balance threads across cores
(select_idle_core) first, which is the opposite of what core scheduling wants.
I will re-run my tests with select_idle_core disabled, but the issue is that
on x86 Intel systems (my test rig) the CPU ids are interleaved across cores,
so even select_idle_cpu will balance across cores first. Maybe others have
some better ideas?
>
> Thanks,
> -Aubrey
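To make the cookie-checking question above concrete, here is an
illustrative-only sketch of what a cookie filter at task-placement time could
look like. The helper name and its call sites (say, the candidate loops in
select_idle_sibling() or can_migrate_task()) are assumptions and not part of
the posted series; cpu_rq(), available_idle_cpu() and the series' per-task
core_cookie field are used as-is:

static inline bool cookie_compatible_cpu(struct task_struct *p, int cpu)
{
        struct rq *rq = cpu_rq(cpu);

        /*
         * An idle CPU can take anything; core-wide constraints are
         * resolved later by pick_next_task().
         */
        if (available_idle_cpu(cpu))
                return true;

        /* Untagged task: avoid CPUs currently running a tagged task. */
        if (!p->core_cookie)
                return !rq->curr->core_cookie;

        /* Tagged task: prefer CPUs already running the same cookie. */
        return rq->curr->core_cookie == p->core_cookie;
}

Whether filtering like this helps or just fights the load balancer is exactly
the open question here; at best it would bias wakeups toward cookie-compatible
siblings instead of spreading across cores first.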
On 5/9/19 10:50 AM, Subhra Mazumdar wrote:
>
>>> select_task_rq_* seems to be unchanged. So the search logic to find a cpu
>>> to enqueue when a task becomes runnable is same as before and doesn't do
>>> any kind of cookie matching.
>> Okay, that's true in task wakeup path, and also load_balance seems to pull task
>> without checking cookie too. But my system is not over loaded when I tested this
>> patch, so there is none or only one task in rq and on the rq's rb
>> tree, so this patch
>> does not make a difference.
> I had same hypothesis for my tests.
>>
>> The question is, should we do cookie checking for task selecting CPU and load
>> balance CPU pulling task?
> The basic issue is keeping the CPUs busy. In case of overloaded system,
> the trivial new idle balancer should be able to find a matching task
> in case of forced idle. More problematic is the lower load scenario when
> there aren't any matching task to be found but there are runnable tasks of
> other groups. Also wake up code path tries to balance threads across cores
> (select_idle_core) first which is opposite of what core scheduling wants.
> I will re-run my tests with select_idle_core disabled, but the issue is
> on x86 Intel systems (my test rig) the CPU ids are interleaved across cores
> so even select_idle_cpu will balance across cores first. May be others have
> some better ideas?
>>
We did an experiment on a Coffee Lake desktop that has 6 cores to see how load
balancing works for core scheduling.
In a nutshell, it seems like for workloads such as sysbench that are constant
and don't have many sleeps/wakeups, the load balancer is doing a pretty
good job, right on the money. However, when we are overcommitting the
cpus heavily, and the load is non-constant with I/Os and lots of forks,
like a kernel build, it is much harder to get tasks placed optimally.
We set up two VMs, each in its own cgroup. In one VM, we run the
benchmark. In the other VM, we run a cpu hog task for each vcpu to
provide a constant background load.
The HT-on case with no core scheduling is used as the baseline performance.
There are 6 cores on the Coffee Lake test system. We pick the 3, 6 and 12
vcpu cases for each VM to look at the 1/2-occupied, fully occupied
and 2x-occupied system when HT is used.
Sysbench (Great for core sched)
                  Core Sched                HT off
                  ----------                ------
                  avg perf (std dev)        avg perf (std dev)
 3vcpu/VM         +0.37%   (0.18%)          -1.52%   (0.17%)
 6vcpu/VM         -3.36%   (2.04%)          -31.72%  (0.13%)
12vcpu/VM         +1.02%   (1.17%)          -31.03%  (0.07%)

Kernel build (Difficult for core sched)
                  Core Sched                HT off
                  ----------                ------
                  avg perf (std dev)        avg perf (std dev)
 3vcpu/VM         +0.05%   (1.21%)          -3.66%   (0.81%)
 6vcpu/VM         -30.41%  (3.03%)          -40.73%  (1.53%)
12vcpu/VM         -34.03%  (2.77%)          -24.87%  (1.22%)
Tim
On Mon, Apr 29, 2019 at 11:36:22AM +0800, Aaron Lu wrote:
> On Tue, Apr 23, 2019 at 04:18:16PM +0000, Vineeth Remanan Pillai wrote:
> > +/*
> > + * l(a,b)
> > + * le(a,b) := !l(b,a)
> > + * g(a,b) := l(b,a)
> > + * ge(a,b) := !l(a,b)
> > + */
> > +
> > +/* real prio, less is less */
> > +static inline bool __prio_less(struct task_struct *a, struct task_struct *b, bool core_cmp)
> > +{
> > + u64 vruntime;
> > +
> > + int pa = __task_prio(a), pb = __task_prio(b);
> > +
> > + if (-pa < -pb)
> > + return true;
> > +
> > + if (-pb < -pa)
> > + return false;
> > +
> > + if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
> > + return !dl_time_before(a->dl.deadline, b->dl.deadline);
> > +
> > + vruntime = b->se.vruntime;
> > + if (core_cmp) {
> > + vruntime -= task_cfs_rq(b)->min_vruntime;
> > + vruntime += task_cfs_rq(a)->min_vruntime;
> > + }
> > + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
> > + return !((s64)(a->se.vruntime - vruntime) <= 0);
> > +
> > + return false;
> > +}
>
> This unfortunately still doesn't work.
>
> Consider the following task layout on two sibling CPUs(cpu0 and cpu1):
>
> rq0.cfs_rq rq1.cfs_rq
> | |
> se_bash se_hog
>
> se_hog is the sched_entity for a cpu intensive task and se_bash is the
> sched_entity for bash.
>
> There are two problems:
> 1 SCHED_DEBIT
> when user execute some commands through bash, say ls, bash will fork.
> The newly forked ls' vruntime is set in the future due to SCHED_DEBIT.
> This made 'ls' lose in __prio_less() when compared with hog, whose
> vruntime may very likely be the same as its cfs_rq's min_vruntime.
>
> This is OK since we do not want forked process to starve already running
> ones. The problem is, since hog keeps running, its vruntime will always
> sync with its cfs_rq's min_vruntime. OTOH, 'ls' can not run, its
> cfs_rq's min_vruntime doesn't proceed, making 'ls' always lose to hog.
>
> 2 who schedules, who wins
> so I disabled SCHED_DEBIT, for testing's purpose. When cpu0 schedules,
> ls could win where both sched_entity's vruntime is the same as their
> cfs_rqs' min_vruntime. So does hog: when cpu1 schedules, hog can preempt
> ls in the same way. The end result is, interactive task can lose to cpu
> intensive task and ls can feel "dead".
>
> I haven't figured out a way to solve this yet. A core wide cfs_rq's
> min_vruntime can probably solve this. Your suggestions are appreciated.
multi-queue virtual time is 'interesting'. I worked it out once and then
my head hurt, I've forgotten the details again. Esp. when combined with
affinity masks the simple things don't work right. For every
non-feasible weight scenario it comes apart.
I know pjt has an approximation somewhere that might work for us; but I
forgot those details again too.
One possible hack would be to allow min_vruntime to go backwards when
there is only a single task present; basically have min_vruntime =
p->vruntime when you enqueue the first task.
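As an illustrative-only sketch of that hack (the helper name and the exact
hook point, say right after the entity is accounted in enqueue_entity(), are
assumptions, not something posted in this series):

/*
 * When a cfs_rq goes from empty to a single task, resync min_vruntime to
 * that task's vruntime, allowing it to go backwards, so that an idle
 * sibling's stale min_vruntime does not keep losing the cross-CPU
 * comparison in __prio_less().
 */
static inline void resync_min_vruntime(struct cfs_rq *cfs_rq,
                                       struct sched_entity *se)
{
        /* Called after accounting, so nr_running == 1 means @se is alone. */
        if (cfs_rq->nr_running == 1)
                cfs_rq->min_vruntime = se->vruntime;
}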
On 23-Apr-2019 04:18:17 PM, Vineeth Remanan Pillai wrote:
> From: Peter Zijlstra (Intel) <[email protected]>
>
> Marks all tasks in a cgroup as matching for core-scheduling.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> kernel/sched/core.c | 62 ++++++++++++++++++++++++++++++++++++++++++++
> kernel/sched/sched.h | 4 +++
> 2 files changed, 66 insertions(+)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 5066a1493acf..e5bdc1c4d8d7 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6658,6 +6658,15 @@ static void sched_change_group(struct task_struct *tsk, int type)
> tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
> struct task_group, css);
> tg = autogroup_task_group(tsk, tg);
> +
> +#ifdef CONFIG_SCHED_CORE
> + if ((unsigned long)tsk->sched_task_group == tsk->core_cookie)
> + tsk->core_cookie = 0UL;
> +
> + if (tg->tagged /* && !tsk->core_cookie ? */)
> + tsk->core_cookie = (unsigned long)tg;
> +#endif
> +
> tsk->sched_task_group = tg;
>
> #ifdef CONFIG_FAIR_GROUP_SCHED
> @@ -7117,6 +7126,43 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
> }
> #endif /* CONFIG_RT_GROUP_SCHED */
>
> +#ifdef CONFIG_SCHED_CORE
> +static u64 cpu_core_tag_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
> +{
> + struct task_group *tg = css_tg(css);
> +
> + return !!tg->tagged;
> +}
> +
> +static int cpu_core_tag_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, u64 val)
> +{
> + struct task_group *tg = css_tg(css);
> + struct css_task_iter it;
> + struct task_struct *p;
> +
> + if (val > 1)
> + return -ERANGE;
> +
> + if (tg->tagged == !!val)
> + return 0;
> +
> + tg->tagged = !!val;
> +
> + if (!!val)
> + sched_core_get();
> +
> + css_task_iter_start(css, 0, &it);
> + while ((p = css_task_iter_next(&it)))
> + p->core_cookie = !!val ? (unsigned long)tg : 0UL;
> + css_task_iter_end(&it);
> +
> + if (!val)
> + sched_core_put();
> +
> + return 0;
> +}
> +#endif
> +
> static struct cftype cpu_legacy_files[] = {
> #ifdef CONFIG_FAIR_GROUP_SCHED
> {
> @@ -7152,6 +7198,14 @@ static struct cftype cpu_legacy_files[] = {
> .read_u64 = cpu_rt_period_read_uint,
> .write_u64 = cpu_rt_period_write_uint,
> },
> +#endif
> +#ifdef CONFIG_SCHED_CORE
> + {
> + .name = "tag",
> + .flags = CFTYPE_NOT_ON_ROOT,
> + .read_u64 = cpu_core_tag_read_u64,
> + .write_u64 = cpu_core_tag_write_u64,
> + },
> #endif
> { } /* Terminate */
> };
> @@ -7319,6 +7373,14 @@ static struct cftype cpu_files[] = {
> .seq_show = cpu_max_show,
> .write = cpu_max_write,
> },
> +#endif
> +#ifdef CONFIG_SCHED_CORE
> + {
> + .name = "tag",
> + .flags = CFTYPE_NOT_ON_ROOT,
> + .read_u64 = cpu_core_tag_read_u64,
> + .write_u64 = cpu_core_tag_write_u64,
> + },
> #endif
> { } /* terminate */
> };
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 42dd620797d7..16fb236eab7b 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -363,6 +363,10 @@ struct cfs_bandwidth {
> struct task_group {
> struct cgroup_subsys_state css;
>
> +#ifdef CONFIG_SCHED_CORE
> + int tagged;
> +#endif
> +
> #ifdef CONFIG_FAIR_GROUP_SCHED
> /* schedulable entities of this group on each CPU */
> struct sched_entity **se;
> --
> 2.17.1
Even though this may not be the definitive interface, here is a quick fix to
remove the tag if it was set and the cgroup is getting removed.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6dc072c..be981e3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7190,6 +7190,18 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
return 0;
}
+static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
+{
+#ifdef CONFIG_SCHED_CORE
+ struct task_group *tg = css_tg(css);
+
+ if (tg->tagged) {
+ sched_core_put();
+ tg->tagged = 0;
+ }
+#endif
+}
+
static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
@@ -7832,6 +7844,7 @@ static struct cftype cpu_files[] = {
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
.css_online = cpu_cgroup_css_online,
+ .css_offline = cpu_cgroup_css_offline,
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.css_extra_stat_show = cpu_extra_stat_show,
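For completeness, here is a minimal sketch of how user space would tag a group
with this interface. The cgroup v1 mount point and the group name ("vm1") are
assumptions; the file name follows from the "tag" cftype above, which is
exposed as cpu.tag:

/* Sketch: set the core-scheduling tag on an existing cpu cgroup. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Assumed v1 layout; adjust the path for your cgroup mount. */
        const char *path = "/sys/fs/cgroup/cpu/vm1/cpu.tag";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror("open cpu.tag");
                return 1;
        }
        if (write(fd, "1", 1) != 1) {   /* write "0" to clear the tag */
                perror("write cpu.tag");
                close(fd);
                return 1;
        }
        close(fd);
        return 0;
}

Per cpu_core_tag_write_u64() above, all tasks in the group then get
(unsigned long)tg as their core_cookie.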
> It's clear now, thanks.
> I don't immediately see how my isolation fix would make your fix stop
> working, will need to check. But I'm busy with other stuffs so it will
> take a while.
>
We have identified the issue and have a fix for it. The issue is the
same as before: a forced-idle sibling has a runnable process which
is starved due to an unconstrained-pick bug.
One sample scenario is like this:
cpu0 and cpu1 are siblings. cpu0 selects an untagged process 'a'
which forces idle on cpu1 even though it had a runnable tagged
process 'b' which is determined by the code to be of lesser priority.
cpu1 can go to deep idle.
During the next schedule in cpu0, the following could happen:
- cpu0 selects swapper as there is nothing to run; hence
prev_cookie is 0 and it does an unconstrained pick of swapper.
So both cpu0 and 1 are idling and cpu1 might be deep idle.
- cpu0 again goes to schedule and selects 'a' which is runnable
now. since prev_cookie is 0, 'a' is an unconstrained pick and
'b' on cpu1 is forgotten again.
This continues with swapper and process 'a' taking turns without
considering sibling until a tagged process becomes runnable in cpu0
and then we don't get into unconstrained pick.
The above is one of a couple of scenarios we have seen, and each has
a slightly different path which ultimately leads to an
unconstrained pick, starving the sibling's runnable thread.
The fix is to mark when a core has gone forced-idle while there was a
runnable process, and then not do an unconstrained pick if a forced
idle happened in the last pick.
I am attaching herewith the patch that fixes the above issue. The patch
is on top of Peter's fix and your correctness fix that we modified for
v2. We have a public repository with all the changes including this
fix as well:
https://github.com/digitalocean/linux-coresched/tree/coresched
We are working on a v3 where the last 3 commits will be squashed to
their related patches in v2. We hope to come up with a v3 next week
with all the suggestions and fixes posted in v2.
Thanks,
Vineeth
---
kernel/sched/core.c | 26 ++++++++++++++++++++++----
kernel/sched/sched.h | 1 +
2 files changed, 23 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 413d46bde17d..3aba0f8fe384 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3653,8 +3653,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *next, *max = NULL;
const struct sched_class *class;
const struct cpumask *smt_mask;
- unsigned long prev_cookie;
int i, j, cpu, occ = 0;
+ bool need_sync = false;
if (!sched_core_enabled(rq))
return __pick_next_task(rq, prev, rf);
@@ -3702,7 +3702,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* 'Fix' this by also increasing @task_seq for every pick.
*/
rq->core->core_task_seq++;
- prev_cookie = rq->core->core_cookie;
+ need_sync = !!rq->core->core_cookie;
/* reset state */
rq->core->core_cookie = 0UL;
@@ -3711,6 +3711,11 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
rq_i->core_pick = NULL;
+ if (rq_i->core_forceidle) {
+ need_sync = true;
+ rq_i->core_forceidle = false;
+ }
+
if (i != cpu)
update_rq_clock(rq_i);
}
@@ -3743,7 +3748,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* If there weren't no cookies; we don't need
* to bother with the other siblings.
*/
- if (i == cpu && !prev_cookie)
+ if (i == cpu && !need_sync)
goto next_class;
continue;
@@ -3753,7 +3758,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* Optimize the 'normal' case where there aren't any
* cookies and we don't need to sync up.
*/
- if (i == cpu && !prev_cookie && !p->core_cookie) {
+ if (i == cpu && !need_sync && !p->core_cookie) {
next = p;
rq->core_pick = NULL;
rq->core->core_cookie = 0UL;
@@ -3816,7 +3821,16 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
occ = 1;
goto again;
+ } else {
+ /*
+ * Once we select a task for a cpu, we
+ * should not be doing an unconstrained
+ * pick because it might starve a task
+ * on a forced idle cpu.
+ */
+ need_sync = true;
}
+
}
}
next_class:;
@@ -3843,6 +3857,9 @@ next_class:;
WARN_ON_ONCE(!rq_i->core_pick);
+ if (is_idle_task(rq_i->core_pick) && rq_i->nr_running)
+ rq->core_forceidle = true;
+
rq_i->core_pick->core_occupation = occ;
if (i == cpu)
@@ -6746,6 +6763,7 @@ void __init sched_init(void)
rq->core_pick = NULL;
rq->core_enabled = 0;
rq->core_tree = RB_ROOT;
+ rq->core_forceidle = false;
rq->core_cookie = 0UL;
#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f38d4149443b..74c29afa0f32 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -964,6 +964,7 @@ struct rq {
unsigned int core_enabled;
unsigned int core_sched_seq;
struct rb_root core_tree;
+ bool core_forceidle;
/* shared state */
unsigned int core_task_seq;
--
2.17.1
> Thanks for pointing this out. I think the ideal fix would be to
> correctly initialize/cleanup the coresched attributes in the cpu
> hotplug code path so that lock could be taken successfully if the
> sibling is offlined/onlined after coresched was enabled. We are
> working on another bug related to the hotplug path and shall introduce
> the fix in v3.
>
A possible fix for handling the runqueues during cpu offline/online
is attached herewith.
Thanks,
Vineeth
---
kernel/sched/core.c | 28 +++++++++++++++++++++++++---
1 file changed, 25 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e8e5f26db052..1a809849a1e7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -253,7 +253,7 @@ static int __sched_core_stopper(void *data)
bool enabled = !!(unsigned long)data;
int cpu;
- for_each_possible_cpu(cpu)
+ for_each_online_cpu(cpu)
cpu_rq(cpu)->core_enabled = enabled;
return 0;
@@ -3764,6 +3764,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct rq *rq_i = cpu_rq(i);
struct task_struct *p;
+ if (cpu_is_offline(i))
+ continue;
+
if (rq_i->core_pick)
continue;
@@ -3866,6 +3869,9 @@ next_class:;
for_each_cpu(i, smt_mask) {
struct rq *rq_i = cpu_rq(i);
+ if (cpu_is_offline(i))
+ continue;
+
WARN_ON_ONCE(!rq_i->core_pick);
rq_i->core_pick->core_occupation = occ;
@@ -6410,8 +6416,14 @@ int sched_cpu_activate(unsigned int cpu)
/*
* When going up, increment the number of cores with SMT present.
*/
- if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) {
static_branch_inc_cpuslocked(&sched_smt_present);
+#ifdef CONFIG_SCHED_CORE
+ if (static_branch_unlikely(&__sched_core_enabled)) {
+ rq->core_enabled = true;
+ }
+#endif
+ }
#endif
set_cpu_active(cpu, true);
@@ -6459,8 +6471,15 @@ int sched_cpu_deactivate(unsigned int cpu)
/*
* When going down, decrement the number of cores with SMT present.
*/
- if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) {
+#ifdef CONFIG_SCHED_CORE
+ struct rq *rq = cpu_rq(cpu);
+ if (static_branch_unlikely(&__sched_core_enabled)) {
+ rq->core_enabled = false;
+ }
+#endif
static_branch_dec_cpuslocked(&sched_smt_present);
+ }
#endif
if (!sched_smp_initialized)
@@ -6537,6 +6556,9 @@ int sched_cpu_dying(unsigned int cpu)
update_max_interval();
nohz_balance_exit_idle(rq);
hrtick_clear(rq);
+#ifdef CONFIG_SCHED_CORE
+ rq->core = NULL;
+#endif
return 0;
}
#endif
--
2.17.1
On Wed, Apr 24, 2019 at 12:18 AM Vineeth Remanan Pillai
<[email protected]> wrote:
>
> From: Peter Zijlstra (Intel) <[email protected]>
>
> Not-Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> kernel/sched/core.c | 38 +++++++++++++++++++++++++++++++++++++-
> 1 file changed, 37 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 0e3c51a1b54a..e8e5f26db052 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -106,6 +106,10 @@ static inline bool __prio_less(struct task_struct *a, struct task_struct *b, boo
>
> int pa = __task_prio(a), pb = __task_prio(b);
>
> + trace_printk("(%s/%d;%d,%Lu,%Lu) ?< (%s/%d;%d,%Lu,%Lu)\n",
> + a->comm, a->pid, pa, a->se.vruntime, a->dl.deadline,
> + b->comm, b->pid, pa, b->se.vruntime, b->dl.deadline);
> +
A minor nitpick: in the added trace_printk(), the b line prints pa where
it should print pb:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3e3162f..68c518c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -93,7 +93,7 @@ static inline bool __prio_less(struct task_struct *a, struct task_struct *b, u64
trace_printk("(%s/%d;%d,%Lu,%Lu) ?< (%s/%d;%d,%Lu,%Lu)\n",
a->comm, a->pid, pa, a->se.vruntime, a->dl.deadline,
- b->comm, b->pid, pa, b->se.vruntime, b->dl.deadline);
+ b->comm, b->pid, pb, b->se.vruntime, b->dl.deadline);
if (-pa < -pb)
return true;
On 2019/4/30 12:42, Ingo Molnar wrote:
>
>>> What's interesting is how in the over-saturated case (the last three
>>> rows: 128, 256 and 512 total threads) coresched-SMT leaves 20-30% CPU
>>> performance on the floor according to the load figures.
>>
Sorry for the delay; I got a chance to obtain some profiling results. Here
is the story on my side. I still used the previous 128/128 test case
(256 threads in total), and focused only on CPU53 (picked at random).
First, mpstat reports cpu utilization:
- baseline is 100%,
- coresched-SMT is 87.51%
Then I traced the sched_switch tracepoint over a 100s sampling period:
- baseline context switch 14083 times, next task idle 0 times
- coresched-SMT context switch 15101 times, next task idle 880 times
So I guess pick_next_task() is the most interesting place; I then dug
into the trace log for the coresched-SMT case:
- CPU53 selected idle task 767 times (matched with the data of sched_switch)
There are 3 branches in which CPU53 selects the idle task in pick_next_task():
- pick pre selected 765 times
- unconstrained pick 1 times
- picked: swapper/53/0 1 times
Where did CPU53's "pick pre selected" idle task come from? I guessed it was
from its sibling CPU1, so I checked CPU1's trace log and found:
- CPU1 helped its sibling CPU53 select idle task 800 times
So for CPU53, the most interesting part occurs in pick_task(), that is:
- the sibling CPU1 helped to select the idle task in pick_task()
Forgive me for pasting this routine here:
=====================================================
+// XXX fairness/fwd progress conditions
+static struct task_struct *
+pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max)
+{
+ struct task_struct *class_pick, *cookie_pick;
+ unsigned long cookie = 0UL;
+
+ /*
+ * We must not rely on rq->core->core_cookie here, because we fail to reset
+ * rq->core->core_cookie on new picks, such that we can detect if we need
+ * to do single vs multi rq task selection.
+ */
+
+ if (max && max->core_cookie) {
+ WARN_ON_ONCE(rq->core->core_cookie != max->core_cookie);
+ cookie = max->core_cookie;
+ }
+
+ class_pick = class->pick_task(rq);
+ if (!cookie)
+ return class_pick;
+
+ cookie_pick = sched_core_find(rq, cookie);
+ if (!class_pick)
+ return cookie_pick;
+
+ /*
+ * If class > max && class > cookie, it is the highest priority task on
+ * the core (so far) and it must be selected, otherwise we must go with
+ * the cookie pick in order to satisfy the constraint.
+ */
+ if (cpu_prio_less(cookie_pick, class_pick) && core_prio_less(max, class_pick))
+ return class_pick;
+
+ return cookie_pick;
+}
=================================================================
And the most related log of the case:
=================================================================
<...>-21553 [001] dN.. 87341.514992: __schedule: cpu(1): selected: gemmbench/21294 ffff888823df8900
<...>-21553 [001] dN.. 87341.514992: __schedule: max: gemmbench/21294 ffff888823df8900
<...>-21553 [001] dN.. 87341.514995: __schedule: (swapper/53/0;140,0,0) ?< (sysbench/21503;140,457178607302,0)
<...>-21553 [001] dN.. 87341.514996: __schedule: (gemmbench/21294;119,219715519947,0) ?< (sysbench/21503;119,457178607302,0)
<...>-21553 [001] dN.. 87341.514996: __schedule: cpu(53): selected: swapper/53/0 0
It says:
- CPU1 selected gemmbench for itself
- and gemmbench was assigned as the max of this core
- then CPU1 ran pick_task() on behalf of CPU53
-- CPU1 used class->pick_task() and selected sysbench for CPU53
-- CPU1 used cookie_pick and selected swapper (the idle task) for CPU53
-- the class_pick (sysbench) unfortunately didn't pass the priority check
- so the idle task was picked in the end (sadly).
So I think, if we want to improve CPU utilization in this scenario, the
straightforward tweak is to pick class_pick when cookie_pick is idle.
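Something like the following, inside the pick_task() routine pasted above
(a hypothetical, untested sketch that deliberately relaxes the cookie
constraint, so take it as illustration only):

	cookie_pick = sched_core_find(rq, cookie);
	if (!class_pick)
		return cookie_pick;

	/*
	 * Hypothetical tweak: never force this sibling idle when its class
	 * has a runnable task. NOT L1TF-safe, since class_pick may carry a
	 * different cookie than max.
	 */
	if (is_idle_task(cookie_pick))
		return class_pick;

	if (cpu_prio_less(cookie_pick, class_pick) && core_prio_less(max, class_pick))
		return class_pick;

	return cookie_pick;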
But I know this is a violation of the design philosophy (avoiding L1TF) of
this proposal.
Does it make sense to add a knob to switch between security and performance?
Welcome any comments!
Thanks,
-Aubrey
On 2019/5/18 8:58, Li, Aubrey wrote:
> On 2019/4/30 12:42, Ingo Molnar wrote:
>>
>>>> What's interesting is how in the over-saturated case (the last three
>>>> rows: 128, 256 and 512 total threads) coresched-SMT leaves 20-30% CPU
>>>> performance on the floor according to the load figures.
>>>
>
> Sorry for a delay, I got a chance to obtain some profiling results. Here
> is the story on my side. I still used the previous testing 128/128 case
> (256 threads totally), and focus on CPU53(randomly pickup) only.
>
> Firstly, mpstat reports cpu utilization,
> - baseline is 100%,
> - coresched-SMT is 87.51%
>
> Then I traced sched_switch trace point, in 100s sampling period,
> - baseline context switch 14083 times, next task idle 0 times
> - coresched-SMT context switch 15101 times, next task idle 880 times
>
> So I guess pick_next_task() is mostly the interesting place, then I
> dig into the trace log on coresched-SMT case:
> - CPU53 selected idle task 767 times (matched with the data of sched_switch)
>
> There are 3 branches of CPU53 selecting idle task in pick_next_task():
> - pick pre selected 765 times
> - unconstrained pick 1 times
> - picked: swapper/53/0 1 times
>
> Where CPU53's "pick pre selected idle task" from? I guess its from its
> brother CPU1, so I checked CPU1's trace log and found:
> - CPU1 helped its sibling CPU53 select idle task 800 times
>
> So for CPU53, the most interesting part occurs in pick_task(), that is:
> -The sibling CPU1 helped to select idle task in pick_task()
>
> Forgive me to paste this routine() here:
> =====================================================
> +// XXX fairness/fwd progress conditions
> +static struct task_struct *
> +pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max)
> +{
> + struct task_struct *class_pick, *cookie_pick;
> + unsigned long cookie = 0UL;
> +
> + /*
> + * We must not rely on rq->core->core_cookie here, because we fail to reset
> + * rq->core->core_cookie on new picks, such that we can detect if we need
> + * to do single vs multi rq task selection.
> + */
> +
> + if (max && max->core_cookie) {
> + WARN_ON_ONCE(rq->core->core_cookie != max->core_cookie);
> + cookie = max->core_cookie;
> + }
> +
> + class_pick = class->pick_task(rq);
> + if (!cookie)
> + return class_pick;
> +
> + cookie_pick = sched_core_find(rq, cookie);
> + if (!class_pick)
> + return cookie_pick;
> +
> + /*
> + * If class > max && class > cookie, it is the highest priority task on
> + * the core (so far) and it must be selected, otherwise we must go with
> + * the cookie pick in order to satisfy the constraint.
> + */
> + if (cpu_prio_less(cookie_pick, class_pick) && core_prio_less(max, class_pick))
> + return class_pick;
> +
> + return cookie_pick;
> +}
> =================================================================
>
> And the most related log of the case:
> =================================================================
> <...>-21553 [001] dN.. 87341.514992: __schedule: cpu(1): selected: gemmbench/21294 ffff888823df8900
> <...>-21553 [001] dN.. 87341.514992: __schedule: max: gemmbench/21294 ffff888823df8900
> <...>-21553 [001] dN.. 87341.514995: __schedule: (swapper/53/0;140,0,0) ?< (sysbench/21503;140,457178607302,0)
> <...>-21553 [001] dN.. 87341.514996: __schedule: (gemmbench/21294;119,219715519947,0) ?< (sysbench/21503;119,457178607302,0)
> <...>-21553 [001] dN.. 87341.514996: __schedule: cpu(53): selected: swapper/53/0 0
>
> It said,
> - CPU1 selected gemmbench for itself
> - and gemmbench was assigned to max of this core
> - then CPU1 helped CPU53 to pick_task()
> -- CPU1 used class->pick_task(), selected sysbench for CPU53
> -- CPU1 used cookie_pick, selected swapper(idle task) for CPU53
> -- the class_pick(sysbench) unfortunately didn't pass the priority check
> - idle task picked up at the end(sadly).
>
> So, I think if we want to improve CPU utilization under this scenario,
> the straightforward tweak is picking up class_pick if cookie_pick is idle.
Another quick thought: in CPU53's own pick_next_task() path, could it give up
the pre-selected task (chosen by CPU1) if that pre-selected task is idle?
> But I know, this is a violation of the design philosophy(avoid L1TF) of
> this proposal.
>
> Does it make sense to add a knob to switch security/performance?
> Welcome any comments!
>
> Thanks,
> -Aubrey
>
On Wed, Apr 24, 2019 at 12:18 AM Vineeth Remanan Pillai
<[email protected]> wrote:
>
> From: Peter Zijlstra (Intel) <[email protected]>
>
> Instead of only selecting a local task, select a task for all SMT
> siblings for every reschedule on the core (irrespective which logical
> CPU does the reschedule).
>
> NOTE: there is still potential for siblings rivalry.
> NOTE: this is far too complicated; but thus far I've failed to
> simplify it further.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> kernel/sched/core.c | 222 ++++++++++++++++++++++++++++++++++++++++++-
> kernel/sched/sched.h | 5 +-
> 2 files changed, 224 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index e5bdc1c4d8d7..9e6e90c6f9b9 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3574,7 +3574,7 @@ static inline void schedule_debug(struct task_struct *prev)
> * Pick up the highest-prio task:
> */
> static inline struct task_struct *
> -pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> +__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> {
> const struct sched_class *class;
> struct task_struct *p;
> @@ -3619,6 +3619,220 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> BUG();
> }
>
> +#ifdef CONFIG_SCHED_CORE
> +
> +static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
> +{
> + if (is_idle_task(a) || is_idle_task(b))
> + return true;
> +
> + return a->core_cookie == b->core_cookie;
> +}
> +
> +// XXX fairness/fwd progress conditions
> +static struct task_struct *
> +pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max)
> +{
> + struct task_struct *class_pick, *cookie_pick;
> + unsigned long cookie = 0UL;
> +
> + /*
> + * We must not rely on rq->core->core_cookie here, because we fail to reset
> + * rq->core->core_cookie on new picks, such that we can detect if we need
> + * to do single vs multi rq task selection.
> + */
> +
> + if (max && max->core_cookie) {
> + WARN_ON_ONCE(rq->core->core_cookie != max->core_cookie);
> + cookie = max->core_cookie;
> + }
> +
> + class_pick = class->pick_task(rq);
> + if (!cookie)
> + return class_pick;
> +
> + cookie_pick = sched_core_find(rq, cookie);
> + if (!class_pick)
> + return cookie_pick;
> +
> + /*
> + * If class > max && class > cookie, it is the highest priority task on
> + * the core (so far) and it must be selected, otherwise we must go with
> + * the cookie pick in order to satisfy the constraint.
> + */
> + if (cpu_prio_less(cookie_pick, class_pick) && core_prio_less(max, class_pick))
> + return class_pick;
> +
> + return cookie_pick;
> +}
> +
> +static struct task_struct *
> +pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> +{
> + struct task_struct *next, *max = NULL;
> + const struct sched_class *class;
> + const struct cpumask *smt_mask;
> + int i, j, cpu;
> +
> + if (!sched_core_enabled(rq))
> + return __pick_next_task(rq, prev, rf);
> +
> + /*
> + * If there were no {en,de}queues since we picked (IOW, the task
> + * pointers are all still valid), and we haven't scheduled the last
> + * pick yet, do so now.
> + */
> + if (rq->core->core_pick_seq == rq->core->core_task_seq &&
> + rq->core->core_pick_seq != rq->core_sched_seq) {
> + WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
> +
> + next = rq->core_pick;
> + if (next != prev) {
> + put_prev_task(rq, prev);
> + set_next_task(rq, next);
> + }
> + return next;
> + }
> +
The following patch improved my test cases.
Welcome any comments.
Thanks,
-Aubrey
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3e3162f..86031f4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3685,10 +3685,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/*
* If there were no {en,de}queues since we picked (IOW, the task
* pointers are all still valid), and we haven't scheduled the last
- * pick yet, do so now.
+ * pick yet, do so now. If the last pick is idle task, we abandon
+ * last pick and try to pick up task this time.
*/
if (rq->core->core_pick_seq == rq->core->core_task_seq &&
- rq->core->core_pick_seq != rq->core_sched_seq) {
+ rq->core->core_pick_seq != rq->core_sched_seq &&
+ !is_idle_task(rq->core_pick)) {
WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
next = rq->core_pick;
On Sat, May 18, 2019 at 11:37:56PM +0800 Aubrey Li wrote:
> On Wed, Apr 24, 2019 at 12:18 AM Vineeth Remanan Pillai
> <[email protected]> wrote:
> >
> > From: Peter Zijlstra (Intel) <[email protected]>
> >
> > Instead of only selecting a local task, select a task for all SMT
> > siblings for every reschedule on the core (irrespective which logical
> > CPU does the reschedule).
> >
> > NOTE: there is still potential for siblings rivalry.
> > NOTE: this is far too complicated; but thus far I've failed to
> > simplify it further.
> >
> > Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> > ---
> > kernel/sched/core.c | 222 ++++++++++++++++++++++++++++++++++++++++++-
> > kernel/sched/sched.h | 5 +-
> > 2 files changed, 224 insertions(+), 3 deletions(-)
> >
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index e5bdc1c4d8d7..9e6e90c6f9b9 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -3574,7 +3574,7 @@ static inline void schedule_debug(struct task_struct *prev)
> > * Pick up the highest-prio task:
> > */
> > static inline struct task_struct *
> > -pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > +__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > {
> > const struct sched_class *class;
> > struct task_struct *p;
> > @@ -3619,6 +3619,220 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > BUG();
> > }
> >
> > +#ifdef CONFIG_SCHED_CORE
> > +
> > +static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
> > +{
> > + if (is_idle_task(a) || is_idle_task(b))
> > + return true;
> > +
> > + return a->core_cookie == b->core_cookie;
> > +}
> > +
> > +// XXX fairness/fwd progress conditions
> > +static struct task_struct *
> > +pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max)
> > +{
> > + struct task_struct *class_pick, *cookie_pick;
> > + unsigned long cookie = 0UL;
> > +
> > + /*
> > + * We must not rely on rq->core->core_cookie here, because we fail to reset
> > + * rq->core->core_cookie on new picks, such that we can detect if we need
> > + * to do single vs multi rq task selection.
> > + */
> > +
> > + if (max && max->core_cookie) {
> > + WARN_ON_ONCE(rq->core->core_cookie != max->core_cookie);
> > + cookie = max->core_cookie;
> > + }
> > +
> > + class_pick = class->pick_task(rq);
> > + if (!cookie)
> > + return class_pick;
> > +
> > + cookie_pick = sched_core_find(rq, cookie);
> > + if (!class_pick)
> > + return cookie_pick;
> > +
> > + /*
> > + * If class > max && class > cookie, it is the highest priority task on
> > + * the core (so far) and it must be selected, otherwise we must go with
> > + * the cookie pick in order to satisfy the constraint.
> > + */
> > + if (cpu_prio_less(cookie_pick, class_pick) && core_prio_less(max, class_pick))
> > + return class_pick;
> > +
> > + return cookie_pick;
> > +}
> > +
> > +static struct task_struct *
> > +pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > +{
> > + struct task_struct *next, *max = NULL;
> > + const struct sched_class *class;
> > + const struct cpumask *smt_mask;
> > + int i, j, cpu;
> > +
> > + if (!sched_core_enabled(rq))
> > + return __pick_next_task(rq, prev, rf);
> > +
> > + /*
> > + * If there were no {en,de}queues since we picked (IOW, the task
> > + * pointers are all still valid), and we haven't scheduled the last
> > + * pick yet, do so now.
> > + */
> > + if (rq->core->core_pick_seq == rq->core->core_task_seq &&
> > + rq->core->core_pick_seq != rq->core_sched_seq) {
> > + WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
> > +
> > + next = rq->core_pick;
> > + if (next != prev) {
> > + put_prev_task(rq, prev);
> > + set_next_task(rq, next);
> > + }
> > + return next;
> > + }
> > +
>
> The following patch improved my test cases.
> Welcome any comments.
>
This is certainly better than violating the point of the core scheduler :)
If I'm understanding this right, what will happen in this case is that instead
of using the idle process selected by the sibling, we do the core scheduling
again. This may start with a newidle_balance, which might bring over something
to run that matches what we want to put on the sibling. If that works then I
can see this helping.
But I'd be a little concerned that we could end up thrashing. Once we do core
scheduling again here, we'd force the sibling to resched, and if we got a different
result that "helped" it pick idle, we'd go around again.
I think inherent in the concept of core scheduling (barring a perfectly aligned set
of jobs) is some extra idle time on siblings.
Cheers,
Phil
> Thanks,
> -Aubrey
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 3e3162f..86031f4 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3685,10 +3685,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> /*
> * If there were no {en,de}queues since we picked (IOW, the task
> * pointers are all still valid), and we haven't scheduled the last
> - * pick yet, do so now.
> + * pick yet, do so now. If the last pick is idle task, we abandon
> + * last pick and try to pick up task this time.
> */
> if (rq->core->core_pick_seq == rq->core->core_task_seq &&
> - rq->core->core_pick_seq != rq->core_sched_seq) {
> + rq->core->core_pick_seq != rq->core_sched_seq &&
> + !is_idle_task(rq->core_pick)) {
> WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
>
> next = rq->core_pick;
--
> > The following patch improved my test cases.
> > Welcome any comments.
> >
>
> This is certainly better than violating the point of the core scheduler :)
>
> If I'm understanding this right what will happen in this case is instead
> of using the idle process selected by the sibling we do the core scheduling
> again. This may start with a newidle_balance which might bring over something
> to run that matches what we want to put on the sibling. If that works then I
> can see this helping.
>
> But I'd be a little concerned that we could end up thrashing. Once we do core
> scheduling again here we'd force the sibling to resched and if we got a different
> result which "helped" him pick idle we'd go around again.
>
> I think inherent in the concept of core scheduling (barring a perfectly aligned set
> of jobs) is some extra idle time on siblings.
>
I was also thinking along the same lines. This change basically always
tries to avoid idle, thereby constantly interrupting the sibling.
While this change might benefit a very small subset of workloads, it
might introduce thrashing more often.
One other reason you might be seeing a performance improvement is the
bugs that caused both siblings to go idle even though there were
runnable and compatible threads in the queue. Most of those issues are
fixed based on all the feedback received in v2. We have a github repo
with the pre-v3 changes here:
https://github.com/digitalocean/linux-coresched/tree/coresched
Please try this and see how it compares with the vanilla v2. I think it's
time for a v3 now, and we shall be posting it soon after some more
testing and benchmarking.
Thanks,
On Mon, May 20, 2019 at 10:04 PM Vineeth Pillai
<[email protected]> wrote:
>
> > > The following patch improved my test cases.
> > > Welcome any comments.
> > >
> >
> > This is certainly better than violating the point of the core scheduler :)
> >
> > If I'm understanding this right what will happen in this case is instead
> > of using the idle process selected by the sibling we do the core scheduling
> > again. This may start with a newidle_balance which might bring over something
> > to run that matches what we want to put on the sibling. If that works then I
> > can see this helping.
> >
> > But I'd be a little concerned that we could end up thrashing. Once we do core
> > scheduling again here we'd force the sibling to resched and if we got a different
> > result which "helped" him pick idle we'd go around again.
Thrashing means more IPIs, right? That's not what I observed: because the idle
task has less chance to get onto the CPU, rescheduling is reduced accordingly.
> > I think inherent in the concept of core scheduling (barring a perfectly aligned set
> > of jobs) is some extra idle time on siblings.
Yeah, I understand and agree with this, but 10-15% idle time on an overloaded
system makes me try to figure out how this could happen and whether we
can improve it.
> >
> >
> I was also thinking along the same lines. This change basically always
> tries to avoid idle and there by constantly interrupting the sibling.
> While this change might benefit a very small subset of workloads, it
> might introduce thrashing more often.
Thrashing is not observed in the overloaded case but may happen in a
light-load or mid-load case; I need more investigation.
>
> One other reason you might be seeing performance improvement is
> because of the bugs that caused both siblings to go idle even though
> there are runnable and compatible threads in the queue. Most of the
> issues are fixed based on all the feedback received in v2. We have a
> github repo with the pre v3 changes here:
> https://github.com/digitalocean/linux-coresched/tree/coresched
Okay, thanks. It looks like the core functions pick_next_task() and pick_task()
have a lot of changes compared to v2. Need more brain power...
>
> Please try this and see how it compares with the vanilla v2. I think its
> time for a v3 now and we shall be posting it soon after some more
> testing and benchmarking.
Are there any potential changes between pre-v3 and v3? I prefer working
based on v3 so that everyone is on the same page.
Thanks,
-Aubrey
> > Please try this and see how it compares with the vanilla v2. I think its
> > time for a v3 now and we shall be posting it soon after some more
> > testing and benchmarking.
>
> Is there any potential change between pre v3 and v3? I prefer working
> based on v3 so that everyone are on the same page.
>
Makes sense, testing can wait until v3 is posted. I don't expect many
changes from the above, but it's better to test on the posted v3.
Thanks,
> > I do not have a strong opinion on both. Probably a better approach
> > would be to replace both cpu_prio_less/core_prio_less with prio_less
> > which takes a third argument 'bool on_same_rq'?
> >
>
> Fwiw, I find the two names easier to read than a boolean flag. Could still
> be wrapped to a single implementation I suppose.
>
> An enum to control cpu or core would be more readable, but probably overkill...
>
I think we can in fact remove the boolean altogether and still have a single
function to compare priority: if the tasks are on the same cpu, use the task's
vruntime directly; otherwise, do the normalization.
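As a worked example (with made-up numbers): suppose a->se.vruntime = 100 with
task_cfs_rq(a)->min_vruntime = 90, and b->se.vruntime = 500 with
task_cfs_rq(b)->min_vruntime = 480. Normalizing b into a's timeline gives
500 - 480 + 90 = 110; since 100 < 110, a is treated as the higher-priority
task, which matches comparing the per-cpu lags directly (10 vs 20).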
Thanks,
Vineeth
---
-static inline bool __prio_less(struct task_struct *a, struct task_struct *b, bool core_cmp)
+static inline bool prio_less(struct task_struct *a, struct task_struct *b)
{
- u64 vruntime;
int pa = __task_prio(a), pb = __task_prio(b);
@@ -119,25 +105,21 @@ static inline bool __prio_less(struct task_struct *a, struct task_struct *b, boo
if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
return !dl_time_before(a->dl.deadline, b->dl.deadline);
- vruntime = b->se.vruntime;
- if (core_cmp) {
- vruntime -= task_cfs_rq(b)->min_vruntime;
- vruntime += task_cfs_rq(a)->min_vruntime;
- }
- if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
- return !((s64)(a->se.vruntime - vruntime) <= 0);
+ if (pa == MAX_RT_PRIO + MAX_NICE) { /* fair */
+ u64 vruntime = b->se.vruntime;
- return false;
-}
+ /*
+ * Normalize the vruntime if tasks are in different cpus.
+ */
+ if (task_cpu(a) != task_cpu(b)) {
+ vruntime -= task_cfs_rq(b)->min_vruntime;
+ vruntime += task_cfs_rq(a)->min_vruntime;
+ }
-static inline bool cpu_prio_less(struct task_struct *a, struct task_struct *b)
-{
- return __prio_less(a, b, false);
-}
+ return !((s64)(a->se.vruntime - vruntime) <= 0);
+ }
-static inline bool core_prio_less(struct task_struct *a, struct task_struct *b)
-{
- return __prio_less(a, b, true);
+ return false;
}
static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)
@@ -149,7 +131,7 @@ static inline bool __sched_core_less(struct task_struct *a, struct task_struct *
return false;
/* flip prio, so high prio is leftmost */
- if (cpu_prio_less(b, a))
+ if (prio_less(b, a))
return true;
return false;
@@ -3621,7 +3603,7 @@ pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *ma
* higher priority than max.
*/
if (max && class_pick->core_cookie &&
- core_prio_less(class_pick, max))
+ prio_less(class_pick, max))
return idle_sched_class.pick_task(rq);
return class_pick;
@@ -3640,8 +3622,8 @@ pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *ma
* the core (so far) and it must be selected, otherwise we must go with
* the cookie pick in order to satisfy the constraint.
*/
- if (cpu_prio_less(cookie_pick, class_pick) &&
- (!max || core_prio_less(max, class_pick)))
+ if (prio_less(cookie_pick, class_pick) &&
+ (!max || prio_less(max, class_pick)))
return class_pick;
return cookie_pick;