Linus,
Please pull the latest sched-fixes-for-linus git tree from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git sched-fixes-for-linus
Thanks,
Ingo
------------------>
Li Zefan (1):
sched: partly revert "sched debug: remove NULL checking in print_cfs_rt_rq()"
Rusty Russell (1):
cpumask: fix CONFIG_NUMA=y sched.c
Steven Noonan (1):
kernel/sched.c: add missing forward declaration for 'double_rq_lock'
kernel/sched.c | 13 ++++++++-----
kernel/sched_debug.c | 21 +++++++++++++++++----
2 files changed, 25 insertions(+), 9 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index deb5ac8..8be2c13 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -125,6 +125,9 @@ DEFINE_TRACE(sched_switch);
DEFINE_TRACE(sched_migrate_task);
#ifdef CONFIG_SMP
+
+static void double_rq_lock(struct rq *rq1, struct rq *rq2);
+
/*
* Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
* Since cpu_power is a 'constant', we can use a reciprocal divide.
@@ -7282,10 +7285,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
* groups, so roll our own. Now each node has its own list of groups which
* gets dynamically allocated.
*/
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
+static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
static struct sched_group ***sched_group_nodes_bycpu;
-static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
@@ -7560,7 +7563,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
#ifdef CONFIG_NUMA
if (cpumask_weight(cpu_map) >
SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
- sd = &per_cpu(allnodes_domains, i);
+ sd = &per_cpu(allnodes_domains, i).sd;
SD_INIT(sd, ALLNODES);
set_domain_attribute(sd, attr);
cpumask_copy(sched_domain_span(sd), cpu_map);
@@ -7570,7 +7573,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
} else
p = NULL;
- sd = &per_cpu(node_domains, i);
+ sd = &per_cpu(node_domains, i).sd;
SD_INIT(sd, NODE);
set_domain_attribute(sd, attr);
sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
@@ -7688,7 +7691,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
for_each_cpu(j, nodemask) {
struct sched_domain *sd;
- sd = &per_cpu(node_domains, j);
+ sd = &per_cpu(node_domains, j).sd;
sd->groups = sg;
}
sg->__cpu_power = 0;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 4293cfa..16eeba4 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -145,6 +145,19 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
read_unlock_irqrestore(&tasklist_lock, flags);
}
+#if defined(CONFIG_CGROUP_SCHED) && \
+ (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
+static void task_group_path(struct task_group *tg, char *buf, int buflen)
+{
+ /* may be NULL if the underlying cgroup isn't fully-created yet */
+ if (!tg->css.cgroup) {
+ buf[0] = '\0';
+ return;
+ }
+ cgroup_path(tg->css.cgroup, buf, buflen);
+}
+#endif
+
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
@@ -154,10 +167,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
unsigned long flags;
#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
- char path[128] = "";
+ char path[128];
struct task_group *tg = cfs_rq->tg;
- cgroup_path(tg->css.cgroup, path, sizeof(path));
+ task_group_path(tg, path, sizeof(path));
SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
@@ -208,10 +221,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
{
#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
- char path[128] = "";
+ char path[128];
struct task_group *tg = rt_rq->tg;
- cgroup_path(tg->css.cgroup, path, sizeof(path));
+ task_group_path(tg, path, sizeof(path));
SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
#else
On Sun, 11 Jan 2009 15:43:05 +0100
Ingo Molnar <[email protected]> wrote:
> Please pull the latest sched-fixes-for-linus git tree
In http://bugzilla.kernel.org/show_bug.cgi?id=12309 the reporters have
identified what appears to be a sched-related performance regression.
A fairly long-term one - post-2.6.18, perhaps.
Testcase code has been added today. Could someone please take a look
sometime?
On Wed, 2009-01-14 at 12:15 -0800, Andrew Morton wrote:
> On Sun, 11 Jan 2009 15:43:05 +0100
> Ingo Molnar <[email protected]> wrote:
>
> > Please pull the latest sched-fixes-for-linus git tree
>
> In http://bugzilla.kernel.org/show_bug.cgi?id=12309 the reporters have
> identified what appears to be a sched-related performance regression.
> A fairly long-term one - post-2.6.18, perhaps.
>
> Testcase code has been added today. Could someone please take a look
> sometime?
There appear to be two different bug reports in there. One about iowait,
and one I'm not quite sure what it is about.
The second thing shows some numbers and a test case, but I fail to see
what the problem is with it.
On Wed, 14 Jan 2009 21:24:07 +0100 Peter Zijlstra <[email protected]> wrote:
> On Wed, 2009-01-14 at 12:15 -0800, Andrew Morton wrote:
> > On Sun, 11 Jan 2009 15:43:05 +0100
> > Ingo Molnar <[email protected]> wrote:
> >
> > > Please pull the latest sched-fixes-for-linus git tree
> >
> > In http://bugzilla.kernel.org/show_bug.cgi?id=12309 the reporters have
> > identified what appears to be a sched-related performance regression.
> > A fairly long-term one - post-2.6.18, perhaps.
> >
> > Testcase code has been added today. Could someone please take a look
> > sometime?
>
> There appear to be two different bug reports in there. One about iowait,
> and one I'm not quite sure what it is about.
>
> The second thing shows some numbers and a test case, but I fail to see
> what the problem is with it.
I had no problem seeing the problem: a gigantic performance regression
in two CPU-scheduler intensive workloads.
I can see some other problems, too.
On Fri, 2009-01-16 at 20:40 -0800, Andrew Morton wrote:
> On Wed, 14 Jan 2009 21:24:07 +0100 Peter Zijlstra <[email protected]> wrote:
>
> > On Wed, 2009-01-14 at 12:15 -0800, Andrew Morton wrote:
> > > On Sun, 11 Jan 2009 15:43:05 +0100
> > > Ingo Molnar <[email protected]> wrote:
> > >
> > > > Please pull the latest sched-fixes-for-linus git tree
> > >
> > > In http://bugzilla.kernel.org/show_bug.cgi?id=12309 the reporters have
> > > identified what appears to be a sched-related performance regression.
> > > A fairly long-term one - post-2.6.18, perhaps.
> > >
> > > Testcase code has been added today. Could someone please take a look
> > > sometime?
> >
> > There appear to be two different bug reports in there. One about iowait,
> > and one I'm not quite sure what it is about.
> >
> > The second thing shows some numbers and a test case, but I fail to see
> > what the problem is with it.
>
> I had no problem seeing the problem: a gigantic performance regression
> in two CPU-scheduler intensive workloads.
I can reproduce a very bad latency hit, investigating.
-Mike
On Fri, 2009-01-16 at 20:40 -0800, Andrew Morton wrote:
> On Wed, 14 Jan 2009 21:24:07 +0100 Peter Zijlstra <[email protected]> wrote:
>
> > On Wed, 2009-01-14 at 12:15 -0800, Andrew Morton wrote:
> > > On Sun, 11 Jan 2009 15:43:05 +0100
> > > Ingo Molnar <[email protected]> wrote:
> > >
> > > > Please pull the latest sched-fixes-for-linus git tree
> > >
> > > In http://bugzilla.kernel.org/show_bug.cgi?id=12309 the reporters have
> > > identified what appears to be a sched-related performance regression.
> > > A fairly long-term one - post-2.6.18, perhaps.
> > >
> > > Testcase code has been added today. Could someone please take a look
> > > sometime?
> >
> > There appear to be two different bug reports in there. One about iowait,
> > and one I'm not quite sure what it is about.
> >
> > The second thing shows some numbers and a test case, but I fail to see
> > what the problem is with it.
>
> I had no problem seeing the problem: a gigantic performance regression
> in two CPU-scheduler intensive workloads.
Right, but it wasn't clearly stated what the issue was, and there seem
to be two with that testcase:
1) the total running time increases
2) it causes some severe latency spikes
Now, why it was stuck in with that iowait thing I have no clue, as it
doesn't do anything with iowait; I ran the thing (well, both things) and
iowait is 0%.
Of course we'll try and fix these issues -- but as far as I'm concerned
it's utterly unrelated to the iowait thing.
> I can see some other problems, too.
The universe is full of problems; mixing them all up doesn't get us
anywhere.
On Sat, 2009-01-17 at 07:29 +0100, Mike Galbraith wrote:
> On Fri, 2009-01-16 at 20:40 -0800, Andrew Morton wrote:
> > On Wed, 14 Jan 2009 21:24:07 +0100 Peter Zijlstra <[email protected]> wrote:
> >
> > > On Wed, 2009-01-14 at 12:15 -0800, Andrew Morton wrote:
> > > > On Sun, 11 Jan 2009 15:43:05 +0100
> > > > Ingo Molnar <[email protected]> wrote:
> > > >
> > > > > Please pull the latest sched-fixes-for-linus git tree
> > > >
> > > > In http://bugzilla.kernel.org/show_bug.cgi?id=12309 the reporters have
> > > > identified what appears to be a sched-related performance regression.
> > > > A fairly long-term one - post-2.6.18, perhaps.
> > > >
> > > > Testcase code has been added today. Could someone please take a look
> > > > sometime?
> > >
> > > There appear to be two different bug reports in there. One about iowait,
> > > and one I'm not quite sure what it is about.
> > >
> > > The second thing shows some numbers and a test case, but I fail to see
> > > what the problem is with it.
> >
> > I had no problem seeing the problem: a gigantic performance regression
> > in two CPU-scheduler intensive workloads.
>
> I can reproduce a very bad latency hit, investigating.
Dunno about the IO bits, but..
The problem with the C++ testcases seems to be wake_up_all() plunking a
genuine thundering herd onto runqueues. The sleeper fairness logic
places the entire herd left of min_vruntime, meaning N*sched_latency
pain for the poor sods who are setting the runqueue pace.
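(A quick standalone illustration of that arithmetic -- numbers are
illustrative, not measurements -- using the 20 ms sysctl_sched_latency
default that appears elsewhere in this thread:)

/*
 * Each woken sleeper can be credited up to one sched_latency of
 * vruntime, so a herd of N sleepers dropped left of min_vruntime can
 * delay the task pacing the runqueue by roughly N * sched_latency.
 */
#include <stdio.h>

int main(void)
{
	const double sched_latency_ms = 20.0;	/* sysctl_sched_latency default */
	const int herd = 200;			/* e.g. 100 ping-pong pairs */

	printf("worst-case pacing delay ~ %.0f ms\n", herd * sched_latency_ms);
	return 0;
}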
You _could_ scale according to runqueue load, but that has a globally
negative effect on sleeper fairness, important for many loads. The
below is my diagnostic machete in action looking for an alternative.
Notice how I carefully removed lkml before showing this masterpiece.
Oh, what the heck, I much prefer offline for diagnostic tinkerings, but
full disclosure and all that... delicate tummies beware!
This doesn't make interactivity wonderful of course, fair schedulers
don't do wonderful under heavy load, but this did prevent seizure.
(it's likely somewhat broken on top of being butt ugly.. diag hack;)
Suggestions welcome.
diff --git a/include/linux/wait.h b/include/linux/wait.h
index ef609f8..e20e557 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -143,6 +143,12 @@ int out_of_line_wait_on_bit(void *, int, int (*)(void *), unsigned);
int out_of_line_wait_on_bit_lock(void *, int, int (*)(void *), unsigned);
wait_queue_head_t *bit_waitqueue(void *, int);
+#define WAKE_SYNC 1
+#define WAKE_FAIR 2
+#define WAKE_BUDDY 4
+#define WAKE_PREEMPT 8
+#define WAKE_NORMAL (WAKE_FAIR | WAKE_BUDDY | WAKE_PREEMPT)
+
#define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL)
#define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL)
#define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL)
diff --git a/kernel/sched.c b/kernel/sched.c
index 52bbf1c..ee52d89 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2258,13 +2258,13 @@ static int sched_balance_self(int cpu, int flag)
*/
static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
{
- int cpu, orig_cpu, this_cpu, success = 0;
+ int cpu, orig_cpu, this_cpu, success = 0, herd = sync & WAKE_FAIR;
unsigned long flags;
long old_state;
struct rq *rq;
if (!sched_feat(SYNC_WAKEUPS))
- sync = 0;
+ sync &= ~WAKE_SYNC;
#ifdef CONFIG_SMP
if (sched_feat(LB_WAKEUP_UPDATE)) {
@@ -2334,7 +2334,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
out_activate:
#endif /* CONFIG_SMP */
schedstat_inc(p, se.nr_wakeups);
- if (sync)
+ if (sync & WAKE_SYNC)
schedstat_inc(p, se.nr_wakeups_sync);
if (orig_cpu != cpu)
schedstat_inc(p, se.nr_wakeups_migrate);
@@ -2342,7 +2342,7 @@ out_activate:
schedstat_inc(p, se.nr_wakeups_local);
else
schedstat_inc(p, se.nr_wakeups_remote);
- activate_task(rq, p, 1);
+ activate_task(rq, p, 1 + !herd);
success = 1;
out_running:
@@ -4692,12 +4692,19 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
{
wait_queue_t *curr, *next;
+
+ if (!nr_exclusive)
+ sync &= ~(WAKE_FAIR | WAKE_BUDDY);
+
list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
unsigned flags = curr->flags;
if (curr->func(curr, mode, sync, key) &&
(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
+
+ /* Only one preemptive wakeup per herd. */
+ sync &= ~WAKE_PREEMPT;
}
}
@@ -4714,7 +4721,7 @@ void __wake_up(wait_queue_head_t *q, unsigned int mode,
unsigned long flags;
spin_lock_irqsave(&q->lock, flags);
- __wake_up_common(q, mode, nr_exclusive, 0, key);
+ __wake_up_common(q, mode, nr_exclusive, WAKE_NORMAL, key);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(__wake_up);
@@ -4724,7 +4731,7 @@ EXPORT_SYMBOL(__wake_up);
*/
void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
{
- __wake_up_common(q, mode, 1, 0, NULL);
+ __wake_up_common(q, mode, 1, WAKE_NORMAL, NULL);
}
/**
@@ -4744,13 +4751,13 @@ void
__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
{
unsigned long flags;
- int sync = 1;
+ int sync = WAKE_NORMAL | WAKE_SYNC;
if (unlikely(!q))
return;
if (unlikely(!nr_exclusive))
- sync = 0;
+ sync &= ~WAKE_SYNC;
spin_lock_irqsave(&q->lock, flags);
__wake_up_common(q, mode, nr_exclusive, sync, NULL);
@@ -4773,7 +4780,7 @@ void complete(struct completion *x)
spin_lock_irqsave(&x->wait.lock, flags);
x->done++;
- __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
+ __wake_up_common(&x->wait, TASK_NORMAL, 1, WAKE_NORMAL, NULL);
spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete);
@@ -4790,7 +4797,7 @@ void complete_all(struct completion *x)
spin_lock_irqsave(&x->wait.lock, flags);
x->done += UINT_MAX/2;
- __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
+ __wake_up_common(&x->wait, TASK_NORMAL, 0, WAKE_NORMAL, NULL);
spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete_all);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5cc1c16..3e40d7e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -661,7 +661,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
static void
-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
{
u64 vruntime = cfs_rq->min_vruntime;
@@ -671,12 +671,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
* little, place the new task so that it fits in the slot that
* stays open at the end.
*/
- if (initial && sched_feat(START_DEBIT))
+ if (!wakeup && sched_feat(START_DEBIT))
vruntime += sched_vslice(cfs_rq, se);
- if (!initial) {
+ if (wakeup) {
/* sleeps upto a single latency don't count. */
- if (sched_feat(NEW_FAIR_SLEEPERS)) {
+ if (sched_feat(NEW_FAIR_SLEEPERS && wakeup == 1)) {
unsigned long thresh = sysctl_sched_latency;
/*
@@ -709,7 +709,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
account_entity_enqueue(cfs_rq, se);
if (wakeup) {
- place_entity(cfs_rq, se, 0);
+ place_entity(cfs_rq, se, wakeup);
enqueue_sleeper(cfs_rq, se);
}
@@ -1189,6 +1189,9 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
return 0;
+ /* Clear flags unused by this function. */
+ sync = sync & WAKE_SYNC;
+
if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
p->se.avg_overlap > sysctl_sched_migration_cost))
sync = 0;
@@ -1392,9 +1395,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
* Also, during early boot the idle thread is in the fair class, for
* obvious reasons its a bad idea to schedule back to the idle thread.
*/
- if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
+ if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle) &&
+ sync & WAKE_BUDDY)
set_last_buddy(se);
- set_next_buddy(pse);
+ if (sync & WAKE_BUDDY)
+ set_next_buddy(pse);
/*
* We can come here with TIF_NEED_RESCHED already set from new task
@@ -1416,10 +1421,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
return;
}
- if (!sched_feat(WAKEUP_PREEMPT))
+ if (!sched_feat(WAKEUP_PREEMPT) || !sync & WAKE_PREEMPT)
return;
- if (sched_feat(WAKEUP_OVERLAP) && (sync ||
+ if (sched_feat(WAKEUP_OVERLAP) && (sync & WAKE_SYNC||
(se->avg_overlap < sysctl_sched_migration_cost &&
pse->avg_overlap < sysctl_sched_migration_cost))) {
resched_task(curr);
@@ -1650,7 +1655,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
sched_info_queued(p);
update_curr(cfs_rq);
- place_entity(cfs_rq, se, 1);
+ place_entity(cfs_rq, se, 0);
/* 'curr' will be NULL if the child belongs to a different group */
if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
@@ -1721,7 +1726,7 @@ static void moved_group_fair(struct task_struct *p)
struct cfs_rq *cfs_rq = task_cfs_rq(p);
update_curr(cfs_rq);
- place_entity(cfs_rq, &p->se, 1);
+ place_entity(cfs_rq, &p->se, 0);
}
#endif
On Sat, 2009-01-17 at 10:54 +0100, Mike Galbraith wrote:
Same comments as before..
> The problem with the C++ testcases seems to be wake_up_all() plunking a
> genuine thundering herd onto runqueues.
I'm still trying to find what wake_up_all() is being hit... Both
test-cases seem to create ping-pong threads/processes (lots of them
though).
> The sleeper fairness logic
> places the entire herd left of min_vruntime, meaning N*sched_latency
> pain for the poor sods who are setting the runqueue pace.
Right, how about we flip the 'initial' case in place_entity() for !
nr_exclusive wakeups.
> This doesn't make interactivity wonderful of course, fair schedulers
> don't do wonderful under heavy load, but this did prevent seizure.
That's not inherent to fair schedulers; there's only so much you can
do when there are 50 processes contending for time.
> (it's likely somewhat broken on top of being butt ugly.. diag hack;)
>
> Suggestions welcome.
>
> diff --git a/include/linux/wait.h b/include/linux/wait.h
> index ef609f8..e20e557 100644
> --- a/include/linux/wait.h
> +++ b/include/linux/wait.h
> @@ -143,6 +143,12 @@ int out_of_line_wait_on_bit(void *, int, int (*)(void *), unsigned);
> int out_of_line_wait_on_bit_lock(void *, int, int (*)(void *), unsigned);
> wait_queue_head_t *bit_waitqueue(void *, int);
>
> +#define WAKE_SYNC 1
The regular sync hint of affine me harder
> +#define WAKE_FAIR 2
sleeper fairness for this wakeup
> +#define WAKE_BUDDY 4
set buddies for this wakeup
> +#define WAKE_PREEMPT 8
Do wakeup preemption for this wakeup.
> +#define WAKE_NORMAL (WAKE_FAIR | WAKE_BUDDY | WAKE_PREEMPT)
> +
> #define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL)
> #define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL)
> #define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL)
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 52bbf1c..ee52d89 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -2258,13 +2258,13 @@ static int sched_balance_self(int cpu, int flag)
> */
> static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
> {
> - int cpu, orig_cpu, this_cpu, success = 0;
> + int cpu, orig_cpu, this_cpu, success = 0, herd = sync & WAKE_FAIR;
> unsigned long flags;
> long old_state;
> struct rq *rq;
>
> if (!sched_feat(SYNC_WAKEUPS))
> - sync = 0;
> + sync &= ~WAKE_SYNC;
>
> #ifdef CONFIG_SMP
> if (sched_feat(LB_WAKEUP_UPDATE)) {
> @@ -2334,7 +2334,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
> out_activate:
> #endif /* CONFIG_SMP */
> schedstat_inc(p, se.nr_wakeups);
> - if (sync)
> + if (sync & WAKE_SYNC)
> schedstat_inc(p, se.nr_wakeups_sync);
> if (orig_cpu != cpu)
> schedstat_inc(p, se.nr_wakeups_migrate);
> @@ -2342,7 +2342,7 @@ out_activate:
> schedstat_inc(p, se.nr_wakeups_local);
> else
> schedstat_inc(p, se.nr_wakeups_remote);
> - activate_task(rq, p, 1);
> + activate_task(rq, p, 1 + !herd);
Right, so how about 1 - !herd ?
> success = 1;
>
> out_running:
> @@ -4692,12 +4692,19 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
> {
> wait_queue_t *curr, *next;
>
> +
> + if (!nr_exclusive)
> + sync &= ~(WAKE_FAIR | WAKE_BUDDY);
Right, disable buddies and sleeper fairness for mass wakeups
> list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
> unsigned flags = curr->flags;
>
> if (curr->func(curr, mode, sync, key) &&
> (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
> break;
> +
> + /* Only one preemptive wakeup per herd. */
> + sync &= ~WAKE_PREEMPT;
and only let the first wakeup do a preemption.
> }
> }
>
> @@ -4714,7 +4721,7 @@ void __wake_up(wait_queue_head_t *q, unsigned int mode,
> unsigned long flags;
>
> spin_lock_irqsave(&q->lock, flags);
> - __wake_up_common(q, mode, nr_exclusive, 0, key);
> + __wake_up_common(q, mode, nr_exclusive, WAKE_NORMAL, key);
> spin_unlock_irqrestore(&q->lock, flags);
> }
> EXPORT_SYMBOL(__wake_up);
> @@ -4724,7 +4731,7 @@ EXPORT_SYMBOL(__wake_up);
> */
> void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
> {
> - __wake_up_common(q, mode, 1, 0, NULL);
> + __wake_up_common(q, mode, 1, WAKE_NORMAL, NULL);
> }
>
> /**
> @@ -4744,13 +4751,13 @@ void
> __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
> {
> unsigned long flags;
> - int sync = 1;
> + int sync = WAKE_NORMAL | WAKE_SYNC;
>
> if (unlikely(!q))
> return;
>
> if (unlikely(!nr_exclusive))
> - sync = 0;
> + sync &= ~WAKE_SYNC;
>
> spin_lock_irqsave(&q->lock, flags);
> __wake_up_common(q, mode, nr_exclusive, sync, NULL);
> @@ -4773,7 +4780,7 @@ void complete(struct completion *x)
>
> spin_lock_irqsave(&x->wait.lock, flags);
> x->done++;
> - __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
> + __wake_up_common(&x->wait, TASK_NORMAL, 1, WAKE_NORMAL, NULL);
> spin_unlock_irqrestore(&x->wait.lock, flags);
> }
> EXPORT_SYMBOL(complete);
> @@ -4790,7 +4797,7 @@ void complete_all(struct completion *x)
>
> spin_lock_irqsave(&x->wait.lock, flags);
> x->done += UINT_MAX/2;
> - __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
> + __wake_up_common(&x->wait, TASK_NORMAL, 0, WAKE_NORMAL, NULL);
> spin_unlock_irqrestore(&x->wait.lock, flags);
> }
> EXPORT_SYMBOL(complete_all);
> diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
> index 5cc1c16..3e40d7e 100644
> --- a/kernel/sched_fair.c
> +++ b/kernel/sched_fair.c
> @@ -661,7 +661,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
> }
>
> static void
> -place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> +place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
> {
> u64 vruntime = cfs_rq->min_vruntime;
>
> @@ -671,12 +671,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> * little, place the new task so that it fits in the slot that
> * stays open at the end.
> */
> - if (initial && sched_feat(START_DEBIT))
> + if (!wakeup && sched_feat(START_DEBIT))
> vruntime += sched_vslice(cfs_rq, se);
>
> - if (!initial) {
> + if (wakeup) {
> /* sleeps upto a single latency don't count. */
> - if (sched_feat(NEW_FAIR_SLEEPERS)) {
> + if (sched_feat(NEW_FAIR_SLEEPERS && wakeup == 1)) {
That doesn't look like it fancies compiling though ;-)
> unsigned long thresh = sysctl_sched_latency;
>
> /*
> @@ -709,7 +709,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
> account_entity_enqueue(cfs_rq, se);
>
> if (wakeup) {
> - place_entity(cfs_rq, se, 0);
> + place_entity(cfs_rq, se, wakeup);
> enqueue_sleeper(cfs_rq, se);
> }
>
> @@ -1189,6 +1189,9 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
> if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
> return 0;
>
> + /* Clear flags unused by this function. */
> + sync = sync & WAKE_SYNC;
> +
> if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
> p->se.avg_overlap > sysctl_sched_migration_cost))
> sync = 0;
> @@ -1392,9 +1395,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
> * Also, during early boot the idle thread is in the fair class, for
> * obvious reasons its a bad idea to schedule back to the idle thread.
> */
> - if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
> + if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle) &&
> + sync & WAKE_BUDDY)
> set_last_buddy(se);
> - set_next_buddy(pse);
> + if (sync & WAKE_BUDDY)
> + set_next_buddy(pse);
>
> /*
> * We can come here with TIF_NEED_RESCHED already set from new task
> @@ -1416,10 +1421,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
> return;
> }
>
> - if (!sched_feat(WAKEUP_PREEMPT))
> + if (!sched_feat(WAKEUP_PREEMPT) || !sync & WAKE_PREEMPT)
C operator precedence suggests you write !(sync & WAKE_PREEMPT), unless
you're attempting to write 0 in a complicated way.
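(A tiny standalone example of the precedence gotcha, reusing the
WAKE_PREEMPT value from the patch above: !sync & WAKE_PREEMPT parses as
(!sync) & WAKE_PREEMPT, which is 0 for any nonzero sync.)

#include <stdio.h>

#define WAKE_PREEMPT	8

int main(void)
{
	int sync = 1 | 2;	/* some flags set, WAKE_PREEMPT not among them */

	printf("!sync & WAKE_PREEMPT   = %d\n", !sync & WAKE_PREEMPT);	/* 0 */
	printf("!(sync & WAKE_PREEMPT) = %d\n", !(sync & WAKE_PREEMPT));	/* 1 */
	return 0;
}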
> return;
>
> - if (sched_feat(WAKEUP_OVERLAP) && (sync ||
> + if (sched_feat(WAKEUP_OVERLAP) && (sync & WAKE_SYNC||
> (se->avg_overlap < sysctl_sched_migration_cost &&
> pse->avg_overlap < sysctl_sched_migration_cost))) {
> resched_task(curr);
> @@ -1650,7 +1655,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
> sched_info_queued(p);
>
> update_curr(cfs_rq);
> - place_entity(cfs_rq, se, 1);
> + place_entity(cfs_rq, se, 0);
>
> /* 'curr' will be NULL if the child belongs to a different group */
> if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
> @@ -1721,7 +1726,7 @@ static void moved_group_fair(struct task_struct *p)
> struct cfs_rq *cfs_rq = task_cfs_rq(p);
>
> update_curr(cfs_rq);
> - place_entity(cfs_rq, &p->se, 1);
> + place_entity(cfs_rq, &p->se, 0);
> }
> #endif
On Sat, 2009-01-17 at 11:07 +0100, Peter Zijlstra wrote:
> On Sat, 2009-01-17 at 10:54 +0100, Mike Galbraith wrote:
>
> Same comments as before..
>
> > The problem with the C++ testcases seems to be wake_up_all() plunking a
> > genuine thundering herd onto runqueues.
>
> I'm still trying to find what wake_up_all() is being hit... Both
> test-cases seem to create ping-pong threads/processes (lots of them
> though).
I made a logic leap there from the sched_debug data. Maybe bugs only
made it appear that leap was correct.
> > The sleeper fairness logic
> > places the entire herd left of min_vruntime, meaning N*sched_latency
> > pain for the poor sods who are setting the runqueue pace.
>
> Right, how about we flip the 'initial' case in place_entity() for !
> nr_exclusive wakeups.
Wouldn't that be more drastic than sleep denial?
> schedstat_inc(p, se.nr_wakeups_migrate);
> > @@ -2342,7 +2342,7 @@ out_activate:
> > schedstat_inc(p, se.nr_wakeups_local);
> > else
> > schedstat_inc(p, se.nr_wakeups_remote);
> > - activate_task(rq, p, 1);
> > + activate_task(rq, p, 1 + !herd);
>
> Right, so how about 1 - !herd ?
Heh, I shouldn't start tinkering so early.
> > @@ -671,12 +671,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> > * little, place the new task so that it fits in the slot that
> > * stays open at the end.
> > */
> > - if (initial && sched_feat(START_DEBIT))
> > + if (!wakeup && sched_feat(START_DEBIT))
> > vruntime += sched_vslice(cfs_rq, se);
> >
> > - if (!initial) {
> > + if (wakeup) {
> > /* sleeps upto a single latency don't count. */
> > - if (sched_feat(NEW_FAIR_SLEEPERS)) {
> > + if (sched_feat(NEW_FAIR_SLEEPERS && wakeup == 1)) {
>
> That doesn't look like it fancies compiling though ;-)
Is booboo, but...
marge:..kernel/linux-2.6.29.git # make
CHK include/linux/version.h
CHK include/linux/utsrelease.h
SYMLINK include/asm -> include/asm-x86
CALL scripts/checksyscalls.sh
CHK include/linux/compile.h
CC kernel/sched.o
LD kernel/built-in.o
LD vmlinux.o
MODPOST vmlinux.o
GEN .version
CHK include/linux/compile.h
UPD include/linux/compile.h
CC init/version.o
LD init/built-in.o
LD .tmp_vmlinux1
KSYM .tmp_kallsyms1.S
AS .tmp_kallsyms1.o
LD .tmp_vmlinux2
KSYM .tmp_kallsyms2.S
AS .tmp_kallsyms2.o
LD vmlinux
> * We can come here with TIF_NEED_RESCHED already set from new task
> > @@ -1416,10 +1421,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
> > return;
> > }
> >
> > - if (!sched_feat(WAKEUP_PREEMPT))
> > + if (!sched_feat(WAKEUP_PREEMPT) || !sync & WAKE_PREEMPT)
>
> C operator precedence suggests you write !(sync & WAKE_PREEMPT), unless
> you're attempting to write 0 in a complicated way.
Nope, booboo.
-Mike
On Sat, 2009-01-17 at 11:34 +0100, Mike Galbraith wrote:
> > Right, how about we flip the 'initial' case in place_entity() for !
> > nr_exclusive wakeups.
>
> Wouldn't that be more drastic than sleep denial?
Strictly speaking that DEBIT thing is valid for each wakeup, IIRC we
restricted it to clone() only because that was where we could actually
observe these latency spikes using a fork-bomb.
This reduces the latency hits to around 400ms, which is about right for
the given load.
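(For scale, a rough standalone estimate of why ~400 ms is plausible here,
assuming the usual CFS period formula of this era: beyond sched_nr_latency
runnable tasks the period grows as nr_running * sysctl_sched_min_granularity.)

#include <stdio.h>

int main(void)
{
	const double min_granularity_ms = 4.0;	/* sysctl_sched_min_granularity default */
	const int tasks_per_rq = 200 / 2;	/* ~200 tasks spread over a dual core */

	printf("approx CFS period: %.0f ms\n", tasks_per_rq * min_granularity_ms);
	return 0;
}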
---
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f34cf42..d344605 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -671,7 +671,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
* little, place the new task so that it fits in the slot that
* stays open at the end.
*/
- if (initial && sched_feat(START_DEBIT))
+ if ((initial && sched_feat(START_DEBIT)) ||
+ (!initial && sched_feat(WAKER_DEBIT)))
vruntime += sched_vslice(cfs_rq, se);
if (!initial) {
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 4569bfa..b5fddf0 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,4 +1,4 @@
-SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
+SCHED_FEAT(NEW_FAIR_SLEEPERS, 0)
SCHED_FEAT(NORMALIZED_SLEEPER, 0)
SCHED_FEAT(ADAPTIVE_GRAN, 1)
SCHED_FEAT(WAKEUP_PREEMPT, 1)
@@ -15,3 +15,4 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1)
SCHED_FEAT(WAKEUP_OVERLAP, 0)
SCHED_FEAT(LAST_BUDDY, 1)
SCHED_FEAT(OWNER_SPIN, 1)
+SCHED_FEAT(WAKER_DEBIT, 1)
On Sat, 2009-01-17 at 13:00 +0100, Peter Zijlstra wrote:
> On Sat, 2009-01-17 at 11:34 +0100, Mike Galbraith wrote:
> > > Right, how about we flip the 'initial' case in place_entity() for !
> > > nr_exclusive wakeups.
> >
> > Wouldn't that be more drastic than sleep denial?
>
> Strictly speaking that DEBIT thing is valid for each wakeup, IIRC we
> restricted it to clone() only because that was where we could actually
> observe these latency spikes using a fork-bomb.
>
> This reduces the latency hits to around 400ms, which is about right for
> the given load.
>
> ---
> diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
> index f34cf42..d344605 100644
> --- a/kernel/sched_fair.c
> +++ b/kernel/sched_fair.c
> @@ -671,7 +671,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
> * little, place the new task so that it fits in the slot that
> * stays open at the end.
> */
> - if (initial && sched_feat(START_DEBIT))
> + if ((initial && sched_feat(START_DEBIT)) ||
> + (!initial && sched_feat(WAKER_DEBIT)))
> vruntime += sched_vslice(cfs_rq, se);
>
> if (!initial) {
> diff --git a/kernel/sched_features.h b/kernel/sched_features.h
> index 4569bfa..b5fddf0 100644
> --- a/kernel/sched_features.h
> +++ b/kernel/sched_features.h
> @@ -1,4 +1,4 @@
> -SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
> +SCHED_FEAT(NEW_FAIR_SLEEPERS, 0)
> SCHED_FEAT(NORMALIZED_SLEEPER, 0)
> SCHED_FEAT(ADAPTIVE_GRAN, 1)
> SCHED_FEAT(WAKEUP_PREEMPT, 1)
> @@ -15,3 +15,4 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1)
> SCHED_FEAT(WAKEUP_OVERLAP, 0)
> SCHED_FEAT(LAST_BUDDY, 1)
> SCHED_FEAT(OWNER_SPIN, 1)
> +SCHED_FEAT(WAKER_DEBIT, 1)
Ouch, that doesn't look like it will be nice at all for interactivity.
(which btw was terrible with my hackery, even with booboos fixed)
-Mike
http://bugzilla.kernel.org/show_bug.cgi?id=12465 just popped up - another
scheduler regression. It has been bisected.
On Sat, 2009-01-17 at 13:19 +0100, Mike Galbraith wrote:
> Ouch, that doesn't look like it will be nice at all for interactivity.
Actually, it's not as bad as expected, merely lethargic slug.
-Mike
On Sat, 2009-01-17 at 04:43 -0800, Andrew Morton wrote:
> http://bugzilla.kernel.org/show_bug.cgi?id=12465 just popped up - another
> scheduler regression. It has been bisected.
Seems pretty clear. I'd suggest reverting it.
-Mike
On Sat, 2009-01-17 at 13:59 +0100, Mike Galbraith wrote:
> On Sat, 2009-01-17 at 13:19 +0100, Mike Galbraith wrote:
>
> > Ouch, that doesn't look like it will be nice at all for interactivity.
>
> Actually, it's not as bad as expected, merely lethargic slug.
Right, I just ran the thing on my laptop, and sluggish but 'usable' at
a load >100 seems perfectly good to me.
* Mike Galbraith <[email protected]> wrote:
> On Sat, 2009-01-17 at 04:43 -0800, Andrew Morton wrote:
> > http://bugzilla.kernel.org/show_bug.cgi?id=12465 just popped up - another
> > scheduler regression. It has been bisected.
>
> Seems pretty clear. I'd suggest reverting it.
We can revert it (and will revert it if no solution is found), but i'd
also like to understand why it happens, because that kind of regression
from this change is unexpected - we might be hiding some bug that could
pop up under less debuggable circumstances, so we need to understand it
while we have a chance.
Below is the commit in question. Avi, any ideas what makes KVM special
here? Perhaps its use of "preempt notifiers" is causing a problem somehow?
Ingo
-------------->
From 14800984706bf6936bbec5187f736e928be5c218 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <[email protected]>
Date: Fri, 7 Nov 2008 15:26:50 +0100
Subject: [PATCH] sched: fine-tune SD_MC_INIT
Tune SD_MC_INIT the same way as SD_CPU_INIT:
unset SD_BALANCE_NEWIDLE, and set SD_WAKE_BALANCE.
This improves vmark by 5%:
vmark 132102 125968 125497 messages/sec avg 127855.66 .984
vmark 139404 131719 131272 messages/sec avg 134131.66 1.033
Signed-off-by: Mike Galbraith <[email protected]>
Acked-by: Peter Zijlstra <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
# *DOCUMENTATION*
---
include/linux/topology.h | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 34a7ee0..a8d8405 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -120,10 +120,10 @@ void arch_update_cpu_topology(void);
.wake_idx = 1, \
.forkexec_idx = 1, \
.flags = SD_LOAD_BALANCE \
- | SD_BALANCE_NEWIDLE \
| SD_BALANCE_FORK \
| SD_BALANCE_EXEC \
| SD_WAKE_AFFINE \
+ | SD_WAKE_BALANCE \
| SD_SHARE_PKG_RESOURCES\
| BALANCE_FOR_MC_POWER, \
.last_balance = jiffies, \
* Mike Galbraith <[email protected]> wrote:
> Dunno about the IO bits, but..
>
> The problem with the C++ testcases seems to be wake_up_all() plunking a
> genuine thundering herd onto runqueues. The sleeper fairness logic
> places the entire herd left of min_vruntime, meaning N*sched_latency
> pain for the poor sods who are setting the runqueue pace.
100 wakeup pairs that all run and ping-pong between each other?
That creates 200 tasks with an average system load of 100.0, on a
dual-core system. Is that a fair representation of some real workload, or
just an unrealistic "gee, look, given enough tasks running I can overload
the system _this bad_" example?
Ingo
On Sat, 2009-01-17 at 17:01 +0100, Ingo Molnar wrote:
> * Mike Galbraith <[email protected]> wrote:
>
> > On Sat, 2009-01-17 at 04:43 -0800, Andrew Morton wrote:
> > > http://bugzilla.kernel.org/show_bug.cgi?id=12465 just popped up - another
> > > scheduler regression. It has been bisected.
> >
> > Seems pretty clear. I'd suggest reverting it.
>
> We can revert it (and will revert it if no solution is found), but i'd
> also like to understand why it happens, because that kind of regression
> from this change is unexpected - we might be hiding some bug that could
> pop up under less debuggable circumstances, so we need to understand it
> while we have a chance.
Agree. However, with the sched_mc stuff, mysql+oltp now does better
with NEWIDLE on than off as well, as does an nfs kbuild.
-Mike
* Mike Galbraith <[email protected]> wrote:
> On Sat, 2009-01-17 at 17:01 +0100, Ingo Molnar wrote:
> > * Mike Galbraith <[email protected]> wrote:
> >
> > > On Sat, 2009-01-17 at 04:43 -0800, Andrew Morton wrote:
> > > > http://bugzilla.kernel.org/show_bug.cgi?id=12465 just popped up - another
> > > > scheduler regression. It has been bisected.
> > >
> > > Seems pretty clear. I'd suggest reverting it.
> >
> > We can revert it (and will revert it if no solution is found), but i'd
> > also like to understand why it happens, because that kind of
> > regression from this change is unexpected - we might be hiding some
> > bug that could pop up under less debuggable circumstances, so we need
> > to understand it while we have a chance.
>
> Agree. However, with the sched_mc stuff, mysql+oltp now does better
> with NEWIDLE on than off as well, as does an nfs kbuild.
Didnt you come up with the verdict that sched_mc=2 is not a win - or has
that changed? If we should change the defaults then please send a
re-tuning patch against the latest code.
Still, the ping delays of multiple seconds are completely unacceptable and
need to be understood, or these will come back and might bite us in a lot
less fortunate place. NEWIDLE and WAKE_BALANCE have micro-effects on
latencies - anything in the user-visible range is highly anomalous at
these low load levels.
Ingo
On Sat, 2009-01-17 at 17:12 +0100, Ingo Molnar wrote:
> * Mike Galbraith <[email protected]> wrote:
>
> > Dunno about the IO bits, but..
> >
> > The problem with the C++ testcases seems to be wake_up_all() plunking a
> > genuine thundering herd onto runqueues. The sleeper fairness logic
> > places the entire herd left of min_vruntime, meaning N*sched_latency
> > pain for the poor sods who are setting the runqueue pace.
>
> 100 wakeup pairs that all run and ping-pong between each other?
>
> That creates 200 tasks with an average system load of 100.0, on a
> dual-core system. Is that a fair representation of some real workload, or
> just an unrealistic "gee, look, given enough tasks running I can overload
> the system _this bad_" example?
Looks contrived to me, but it is a hole. Dang sleepers, can't live with
'em, can't live without 'em.
-Mike
On Sat, 2009-01-17 at 17:25 +0100, Ingo Molnar wrote:
> * Mike Galbraith <[email protected]> wrote:
>
> > On Sat, 2009-01-17 at 17:01 +0100, Ingo Molnar wrote:
> > > * Mike Galbraith <[email protected]> wrote:
> > >
> > > > On Sat, 2009-01-17 at 04:43 -0800, Andrew Morton wrote:
> > > > > http://bugzilla.kernel.org/show_bug.cgi?id=12465 just popped up - another
> > > > > scheduler regression. It has been bisected.
> > > >
> > > > Seems pretty clear. I'd suggest reverting it.
> > >
> > > We can revert it (and will revert it if no solution is found), but i'd
> > > also like to understand why it happens, because that kind of
> > > regression from this change is unexpected - we might be hiding some
> > > bug that could pop up under less debuggable circumstances, so we need
> > > to understand it while we have a chance.
> >
> > Agree. However, with the sched_mc stuff, mysql+oltp now does better
> > with NEWIDLE on than off as well, as does an nfs kbuild.
>
> Didnt you come up with the verdict that sched_mc=2 is not a win - or has
> that changed? If we should change the defaults then please send a
> re-tuning patch against the latest code.
sched_mc=2 was better than sched_mc=1. The other balancing changes put
a dent in mysql+oltp peak and immediately after peak. Setting
sched_mc=2 brought back the loss that was otherwise there all the way
through the curve back to 28 level, so with sched_mc=2, there was only
the slight loss of peak, and larger loss of post-peak.
> Still, the ping delays of multiple seconds are completely unacceptable and
> need to be understood, or these will come back and might bite us in a lot
> less fortunate place. NEWIDLE and WAKE_BALANCE have micro-effects on
> latencies - anything in the user-visible range is highly anomalous at
> these low load levels.
>
> Ingo
On Sat, 2009-01-17 at 17:37 +0100, Mike Galbraith wrote:
> On Sat, 2009-01-17 at 17:25 +0100, Ingo Molnar wrote:
> > * Mike Galbraith <[email protected]> wrote:
> >
> > > On Sat, 2009-01-17 at 17:01 +0100, Ingo Molnar wrote:
> > > > * Mike Galbraith <[email protected]> wrote:
> > > >
> > > > > On Sat, 2009-01-17 at 04:43 -0800, Andrew Morton wrote:
> > > > > > http://bugzilla.kernel.org/show_bug.cgi?id=12465 just popped up - another
> > > > > > scheduler regression. It has been bisected.
> > > > >
> > > > > Seems pretty clear. I'd suggest reverting it.
> > > >
> > > > We can revert it (and will revert it if no solution is found), but i'd
> > > > also like to understand why it happens, because that kind of
> > > > regression from this change is unexpected - we might be hiding some
> > > > bug that could pop up under less debuggable circumstances, so we need
> > > > to understand it while we have a chance.
> > >
> > > Agree. However, with the sched_mc stuff, mysql+oltp now does better
> > > with NEWIDLE on than off as well, as does an nfs kbuild.
> >
> > Didnt you come up with the verdict that sched_mc=2 is not a win - or has
> > that changed? If we should change the defaults then please send a
> > re-tuning patch against the latest code.
>
> sched_mc=2 was better than sched_mc=1. The other balancing changes put
> a dent in mysql+oltp peak and immediately after peak. Setting
> sched_mc=2 brought back the loss that was otherwise there all the way
> through the curve back to 28 level, so with sched_mc=2, there was only
> the slight loss of peak, and larger loss of post-peak.
P.S.
s/sched_mc=1/sched_mc=0/
P.P.S.
I suggested making sched_mc=2 the default: since we already have the
pain that this development inflicted, we may as well have the gain as
well, making it a trade-off in favor of the fork/exec load at the
expense of cache-sensitive loads like mysql+oltp. The authors prefer
to leave the default as is.
<quote>
> > Thanks for the detailed benchmark reports. Glad to hear that
> > sched_mc=2 is helping in most scenarios. Though we would be tempted to
> > make it default, I would still like to default to zero in order to
> > provide base line performance. I would expect end users to flip the
> > settings to sched_mc=2 if it helps their workload in terms of
> > performance and/or power savings.
>
> The mysql+oltp peak loss is there either way, but with 2, mid range
> throughput is ~28 baseline. High end (yawn) is better, and the nfs
> kbuild performs better than baseline.
>
> Baseline performance, at least wrt mysql+oltp doesn't seem to be an
> option. Not my call. More testing and more testers required I suppose.
Yes, more testing is definitely due. I'd like to hear from people
with larger and newer boxes as well before I would be comfortable making
sched_mc=2 as default.
</quote>
WRT these latencies, I'm of two minds.
On the one hand, those huge latencies are unacceptable, but on the
other, there are knobs in place for anyone running a load that really
does wake a zillion long sleeping threads simultaneously. No matter
_what_ amount of sleeper fairness we try to achieve, this latency
multiplier potential is there.
I see no really appetizing option. You can easily scale the bonus back
as load increases to limit the pain potential, but that has downside
potential too, and would likely not be a good default: for mysql+oltp,
wakeup preemption is a positive factor through the entire performance
curve, up until it is totally jammed up on itself. On my Q6600, the
preempt/no-preempt curves converge at 64 clients/core. On the
interactivity front, X and its clients don't care how many hogs they're
competing against; any preempt ability we take away translates directly
into interactivity loss for the user. Not to mention events.
The potential of wake_up_all() bothers me more than this starter pistol.
(That's why I leapt straight to it from the debug output; I've been
there before ;)
-Mike
Ingo Molnar wrote:
> * Mike Galbraith <[email protected]> wrote:
>
>
>> On Sat, 2009-01-17 at 04:43 -0800, Andrew Morton wrote:
>>
>>> http://bugzilla.kernel.org/show_bug.cgi?id=12465 just popped up - another
>>> scheduler regression. It has been bisected.
>>>
>> Seems pretty clear. I'd suggest reverting it.
>>
>
> We can revert it (and will revert it if no solution is found), but i'd
> also like to understand why it happens, because that kind of regression
> from this change is unexpected - we might be hiding some bug that could
> pop up under less debuggable circumstances, so we need to understand it
> while we have a chance.
>
> Below is the commit in question. Avi, any ideas what makes KVM special
> here? Perhaps its use of "preempt notifiers" is causing a problem somehow?
>
preempt notifiers use should cause additional context switch costs of a
few thousand cycles and possibly an IPI (if a vcpu was migrated). So
I'd suspect scheduling latency here.
Is it possible to trace this (the time between a wake up and actual
scheduling of a task)?
--
error compiling committee.c: too many arguments to function
* Avi Kivity <[email protected]> wrote:
> Ingo Molnar wrote:
>> * Mike Galbraith <[email protected]> wrote:
>>
>>
>>> On Sat, 2009-01-17 at 04:43 -0800, Andrew Morton wrote:
>>>
>>>> http://bugzilla.kernel.org/show_bug.cgi?id=12465 just popped up - another
>>>> scheduler regression. It has been bisected.
>>>>
>>> Seems pretty clear. I'd suggest reverting it.
>>>
>>
>> We can revert it (and will revert it if no solution is found), but i'd
>> also like to understand why it happens, because that kind of regression
>> from this change is unexpected - we might be hiding some bug that could
>> pop up under less debuggable circumstances, so we need to understand it
>> while we have a chance.
>>
>> Below is the commit in question. Avi, any ideas what makes KVM special
>> here? Perhaps its use of "preempt notifiers" is causing a problem
>> somehow?
>>
>
> preempt notifiers use should cause additional context switch costs of a
> few thousand cycles and possibly an IPI (if a vcpu was migrated). So
> I'd suspect scheduling latency here.
>
> Is it possible to trace this (the time between a wake up and actual
> scheduling of a task)?
Can you reproduce those latencies? We didnt get similar reports from
elsewhere so there seems to be a KVM angle.
Ingo
Ingo Molnar wrote:
> Can you reproduce those latencies? We didnt get similar reports from
> elsewhere so there seems to be a KVM angle.
>
A simple test running 10 idle guests over 4 cores doesn't reproduce
(2.6.29-rc). However we likely need a more complicated setup.
Kevin, any details about your setup? What are the guests doing? Are
you overcommitting memory (is the host swapping)?
--
error compiling committee.c: too many arguments to function
Avi Kivity wrote:
> Ingo Molnar wrote:
>> Can you reproduce those latencies? We didnt get similar reports from
>> elsewhere so there seems to be a KVM angle.
>>
>
> A simple test running 10 idle guests over 4 cores doesn't reproduce
> (2.6.29-rc). However we likely need a more complicated setup.
>
> Kevin, any details about your setup? What are the guests doing? Are
> you overcommitting memory (is the host swapping)?
>
If the host load is low enough, it may be worthwhile to repeat with
-no-kvm. It's significantly different from kvm (much higher cpu load,
and less tasks involved), but if the problem recurs, we know it's a pure
sched issue.
--
error compiling committee.c: too many arguments to function
Avi Kivity wrote:
>
> A simple test running 10 idle guests over 4 cores doesn't reproduce
> (2.6.29-rc). However we likely need a more complicated setup.
>
Letting the ping run for a while, I see
> --- 10.35.18.172 ping statistics ---
> 846 packets transmitted, 846 received, 0% packet loss, time 845917ms
> rtt min/avg/max/mdev = 0.131/2.572/305.452/20.236 ms
So it does reproduce.
--
error compiling committee.c: too many arguments to function
On Sun, 2009-01-18 at 11:21 +0200, Avi Kivity wrote:
> If the host load is low enough, it may be worthwhile to repeat with
> -no-kvm. It's significantly different from kvm (much higher cpu load,
> and less tasks involved), but if the problem recurs, we know it's a pure
> sched issue.
Okay, I'll see how I go with this tonight.
Regards,
Kevin.
On Sun, 2009-01-18 at 10:59 +0200, Avi Kivity wrote:
> A simple test running 10 idle guests over 4 cores doesn't reproduce
> (2.6.29-rc). However we likely need a more complicated setup.
>
> Kevin, any details about your setup? What are the guests doing? Are
> you overcommitting memory (is the host swapping)?
Hi Avi,
No, I don't think there is any swapping. The host has 32GB RAM and only
~9GB of guest RAM has been requested in total. The host doesn't do
anything but run KVM.
I fear my test setup is quite complicated as it is a production system
and I haven't been able to reproduce a simpler case as yet. Anyway, here
are the guests I have running during the test:
Windows XP, 32-bit (UP) - There are twelve of these launched with
identical scripts, apart from differing amounts of RAM. Six with "-m
512", one with "-m 768" and five with "-m 256":
KVM=/usr/local/kvm/bin/qemu-system-x86_64
$KVM \
-no-acpi -localtime -m 512 \
-hda kvm-00-xp.img \
-net nic,vlan=0,macaddr=52:54:00:12:34:56,model=rtl8139 \
-net tap,vlan=0,ifname=tap0,script=no \
-vnc 127.0.0.1:0 -usbdevice tablet \
-daemonize
Windows 2003, 32-bit (UP):
KVM=/usr/local/kvm/bin/qemu-system-x86_64
$KVM \
-no-acpi \
-localtime -m 1024 \
-hda kvm-09-2003-C.img \
-hdb kvm-09-2003-E.img \
-net nic,vlan=0,macaddr=52:54:00:12:34:5F,model=rtl8139 \
-net tap,vlan=0,ifname=tap9,script=no \
-vnc 127.0.0.1:9 -usbdevice tablet \
-daemonize
Debian Lenny, 64-bit (MP):
KVM=/usr/local/kvm/bin/qemu-system-x86_64
$KVM \
-smp 2 \
-m 512 \
-hda kvm-10-lenny64.img \
-net nic,vlan=0,macaddr=52:54:00:12:34:60,model=rtl8139 \
-net tap,vlan=0,ifname=tap10,script=no \
-vnc 127.0.0.1:10 -usbdevice tablet \
-daemonize
Debian Lenny, 32-bit (MP):
KVM=/usr/local/kvm/bin/qemu-system-x86_64
$KVM \
-smp 2 \
-m 512 \
-hda kvm-15-etch-i386.img \
-net nic,vlan=0,macaddr=52:54:00:12:34:65,model=rtl8139 \
-net tap,vlan=0,ifname=tap15,script=no \
-vnc 127.0.0.1:15 -usbdevice tablet \
-daemonize
Debian Etch (using a kernel.org 2.6.27.10) 32-bit (MP):
KVM=/usr/local/kvm/bin/qemu-system-x86_64
$KVM \
-smp 2 \
-m 2048 \
-hda kvm-17-1.img \
-hdb kvm-17-tmp.img \
-net nic,vlan=0,macaddr=52:54:00:12:34:67,model=rtl8139 \
-net tap,vlan=0,ifname=tap17,script=no \
-vnc 127.0.0.1:17 -usbdevice tablet \
-daemonize
This last VM is the one I was pinging from the host.
I begin by launching the guests at 90 second intervals and then let them
settle for about 5 minutes. The final guest runs Apache with RT and a
postgresql database. I set my browser to refresh the main RT page every
2 minutes, to roughly simulate the common usage. With all the guests
basically idling and just the last guest servicing the RT requests, the
load average is about 0.25 or less. Then I just ping the last guest for
about 15 minutes.
One thing that may be of interest - a couple of times while running the
test I also had a terminal logged into the guest being pinged and
running "top". There was some crazy stuff showing up in there, with some
processes having e.g. 1200 in the "%CPU" column. I remember postmaster
(postgresql) being one of them. A quick scan over the top few tasks
might show the total "%CPU" adding up to 2000 or more. This tended to
happen at the same time I saw the latency spikes in ping as well. That
might just be an artifact of timing getting screwed up in "top", I
guess.
Regards,
Kevin.
Avi Kivity wrote:
> Avi Kivity wrote:
>>
>> A simple test running 10 idle guests over 4 cores doesn't reproduce
>> (2.6.29-rc). However we likely need a more complicated setup.
>>
>
> Letting the ping run for a while, I see
>
>> --- 10.35.18.172 ping statistics ---
>> 846 packets transmitted, 846 received, 0% packet loss, time 845917ms
>> rtt min/avg/max/mdev = 0.131/2.572/305.452/20.236 ms
>
> So it does reproduce.
I had a large Vista guest on that machine, so it was swapping heavily.
Without swapping, latency is never more than a few milliseconds, and
usually 150 microseconds.
--
error compiling committee.c: too many arguments to function
On Sun, 2009-01-18 at 11:21 +0200, Avi Kivity wrote:
> If the host load is low enough, it may be worthwhile to repeat with
> -no-kvm. It's significantly different from kvm (much higher cpu load,
> and less tasks involved), but if the problem recurs, we know it's a pure
> sched issue.
Okay I repeated the test as best I could with the standard 2.6.28 kernel
and "-no-kvm" added to the command lines of all the guests. I couldn't
repeat exactly the same test due to some problems with the guests.
- Two of the XP guests were taking a looong time to complete their
startup and get into idle mode. I waited ~30 minutes for them to
settle, but they were still running with 100% CPU load, so I shut them
down.
- The two linux SMP guests would not boot unless I added "nosmp" to the
kernel command line in grub within the guest. Screenshots of where the
two guests got stuck booting are attached.
Here are the results of the ping test, FWIW:
--- hermes-old.wumi.org.au ping statistics ---
900 packets transmitted, 900 received, 0% packet loss, time 903042ms
rtt min/avg/max/mdev = 0.388/2.389/73.171/3.685 ms
There's probably a few too many things different with this test compared
to the previous ones, but I guess it does point to the problem being an
interaction between the scheduler and kvm. It may also be that it
requires smp guests to be running to trigger.
Regards,
Kevin.
On Sat, 2009-01-17 at 13:00 +0100, Peter Zijlstra wrote:
> On Sat, 2009-01-17 at 11:34 +0100, Mike Galbraith wrote:
> > > Right, how about we flip the 'initial' case in place_entity() for !
> > > nr_exclusive wakeups.
> >
> > Wouldn't that be more drastic than sleep denial?
>
> Strictly speaking that DEBIT thing is valid for each wakeup, IIRC we
> restricted it to clone() only because that was where we could actually
> observe these latency spikes using a fork-bomb.
>
> This reduces the latency hits to around 400ms, which is about right for
> the given load.
Disregarding the startup landmine for the moment, maybe we should put a
buddy slice knob in the user's hands, so they can tune latency, along
with a full on/off switch for those who care not one whit about
scalability.
ProcessSchedulerTest 100 100000
2.6.29.git
min:0.003ms|avg:0.003-0.004ms|mid:0.003ms|max:691.454ms|duration:-0.324s
min:0.003ms|avg:0.003-0.004ms|mid:0.003ms|max:700.731ms|duration:-0.407s
min:0.003ms|avg:0.003-0.004ms|mid:0.003ms|max:688.367ms|duration:-0.388s
NO_LAST_BUDDY
min:0.003ms|avg:0.003-0.004ms|mid:0.004ms|max:90.659ms|duration:0.035s
min:0.003ms|avg:0.003-0.004ms|mid:0.004ms|max:94.995ms|duration:0.022s
min:0.003ms|avg:0.003-0.004ms|mid:0.004ms|max:75.753ms|duration:0.148s
2.6.29.git + buddy_slice.diff
NO_BUDDIES
min:0.003ms|avg:0.003-0.024ms|mid:0.012ms|max:14.548ms|duration:0.731s
min:0.003ms|avg:0.003-0.028ms|mid:0.015ms|max:14.986ms|duration:0.760s
min:0.003ms|avg:0.003-0.028ms|mid:0.019ms|max:15.257ms|duration:0.782s
BUDDIES
sched_buddy_slice_ns=100000
min:0.003ms|avg:0.003-0.004ms|mid:0.003ms|max:21.199ms|duration:-0.101s
min:0.003ms|avg:0.003-0.004ms|mid:0.003ms|max:21.602ms|duration:-0.030s
min:0.003ms|avg:0.003-0.004ms|mid:0.003ms|max:18.421ms|duration:-0.124s
sched_buddy_slice_ns=1000000
min:0.003ms|avg:0.003-0.004ms|mid:0.003ms|max:55.067ms|duration:-0.224s
min:0.003ms|avg:0.003-0.004ms|mid:0.003ms|max:58.090ms|duration:-0.036s
min:0.003ms|avg:0.003-0.004ms|mid:0.003ms|max:72.055ms|duration:0.025s
sched_buddy_slice_ns=2000000
min:0.003ms|avg:0.003-0.004ms|mid:0.003ms|max:244.128ms|duration:-0.052s
min:0.003ms|avg:0.003-0.004ms|mid:0.003ms|max:230.404ms|duration:-0.153s
min:0.003ms|avg:0.003-0.004ms|mid:0.003ms|max:229.958ms|duration:0.030s
sched_buddy_slice_ns=4000000 (default)
min:0.003ms|avg:0.003-0.004ms|mid:0.003ms|max:396.093ms|duration:-0.016s
min:0.003ms|avg:0.003-0.004ms|mid:0.003ms|max:366.363ms|duration:-0.055s
min:0.003ms|avg:0.003-0.004ms|mid:0.003ms|max:360.373ms|duration:-0.129s
sched_buddy_slice_ns=15000000
min:0.003ms|avg:0.003-0.004ms|mid:0.003ms|max:670.781ms|duration:-0.086s
min:0.003ms|avg:0.003-0.004ms|mid:0.003ms|max:563.612ms|duration:-0.049s
min:0.003ms|avg:0.003-0.004ms|mid:0.003ms|max:680.968ms|duration:-0.244s
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4cae9b8..0ea8eb7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1708,6 +1708,7 @@ static inline void wake_up_idle_cpu(int cpu) { }
extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
+extern unsigned int sysctl_sched_buddy_slice;
extern unsigned int sysctl_sched_wakeup_granularity;
extern unsigned int sysctl_sched_shares_ratelimit;
extern unsigned int sysctl_sched_shares_thresh;
diff --git a/kernel/sched.c b/kernel/sched.c
index 52bbf1c..f37c243 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -406,6 +406,7 @@ struct cfs_rq {
u64 exec_clock;
u64 min_vruntime;
+ u64 pair_start;
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5cc1c16..e261cd5 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -43,6 +43,12 @@ unsigned int sysctl_sched_latency = 20000000ULL;
unsigned int sysctl_sched_min_granularity = 4000000ULL;
/*
+ * Buddy timeslice:
+ * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ */
+unsigned int sysctl_sched_buddy_slice = 4000000ULL;
+
+/*
* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
*/
static unsigned int sched_nr_latency = 5;
@@ -808,6 +814,11 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
struct sched_entity *se = __pick_next_entity(cfs_rq);
+ struct rq *rq = rq_of(cfs_rq);
+ u64 buddy_slice = rq->clock - cfs_rq->pair_start;
+
+ if (buddy_slice > sysctl_sched_buddy_slice)
+ goto out;
if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
return cfs_rq->next;
@@ -815,6 +826,9 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
return cfs_rq->last;
+out:
+ cfs_rq->pair_start = rq->clock;
+
return se;
}
@@ -1347,6 +1361,9 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
static void set_last_buddy(struct sched_entity *se)
{
+ if (!sched_feat(BUDDIES))
+ return;
+
if (likely(task_of(se)->policy != SCHED_IDLE)) {
for_each_sched_entity(se)
cfs_rq_of(se)->last = se;
@@ -1355,6 +1372,9 @@ static void set_last_buddy(struct sched_entity *se)
static void set_next_buddy(struct sched_entity *se)
{
+ if (!sched_feat(BUDDIES))
+ return;
+
if (likely(task_of(se)->policy != SCHED_IDLE)) {
for_each_sched_entity(se)
cfs_rq_of(se)->next = se;
@@ -1392,7 +1412,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
* Also, during early boot the idle thread is in the fair class, for
* obvious reasons its a bad idea to schedule back to the idle thread.
*/
- if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
+ if (likely(se->on_rq && curr != rq->idle))
set_last_buddy(se);
set_next_buddy(pse);
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index da5d93b..3a194fa 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -12,4 +12,4 @@ SCHED_FEAT(LB_BIAS, 1)
SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
SCHED_FEAT(ASYM_EFF_LOAD, 1)
SCHED_FEAT(WAKEUP_OVERLAP, 0)
-SCHED_FEAT(LAST_BUDDY, 1)
+SCHED_FEAT(BUDDIES, 1)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 368d163..733ddb6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -254,6 +254,17 @@ static struct ctl_table kern_table[] = {
},
{
.ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_buddy_slice_ns",
+ .data = &sysctl_sched_buddy_slice,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &sched_nr_latency_handler,
+ .strategy = &sysctl_intvec,
+ .extra1 = &min_sched_granularity_ns,
+ .extra2 = &max_sched_granularity_ns,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
.procname = "sched_latency_ns",
.data = &sysctl_sched_latency,
.maxlen = sizeof(unsigned int),
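For readability, here is roughly what pick_next_entity() ends up looking
like with the above applied (a sketch reassembled from the two
sched_fair.c hunks, against 2.6.29-era code, not compile-tested):

static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
	struct sched_entity *se = __pick_next_entity(cfs_rq);
	struct rq *rq = rq_of(cfs_rq);
	u64 buddy_slice = rq->clock - cfs_rq->pair_start;

	/* buddy window used up: take the fair pick and restart the window */
	if (buddy_slice > sysctl_sched_buddy_slice)
		goto out;

	/* still inside the window: prefer next/last buddies as before */
	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
		return cfs_rq->next;

	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
		return cfs_rq->last;

out:
	cfs_rq->pair_start = rq->clock;

	return se;
}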
On Sun, 2009-01-18 at 15:08 +0100, Mike Galbraith wrote:
> On Sat, 2009-01-17 at 13:00 +0100, Peter Zijlstra wrote:
> > On Sat, 2009-01-17 at 11:34 +0100, Mike Galbraith wrote:
> > > > Right, how about we flip the 'initial' case in place_entity() for !
> > > > nr_exclusive wakeups.
> > >
> > > Wouldn't that be more drastic than sleep denial?
> >
> > Strictly speaking that DEBIT thing is valid for each wakeup, IIRC we
> > restricted it to clone() only because that was where we could actually
> > observe these latency spikes using a fork-bomb.
> >
> > This reduces the latency hits to around 400ms, which is about right for
> > the given load.
>
> Disregarding the startup landmine for the moment, maybe we should put a
> buddy slice knob in the user's hands, so they can tune latency, along
> with a full on/off switch for those who care not one whit about
> scalability.
I'm not particularly fond of that idea because it only helps for this
particular workload where any one task isn't actually running for very
long.
If however your workload consists of cpu hogs, each will run for the
full wakeup preemption 'slice' you now see these buddy pairs do.
Also, buddies really work for throughput, so I don't particularly fancy
weakening them below what a normal cpu-hog can already do.
On Sun, 2009-01-18 at 16:28 +0100, Peter Zijlstra wrote:
> On Sun, 2009-01-18 at 15:08 +0100, Mike Galbraith wrote:
> > On Sat, 2009-01-17 at 13:00 +0100, Peter Zijlstra wrote:
> > > On Sat, 2009-01-17 at 11:34 +0100, Mike Galbraith wrote:
> > > > > Right, how about we flip the 'initial' case in place_entity() for !
> > > > > nr_exclusive wakeups.
> > > >
> > > > Wouldn't that be more drastic than sleep denial?
> > >
> > > Strictly speaking that DEBIT thing is valid for each wakeup, IIRC we
> > > restricted it to clone() only because that was where we could actually
> > > observe these latency spikes using a fork-bomb.
> > >
> > > This reduces the latency hits to around 400ms, which is about right for
> > > the given load.
> >
> > Disregarding the startup landmine for the moment, maybe we should put a
> > buddy slice knob in the user's hands, so they can tune latency, along
> > with a full on/off switch for those who care not one whit about
> > scalability.
>
> I'm not particularly fond of that idea because it only helps for this
> particular workload where any one task isn't actually running for very
> long.
Well, there are a lot of buddy loads where runtime is very short. You
can achieve ~similar latency control, though not as fine-grained, by
twiddling wakeup latency. At least you can prevent the triple-digit
latencies with 12 (and a half) pairs per core. I can imagine server
loads not liking 600-700ms latencies. I was just thinking that with two
knobs, you can keep the benefits of relatively high wakeup granularity,
yet have a user-tweakable throughput vs latency adjustment for buddies
to complement wakeup latency.
> If however your workload consists of cpu hogs, each will run for the
> full wakeup preemption 'slice' you now see these buddy pairs do.
Hm. I had a "whack the buddy tags at tick time if you are one" change
in there, but removed it pending measurement. I was wondering if a
last-buddy hog could end up getting the CPU back after having received
its quanta and being resched, but I haven't checked that yet.
> Also, buddies really work for throughput, so I don't particularly fancy
> weakening them below what a normal cpu-hog can already do.
Yeah, I was thinking of the users for whom latency rules, a very select
group. If someone has a really latency-sensitive load, they can turn
LAST_BUDDY off, but that doesn't help with the latency that NEXT_BUDDY
causes. Almost everyone really, really wants buddies, but not all.
Buddies are wonderful for cache, wonderful for throughput, but are not
wonderful for latency, ergo the thought that maybe we should offer a
full off switch for those who know the consequences.
It was just a thought.
-Mike
On Sun, 2009-01-18 at 19:54 +0100, Mike Galbraith wrote:
> On Sun, 2009-01-18 at 16:28 +0100, Peter Zijlstra wrote:
> >
> > If however your workload consists of cpu hogs, each will run for the
> > full wakeup preemption 'slice' you now see these buddy pairs do.
>
> Hm. I had a "whack the buddy tags at tick time if you are one" change
> in there, but removed it pending measurement. I was wondering if a
> last-buddy hog could end up getting the CPU back after having received
> its quanta and being resched, but I haven't checked that yet.
Dunno if this really needs fixing, but it does happen, and frequently.
Buddies can be selected over waiting tasks despite having just received their
full slice and more. Fix this by clearing the buddy tag in put_prev_entity()
or check_preempt_tick() if they've received their fair share.
Clear buddy status once a task has received its fair share.
Signed-off-by: Mike Galbraith <[email protected]>
---
kernel/sched_fair.c | 33 ++++++++++++++++++++++++++++++++-
1 file changed, 32 insertions(+), 1 deletion(-)
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -768,8 +768,10 @@ check_preempt_tick(struct cfs_rq *cfs_rq
ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
- if (delta_exec > ideal_runtime)
+ if (delta_exec >= ideal_runtime) {
+ clear_buddies(cfs_rq, curr);
resched_task(rq_of(cfs_rq)->curr);
+ }
}
static void
@@ -818,6 +820,33 @@ static struct sched_entity *pick_next_en
return se;
}
+static void cond_clear_buddy(struct cfs_rq *cfs_rq, struct sched_entity *prev)
+{
+ s64 delta_exec = prev->sum_exec_runtime;
+ u64 min = sysctl_sched_min_granularity;
+
+ /*
+ * We need to clear buddy status if the previous task has received its
+ * fair share, but we don't want to increase overhead significantly for
+ * fast/light tasks by calling sched_slice() too frequently.
+ */
+ if (unlikely(prev->load.weight != NICE_0_LOAD)) {
+ struct load_weight load;
+
+ load.weight = prio_to_weight[NICE_TO_PRIO(0) - MAX_RT_PRIO];
+ load.inv_weight = prio_to_wmult[NICE_TO_PRIO(0) - MAX_RT_PRIO];
+ min = calc_delta_mine(min, prev->load.weight, &load);
+ }
+
+ delta_exec -= prev->prev_sum_exec_runtime;
+
+ if (delta_exec > min) {
+ delta_exec -= sched_slice(cfs_rq, prev);
+ if (delta_exec >= 0)
+ clear_buddies(cfs_rq, prev);
+ }
+}
+
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
/*
@@ -829,6 +858,8 @@ static void put_prev_entity(struct cfs_r
check_spread(cfs_rq, prev);
if (prev->on_rq) {
+ if (prev == cfs_rq->next || prev == cfs_rq->last)
+ cond_clear_buddy(cfs_rq, prev);
update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
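A side note on the rescaling in cond_clear_buddy() above:
sysctl_sched_min_granularity is expressed at nice-0 weight (1024), so
calc_delta_mine(min, prev->load.weight, &load) scales the quick-check
threshold by prev->load.weight / 1024. A hypothetical userspace toy (not
part of the patch; weights taken from the prio_to_weight[] table) just
to show the resulting thresholds:

#include <stdio.h>

int main(void)
{
	/* prio_to_weight[nice + 20] values from kernel/sched.c */
	const struct { int nice; double weight; } tab[] = {
		{ -5, 3121 },
		{  0, 1024 },	/* NICE_0_LOAD */
		{  5,  335 },
	};
	const double min_gran_ms = 4.0;	/* sysctl_sched_min_granularity default */
	int i;

	/* threshold = min_granularity * prev->load.weight / NICE_0_LOAD */
	for (i = 0; i < 3; i++)
		printf("nice %+d: quick-check threshold ~%.2f ms\n",
		       tab[i].nice, min_gran_ms * tab[i].weight / 1024.0);
	return 0;
}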