Remove degenerate scheduler domains during the sched-domain init.
For example, on x86_64 we always have NUMA configured in. On Intel EM64T
systems the topmost sched domain will then be a NUMA domain containing only
one sched_group.
With fork/exec balancing (Nick's recent fixes in the -mm tree), we always end
up taking wrong decisions because of this topmost domain: as it contains only
one group, find_idlest_group always returns NULL. We end up loading the HT
package completely first, and only active load balancing later kicks in to
correct it.
In general, this patch also makes sense without Nick's recent fixes in -mm.
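For reference, a much-simplified sketch (not the mainline code; group_load()
is a hypothetical stand-in for the real load calculation) of why a
single-group domain defeats the balancing: the only group in the domain is
the local one, so find_idlest_group() never finds a better group and returns
NULL.

static struct sched_group *
find_idlest_group_sketch(struct sched_domain *sd, task_t *p, int this_cpu)
{
	struct sched_group *idlest = NULL, *group = sd->groups;
	unsigned long min_load = ULONG_MAX, this_load = 0;

	do {
		/* group_load() is a hypothetical stand-in for the real calculation */
		unsigned long load = group_load(group);

		if (cpu_isset(this_cpu, group->cpumask)) {
			this_load = load;
		} else if (load < min_load) {
			min_load = load;
			idlest = group;
		}
		group = group->next;
	} while (group != sd->groups);

	/* With a single group, idlest stays NULL and the caller skips balancing */
	if (!idlest || this_load <= min_load)
		return NULL;
	return idlest;
}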
Signed-off-by: Suresh Siddha <[email protected]>
Modified by Nick Piggin to account for more than just sched_groups when
scanning for degenerate domains, and to allow a runqueue's sd to go NULL,
which required small changes to the smtnice code.
Signed-off-by: Nick Piggin <[email protected]>
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c 2005-04-05 16:38:21.000000000 +1000
+++ linux-2.6/kernel/sched.c 2005-04-05 18:39:09.000000000 +1000
@@ -2583,11 +2583,15 @@ out:
#ifdef CONFIG_SCHED_SMT
static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
{
- struct sched_domain *sd = this_rq->sd;
+ struct sched_domain *tmp, *sd = NULL;
cpumask_t sibling_map;
int i;
+
+ for_each_domain(this_cpu, tmp)
+ if (tmp->flags & SD_SHARE_CPUPOWER)
+ sd = tmp;
- if (!(sd->flags & SD_SHARE_CPUPOWER))
+ if (!sd)
return;
/*
@@ -2628,13 +2632,17 @@ static inline void wake_sleeping_depende
static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
{
- struct sched_domain *sd = this_rq->sd;
+ struct sched_domain *tmp, *sd = NULL;
cpumask_t sibling_map;
prio_array_t *array;
int ret = 0, i;
task_t *p;
- if (!(sd->flags & SD_SHARE_CPUPOWER))
+ for_each_domain(this_cpu, tmp)
+ if (tmp->flags & SD_SHARE_CPUPOWER)
+ sd = tmp;
+
+ if (!sd)
return 0;
/*
@@ -4604,6 +4612,11 @@ static void sched_domain_debug(struct sc
{
int level = 0;
+ if (!sd) {
+ printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+ return;
+ }
+
printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
do {
@@ -4809,6 +4822,50 @@ static void init_sched_domain_sysctl(voi
}
#endif
+static int __devinit sd_degenerate(struct sched_domain *sd)
+{
+ if (cpus_weight(sd->span) == 1)
+ return 1;
+
+ /* Following flags need at least 2 groups */
+ if (sd->flags & (SD_LOAD_BALANCE |
+ SD_BALANCE_NEWIDLE |
+ SD_BALANCE_FORK |
+ SD_BALANCE_EXEC)) {
+ if (sd->groups != sd->groups->next)
+ return 0;
+ }
+
+ /* Following flags don't use groups */
+ if (sd->flags & (SD_WAKE_IDLE |
+ SD_WAKE_AFFINE |
+ SD_WAKE_BALANCE))
+ return 0;
+
+ return 1;
+}
+
+static int __devinit sd_parent_degenerate(struct sched_domain *sd,
+ struct sched_domain *parent)
+{
+ unsigned long cflags = sd->flags, pflags = parent->flags;
+
+ if (sd_degenerate(parent))
+ return 1;
+
+ if (!cpus_equal(sd->span, parent->span))
+ return 0;
+
+ /* Does parent contain flags not in child? */
+ /* WAKE_BALANCE is a subset of WAKE_AFFINE */
+ if (cflags & SD_WAKE_AFFINE)
+ pflags &= ~SD_WAKE_BALANCE;
+ if (~cflags & pflags)
+ return 0;
+
+ return 1;
+}
+
/*
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
* hold the hotplug lock.
@@ -4819,6 +4876,19 @@ void __devinit cpu_attach_domain(struct
unsigned long flags;
runqueue_t *rq = cpu_rq(cpu);
int local = 1;
+ struct sched_domain *tmp;
+
+ /* Remove the sched domains which do not contribute to scheduling. */
+ for (tmp = sd; tmp; tmp = tmp->parent) {
+ struct sched_domain *parent = tmp->parent;
+ if (!parent)
+ break;
+ if (sd_parent_degenerate(tmp, parent))
+ tmp->parent = parent->parent;
+ }
+
+ if (sd_degenerate(sd))
+ sd = sd->parent;
sched_domain_debug(sd, cpu);
The previous patch fixed the last 2 places that directly access a
runqueue's sched-domain and assume it cannot be NULL.
We can now use a NULL domain instead of a dummy domain to signify
no balancing is to happen. No functional changes.
Signed-off-by: Nick Piggin <[email protected]>
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c 2005-04-05 16:38:40.000000000 +1000
+++ linux-2.6/kernel/sched.c 2005-04-05 18:39:08.000000000 +1000
@@ -4887,7 +4887,7 @@ void __devinit cpu_attach_domain(struct
tmp->parent = parent->parent;
}
- if (sd_degenerate(sd))
+ if (sd && sd_degenerate(sd))
sd = sd->parent;
sched_domain_debug(sd, cpu);
@@ -5054,7 +5054,7 @@ static void __devinit arch_init_sched_do
cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
/*
- * Set up domains. Isolated domains just stay on the dummy domain.
+ * Set up domains. Isolated domains just stay on the NULL domain.
*/
for_each_cpu_mask(i, cpu_default_map) {
int group;
@@ -5167,18 +5167,11 @@ static void __devinit arch_destroy_sched
#endif /* ARCH_HAS_SCHED_DOMAIN */
-/*
- * Initial dummy domain for early boot and for hotplug cpu. Being static,
- * it is initialized to zero, so all balancing flags are cleared which is
- * what we want.
- */
-static struct sched_domain sched_domain_dummy;
-
#ifdef CONFIG_HOTPLUG_CPU
/*
* Force a reinitialization of the sched domains hierarchy. The domains
* and groups cannot be updated in place without racing with the balancing
- * code, so we temporarily attach all running cpus to a "dummy" domain
+ * code, so we temporarily attach all running cpus to the NULL domain
* which will prevent rebalancing while the sched domains are recalculated.
*/
static int update_sched_domains(struct notifier_block *nfb,
@@ -5190,7 +5183,7 @@ static int update_sched_domains(struct n
case CPU_UP_PREPARE:
case CPU_DOWN_PREPARE:
for_each_online_cpu(i)
- cpu_attach_domain(&sched_domain_dummy, i);
+ cpu_attach_domain(NULL, i);
arch_destroy_sched_domains();
return NOTIFY_OK;
@@ -5253,7 +5246,7 @@ void __init sched_init(void)
rq->best_expired_prio = MAX_PRIO;
#ifdef CONFIG_SMP
- rq->sd = &sched_domain_dummy;
+ rq->sd = NULL;
for (j = 1; j < 3; j++)
rq->cpu_load[j] = 0;
rq->active_balance = 0;
The fundamental problem that Suresh has with balance on exec and fork
is that it only tries to balance the top level domain with the flag
set.
This was worked around by removing degenerate domains, but is still a
problem if people want to start using more complex sched-domains, especially
multilevel NUMA that ia64 is already using.
This patch makes balance on fork and exec try balancing over not just the
top most domain with the flag set, but all the way down the domain tree.
Signed-off-by: Nick Piggin <[email protected]>
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c 2005-04-05 16:38:53.000000000 +1000
+++ linux-2.6/kernel/sched.c 2005-04-05 18:39:07.000000000 +1000
@@ -1320,21 +1320,24 @@ void fastcall wake_up_new_task(task_t *
sd = tmp;
if (sd) {
+ cpumask_t span;
int new_cpu;
struct sched_group *group;
+again:
schedstat_inc(sd, sbf_cnt);
+ span = sd->span;
cpu = task_cpu(p);
group = find_idlest_group(sd, p, cpu);
if (!group) {
schedstat_inc(sd, sbf_balanced);
- goto no_forkbalance;
+ goto nextlevel;
}
new_cpu = find_idlest_cpu(group, cpu);
if (new_cpu == -1 || new_cpu == cpu) {
schedstat_inc(sd, sbf_balanced);
- goto no_forkbalance;
+ goto nextlevel;
}
if (cpu_isset(new_cpu, p->cpus_allowed)) {
@@ -1344,9 +1347,21 @@ void fastcall wake_up_new_task(task_t *
rq = task_rq_lock(p, &flags);
cpu = task_cpu(p);
}
+
+ /* Now try balancing at a lower domain level */
+nextlevel:
+ sd = NULL;
+ for_each_domain(cpu, tmp) {
+ if (cpus_subset(span, tmp->span))
+ break;
+ if (tmp->flags & SD_BALANCE_FORK)
+ sd = tmp;
+ }
+
+ if (sd)
+ goto again;
}
-no_forkbalance:
#endif
/*
* We decrease the sleep average of forking parents
@@ -1712,25 +1727,41 @@ void sched_exec(void)
sd = tmp;
if (sd) {
+ cpumask_t span;
struct sched_group *group;
+again:
schedstat_inc(sd, sbe_cnt);
+ span = sd->span;
group = find_idlest_group(sd, current, this_cpu);
if (!group) {
schedstat_inc(sd, sbe_balanced);
- goto out;
+ goto nextlevel;
}
new_cpu = find_idlest_cpu(group, this_cpu);
if (new_cpu == -1 || new_cpu == this_cpu) {
schedstat_inc(sd, sbe_balanced);
- goto out;
+ goto nextlevel;
}
schedstat_inc(sd, sbe_pushed);
put_cpu();
sched_migrate_task(current, new_cpu);
- return;
+
+ /* Now try balancing at a lower domain level */
+ this_cpu = get_cpu();
+nextlevel:
+ sd = NULL;
+ for_each_domain(this_cpu, tmp) {
+ if (cpus_subset(span, tmp->span))
+ break;
+ if (tmp->flags & SD_BALANCE_EXEC)
+ sd = tmp;
+ }
+
+ if (sd)
+ goto again;
}
-out:
+
put_cpu();
}
One of the problems with the multilevel balance-on-fork/exec is that it
needs to jump through hoops to satisfy sched-domain's locking semantics
(that is, you may traverse your own domain when not preemptable, and
you may traverse others' domains when holding their runqueue lock).
balance-on-exec had to potentially migrate between more than one CPU before
finding a final CPU to migrate to, and balance-on-fork needed to potentially
take multiple runqueue locks.
So bite the bullet and make sched-domains go completely RCU. This actually
simplifies the code quite a bit.
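In concrete terms the rule becomes: walk the domain tree only with preemption
disabled, and on update detach the tree first and wait for all walkers before
tearing it down. A rough sketch of the pattern (illustrative only; the
example_* functions are not part of the patch):

/* Read side: a domain walk is effectively an RCU read-side critical section */
static void example_domain_walk(int cpu)
{
	struct sched_domain *sd;

	preempt_disable();
	for_each_domain(cpu, sd) {
		/* inspect sd here, but never sleep or enable preemption */
	}
	preempt_enable();
}

/* Update side: detach every runqueue, wait for walkers, then tear down */
static void example_domain_update(void)
{
	int i;

	for_each_online_cpu(i)
		cpu_attach_domain(NULL, i);
	synchronize_kernel();	/* all in-flight domain walks have finished */
	arch_destroy_sched_domains();
}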
Signed-off-by: Nick Piggin <[email protected]>
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c 2005-04-05 16:39:14.000000000 +1000
+++ linux-2.6/kernel/sched.c 2005-04-05 18:39:05.000000000 +1000
@@ -825,22 +825,12 @@ inline int task_curr(const task_t *p)
}
#ifdef CONFIG_SMP
-enum request_type {
- REQ_MOVE_TASK,
- REQ_SET_DOMAIN,
-};
-
typedef struct {
struct list_head list;
- enum request_type type;
- /* For REQ_MOVE_TASK */
task_t *task;
int dest_cpu;
- /* For REQ_SET_DOMAIN */
- struct sched_domain *sd;
-
struct completion done;
} migration_req_t;
@@ -862,7 +852,6 @@ static int migrate_task(task_t *p, int d
}
init_completion(&req->done);
- req->type = REQ_MOVE_TASK;
req->task = p;
req->dest_cpu = dest_cpu;
list_add(&req->list, &rq->migration_queue);
@@ -4365,17 +4354,9 @@ static int migration_thread(void * data)
req = list_entry(head->next, migration_req_t, list);
list_del_init(head->next);
- if (req->type == REQ_MOVE_TASK) {
- spin_unlock(&rq->lock);
- __migrate_task(req->task, cpu, req->dest_cpu);
- local_irq_enable();
- } else if (req->type == REQ_SET_DOMAIN) {
- rq->sd = req->sd;
- spin_unlock_irq(&rq->lock);
- } else {
- spin_unlock_irq(&rq->lock);
- WARN_ON(1);
- }
+ spin_unlock(&rq->lock);
+ __migrate_task(req->task, cpu, req->dest_cpu);
+ local_irq_enable();
complete(&req->done);
}
@@ -4606,7 +4587,6 @@ static int migration_call(struct notifie
migration_req_t *req;
req = list_entry(rq->migration_queue.next,
migration_req_t, list);
- BUG_ON(req->type != REQ_MOVE_TASK);
list_del_init(&req->list);
complete(&req->done);
}
@@ -4903,10 +4883,7 @@ static int __devinit sd_parent_degenerat
*/
void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
{
- migration_req_t req;
- unsigned long flags;
runqueue_t *rq = cpu_rq(cpu);
- int local = 1;
struct sched_domain *tmp;
/* Remove the sched domains which do not contribute to scheduling. */
@@ -4923,24 +4900,7 @@ void __devinit cpu_attach_domain(struct
sched_domain_debug(sd, cpu);
- spin_lock_irqsave(&rq->lock, flags);
-
- if (cpu == smp_processor_id() || !cpu_online(cpu)) {
- rq->sd = sd;
- } else {
- init_completion(&req.done);
- req.type = REQ_SET_DOMAIN;
- req.sd = sd;
- list_add(&req.list, &rq->migration_queue);
- local = 0;
- }
-
- spin_unlock_irqrestore(&rq->lock, flags);
-
- if (!local) {
- wake_up_process(rq->migration_thread);
- wait_for_completion(&req.done);
- }
+ rq->sd = sd;
}
/* cpus with isolated domains */
@@ -5215,6 +5175,7 @@ static int update_sched_domains(struct n
case CPU_DOWN_PREPARE:
for_each_online_cpu(i)
cpu_attach_domain(NULL, i);
+ synchronize_kernel();
arch_destroy_sched_domains();
return NOTIFY_OK;
Consolidate balance-on-exec with balance-on-fork. This is made easy
by the sched-domains RCU patches.
As well as the general goodness of code reduction, this allows
the runqueues to be unlocked during balance-on-fork.
schedstats is a problem. Maybe just have balance-on-event instead
of distinguishing fork and exec?
Signed-off-by: Nick Piggin <[email protected]>
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c 2005-04-05 18:39:14.000000000 +1000
+++ linux-2.6/kernel/sched.c 2005-04-05 18:40:18.000000000 +1000
@@ -1013,8 +1013,57 @@ static int find_idlest_cpu(struct sched_
return idlest;
}
+/*
+ * sched_balance_self: balance the current task (running on cpu) in domains
+ * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
+ * SD_BALANCE_EXEC.
+ *
+ * Balance, ie. select the least loaded group.
+ *
+ * Returns the target CPU number, or the same CPU if no balancing is needed.
+ *
+ * preempt must be disabled.
+ */
+static int sched_balance_self(int cpu, int flag)
+{
+ struct task_struct *t = current;
+ struct sched_domain *tmp, *sd = NULL;
-#endif
+ for_each_domain(cpu, tmp)
+ if (tmp->flags & flag)
+ sd = tmp;
+
+ while (sd) {
+ cpumask_t span;
+ struct sched_group *group;
+ int new_cpu;
+
+ span = sd->span;
+ group = find_idlest_group(sd, t, cpu);
+ if (!group)
+ goto nextlevel;
+
+ new_cpu = find_idlest_cpu(group, cpu);
+ if (new_cpu == -1 || new_cpu == cpu)
+ goto nextlevel;
+
+ /* Now try balancing at a lower domain level */
+ cpu = new_cpu;
+nextlevel:
+ sd = NULL;
+ for_each_domain(cpu, tmp) {
+ if (cpus_subset(span, tmp->span))
+ break;
+ if (tmp->flags & flag)
+ sd = tmp;
+ }
+ /* while loop will break here if sd == NULL */
+ }
+
+ return cpu;
+}
+
+#endif /* CONFIG_SMP */
/*
* wake_idle() will wake a task on an idle cpu if task->cpu is
@@ -1295,63 +1344,22 @@ void fastcall wake_up_new_task(task_t *
int this_cpu, cpu;
runqueue_t *rq, *this_rq;
#ifdef CONFIG_SMP
- struct sched_domain *tmp, *sd = NULL;
-#endif
+ int new_cpu;
+ cpu = task_cpu(p);
+ preempt_disable();
+ new_cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
+ preempt_enable();
+ if (new_cpu != cpu)
+ set_task_cpu(p, new_cpu);
+#endif
+
+ cpu = task_cpu(p);
rq = task_rq_lock(p, &flags);
- BUG_ON(p->state != TASK_RUNNING);
this_cpu = smp_processor_id();
- cpu = task_cpu(p);
-
-#ifdef CONFIG_SMP
- for_each_domain(cpu, tmp)
- if (tmp->flags & SD_BALANCE_FORK)
- sd = tmp;
-
- if (sd) {
- cpumask_t span;
- int new_cpu;
- struct sched_group *group;
-
-again:
- schedstat_inc(sd, sbf_cnt);
- span = sd->span;
- cpu = task_cpu(p);
- group = find_idlest_group(sd, p, cpu);
- if (!group) {
- schedstat_inc(sd, sbf_balanced);
- goto nextlevel;
- }
- new_cpu = find_idlest_cpu(group, cpu);
- if (new_cpu == -1 || new_cpu == cpu) {
- schedstat_inc(sd, sbf_balanced);
- goto nextlevel;
- }
-
- if (cpu_isset(new_cpu, p->cpus_allowed)) {
- schedstat_inc(sd, sbf_pushed);
- set_task_cpu(p, new_cpu);
- task_rq_unlock(rq, &flags);
- rq = task_rq_lock(p, &flags);
- cpu = task_cpu(p);
- }
-
- /* Now try balancing at a lower domain level */
-nextlevel:
- sd = NULL;
- for_each_domain(cpu, tmp) {
- if (cpus_subset(span, tmp->span))
- break;
- if (tmp->flags & SD_BALANCE_FORK)
- sd = tmp;
- }
-
- if (sd)
- goto again;
- }
+ BUG_ON(p->state != TASK_RUNNING);
-#endif
/*
* We decrease the sleep average of forking parents
* and children as well, to keep max-interactive tasks
@@ -1699,59 +1707,17 @@ out:
task_rq_unlock(rq, &flags);
}
-/*
- * sched_exec(): find the highest-level, exec-balance-capable
- * domain and try to migrate the task to the least loaded CPU.
- *
- * execve() is a valuable balancing opportunity, because at this point
- * the task has the smallest effective memory and cache footprint.
+/*
+ * sched_exec - execve() is a valuable balancing opportunity, because at
+ * this point the task has the smallest effective memory and cache footprint.
*/
void sched_exec(void)
{
- struct sched_domain *tmp, *sd = NULL;
int new_cpu, this_cpu = get_cpu();
-
- for_each_domain(this_cpu, tmp)
- if (tmp->flags & SD_BALANCE_EXEC)
- sd = tmp;
-
- if (sd) {
- cpumask_t span;
- struct sched_group *group;
-again:
- schedstat_inc(sd, sbe_cnt);
- span = sd->span;
- group = find_idlest_group(sd, current, this_cpu);
- if (!group) {
- schedstat_inc(sd, sbe_balanced);
- goto nextlevel;
- }
- new_cpu = find_idlest_cpu(group, this_cpu);
- if (new_cpu == -1 || new_cpu == this_cpu) {
- schedstat_inc(sd, sbe_balanced);
- goto nextlevel;
- }
-
- schedstat_inc(sd, sbe_pushed);
- put_cpu();
- sched_migrate_task(current, new_cpu);
-
- /* Now try balancing at a lower domain level */
- this_cpu = get_cpu();
-nextlevel:
- sd = NULL;
- for_each_domain(this_cpu, tmp) {
- if (cpus_subset(span, tmp->span))
- break;
- if (tmp->flags & SD_BALANCE_EXEC)
- sd = tmp;
- }
-
- if (sd)
- goto again;
- }
-
+ new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
put_cpu();
+ if (new_cpu != this_cpu)
+ sched_migrate_task(current, new_cpu);
}
/*
* Nick Piggin <[email protected]> wrote:
> This is Suresh's patch with some modifications.
> Remove degenerate scheduler domains during the sched-domain init.
actually, i'd suggest to not do this patch. The point of booting with a
CONFIG_NUMA kernel on a non-NUMA box is mostly for testing, and the
'degenerate' toplevel domain exposed conceptual bugs in the
sched-domains code. In that sense removing such 'unnecessary' domains
inhibits debuggability to a certain degree. If we had this patch earlier
we'd not have experienced the wrong decisions taken by the scheduler,
only on the much rarer 'really NUMA' boxes.
is there any case where we'd want to simplify the domain tree? One more
domain level is just one (and very minor) aspect of CONFIG_NUMA - i'd
not want to run a CONFIG_NUMA kernel on a non-NUMA box, even if the
domain tree got optimized. Hm?
Ingo
* Ingo Molnar <[email protected]> wrote:
>
> * Nick Piggin <[email protected]> wrote:
>
> > 2/5
>
> > The previous patch fixed the last 2 places that directly access a
> > runqueue's sched-domain and assume it cannot be NULL.
> >
> > We can now use a NULL domain instead of a dummy domain to signify
> > no balancing is to happen. No functional changes.
> >
> > Signed-off-by: Nick Piggin <[email protected]>
>
> Acked-by: Ingo Molnar <[email protected]>
if the previous 'remove degenerate domains' patch would go away then
this patch needs to be merged/modified. (and most of the others as well)
Ingo
* Nick Piggin <[email protected]> wrote:
> 2/5
> The previous patch fixed the last 2 places that directly access a
> runqueue's sched-domain and assume it cannot be NULL.
>
> We can now use a NULL domain instead of a dummy domain to signify
> no balancing is to happen. No functional changes.
>
> Signed-off-by: Nick Piggin <[email protected]>
Acked-by: Ingo Molnar <[email protected]>
Ingo
* Nick Piggin <[email protected]> wrote:
> 3/5
> The fundamental problem that Suresh has with balance on exec and fork
> is that it only tries to balance the top level domain with the flag
> set.
>
> This was worked around by removing degenerate domains, but is still a
> problem if people want to start using more complex sched-domains, especially
> multilevel NUMA that ia64 is already using.
>
> This patch makes balance on fork and exec try balancing over not just the
> top most domain with the flag set, but all the way down the domain tree.
>
> Signed-off-by: Nick Piggin <[email protected]>
Acked-by: Ingo Molnar <[email protected]>
note that no matter how much scheduler logic, in the end
cross-scheduling of tasks between nodes on NUMA will always have a
permanent penalty (i.e. the 'migration cost' is 'infinity' in the long
run), so the primary focus _has to be_ on 'get it right initially'. Tasks
spilling over to other nodes will always remain a special case.
So balance-on-fork/exec/[clone] definitely needs to be aware of the full
domain tree picture.
Ingo
* Nick Piggin <[email protected]> wrote:
> 4/5
> One of the problems with the multilevel balance-on-fork/exec is that
> it needs to jump through hoops to satisfy sched-domain's locking
> semantics (that is, you may traverse your own domain when not
> preemptable, and you may traverse others' domains when holding their
> runqueue lock).
>
> balance-on-exec had to potentially migrate between more than one CPU
> before finding a final CPU to migrate to, and balance-on-fork needed
> to potentially take multiple runqueue locks.
>
> So bite the bullet and make sched-domains go completely RCU. This
> actually simplifies the code quite a bit.
>
> Signed-off-by: Nick Piggin <[email protected]>
i like it conceptually, so:
Acked-by: Ingo Molnar <[email protected]>
from now on, all domain-tree readonly uses have to be rcu_read_lock()-ed
(or otherwise have to be in a non-preemptible section). But there's a
bug in show_schedstat() which does a for_each_domain() from within a
preemptible section. (It was a bug with the current hotplug logic too i
think.)
At a minimum i think we need the fix+comment below.
Ingo
Signed-off-by: Ingo Molnar <[email protected]>
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -260,6 +260,10 @@ struct runqueue {
static DEFINE_PER_CPU(struct runqueue, runqueues);
+/*
+ * The domain tree (rq->sd) is RCU locked. I.e. it may only be accessed
+ * from within an rcu_read_lock() [or otherwise preempt-disabled] section.
+ */
#define for_each_domain(cpu, domain) \
for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
@@ -338,6 +342,7 @@ static int show_schedstat(struct seq_fil
#ifdef CONFIG_SMP
/* domain-specific stats */
+ rcu_read_lock();
for_each_domain(cpu, sd) {
enum idle_type itype;
char mask_str[NR_CPUS];
@@ -361,6 +366,7 @@ static int show_schedstat(struct seq_fil
sd->sbe_pushed, sd->sbe_attempts,
sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
}
+ rcu_read_unlock();
#endif
}
return 0;
* Nick Piggin <[email protected]> wrote:
> 5/5
>
> Any ideas about what to do with schedstats?
Do we really need balance on exec and fork as separate
> statistics?
> Consolidate balance-on-exec with balance-on-fork. This is made easy
> by the sched-domains RCU patches.
>
> As well as the general goodness of code reduction, this allows
> the runqueues to be unlocked during balance-on-fork.
>
> schedstats is a problem. Maybe just have balance-on-event instead
> of distinguishing fork and exec?
>
> Signed-off-by: Nick Piggin <[email protected]>
looks good.
Acked-by: Ingo Molnar <[email protected]>
while the code is now consolidated, i think we still need the separate
fork/exec stats for schedstat. We have 3 conceptual ways to start off a
new context: fork(), pthread_create() and execve(), and applications use
them in different patterns. We have different flags and tuning
parameters for two of them (which might have to become 3, i'm not
entirely convinced we'll be able to ignore the 'process vs. thread'
condition in wake_up_new_task(); STREAM is a really bad example in that
sense), so we need 2 (or 3) separate stats.
Ingo
On Wed, Apr 06, 2005 at 07:44:12AM +0200, Ingo Molnar wrote:
>
> * Nick Piggin <[email protected]> wrote:
>
> > This is Suresh's patch with some modifications.
>
> > Remove degenerate scheduler domains during the sched-domain init.
>
> actually, i'd suggest to not do this patch. The point of booting with a
> CONFIG_NUMA kernel on a non-NUMA box is mostly for testing, and the
Not really. All of the x86_64 kernels are NUMA enabled and most Intel x86_64
systems today are non NUMA.
> 'degenerate' toplevel domain exposed conceptual bugs in the
> sched-domains code. In that sense removing such 'unnecessary' domains
> inhibits debuggability to a certain degree. If we had this patch earlier
> we'd not have experienced the wrong decisions taken by the scheduler,
> only on the much rarer 'really NUMA' boxes.
>
> is there any case where we'd want to simplify the domain tree? One more
> domain level is just one (and very minor) aspect of CONFIG_NUMA - i'd
> not want to run a CONFIG_NUMA kernel on a non-NUMA box, even if the
> domain tree got optimized. Hm?
>
Ingo, pardon me! Actually I used the NUMA domain as an excuse to push the
domain-degenerate patch.... As I mentioned earlier, we should remove the SMT
domain on a non-HT capable system.
Similarly I am working on adding a new core domain for dual-core systems!
All these domains are unnecessary and cause performance issues on
non Multi-threading/Multi-core capable cpus! Agreed that performance
impact will be minor but still...
thanks,
suresh
* Siddha, Suresh B <[email protected]> wrote:
> Similarly I am working on adding a new core domain for dual-core
> systems! All these domains are unnecessary and cause performance
> issues on non Multi-threading/Multi-core capable cpus! Agreed that
> performance impact will be minor but still...
ok, lets keep it then. It may in fact simplify the domain setup code: we
could generate the 'most generic' layout for a given arch all the time,
and then optimize it automatically. I.e. in theory we could have just a
single domain-setup routine, which would e.g. generate the NUMA domains
on SMP too, which would then be optimized away.
Ingo
Ingo Molnar wrote:
> * Nick Piggin <[email protected]> wrote:
>
>
>>This is Suresh's patch with some modifications.
>
>
>>Remove degenerate scheduler domains during the sched-domain init.
>
>
> actually, i'd suggest to not do this patch. The point of booting with a
> CONFIG_NUMA kernel on a non-NUMA box is mostly for testing, and the
> 'degenerate' toplevel domain exposed conceptual bugs in the
> sched-domains code. In that sense removing such 'unnecessary' domains
> inhibits debuggability to a certain degree. If we had this patch earlier
> we'd not have experienced the wrong decisions taken by the scheduler,
> only on the much rarer 'really NUMA' boxes.
>
True. Although I'd imagine it may be something distros may want.
For example, a generic x86-64 kernel for both AMD and Intel systems
could easily have SMT and NUMA turned on.
I agree with the downside of exercising fewer code paths though.
What about putting it in as a config option (default to off for 2.6) in
the embedded config menu?
> is there any case where we'd want to simplify the domain tree? One more
> domain level is just one (and very minor) aspect of CONFIG_NUMA - i'd
> not want to run a CONFIG_NUMA kernel on a non-NUMA box, even if the
> domain tree got optimized. Hm?
>
I guess there is the SMT issue too, and even booting an SMP kernel
on a UP system. Also small ia64 NUMA systems will probably have one
redundant NUMA level.
If/when topologies get more complex (for example, the recent Altix
discussions we had with Paul), it will be generally easier to set
up all levels in a generic way, then weed them out using something
like this, rather than put the logic in the domain setup code.
Nick
--
SUSE Labs, Novell Inc.
Ingo Molnar wrote:
> * Ingo Molnar <[email protected]> wrote:
>
>
>>* Nick Piggin <[email protected]> wrote:
>>
>>
>>>2/5
>>
>>>The previous patch fixed the last 2 places that directly access a
>>>runqueue's sched-domain and assume it cannot be NULL.
>>>
>>>We can now use a NULL domain instead of a dummy domain to signify
>>>no balancing is to happen. No functional changes.
>>>
>>>Signed-off-by: Nick Piggin <[email protected]>
>>
>>Acked-by: Ingo Molnar <[email protected]>
^^^
Thanks.
>
>
> if the previous 'remove degenerate domains' patch would go away then
> this patch needs to be merged/modified. (and most of the others as well)
>
I probably should respin this so it goes in *first* anyway, rather than
doing half in the remove-degenerate-domains patch and half here.
--
SUSE Labs, Novell Inc.
Ingo Molnar wrote:
> Acked-by: Ingo Molnar <[email protected]>
>
> note that no matter how much scheduler logic, in the end
> cross-scheduling of tasks between nodes on NUMA will always have a
> permanent penalty (i.e. the 'migration cost' is 'infinity' in the long
> run), so the primary focus _has to be_ on 'get it right initially'. Tasks
> spilling over to other nodes will always remain a special case.
> So balance-on-fork/exec/[clone] definitely needs to be aware of the full
> domain tree picture.
>
Yes, well put. I imagine this will only become more important as the push
towards multiprocessing machines grows, along with the need for higher
memory bandwidth and lower latency to CPUs.
--
SUSE Labs, Novell Inc.
Ingo Molnar wrote:
> * Nick Piggin <[email protected]> wrote:
>
>
>>4/5
>
>
>>One of the problems with the multilevel balance-on-fork/exec is that
>>it needs to jump through hoops to satisfy sched-domain's locking
>>semantics (that is, you may traverse your own domain when not
>>preemptable, and you may traverse others' domains when holding their
>>runqueue lock).
>>
>>balance-on-exec had to potentially migrate between more than one CPU
>>before finding a final CPU to migrate to, and balance-on-fork needed
>>to potentially take multiple runqueue locks.
>>
>>So bite the bullet and make sched-domains go completely RCU. This
>>actually simplifies the code quite a bit.
>>
>>Signed-off-by: Nick Piggin <[email protected]>
>
>
> i like it conceptually, so:
>
> Acked-by: Ingo Molnar <[email protected]>
>
Oh good, thanks.
> from now on, all domain-tree readonly uses have to be rcu_read_lock()-ed
> (or otherwise have to be in a non-preemptible section). But there's a
> bug in show_schedstat() which does a for_each_domain() from within a
> preemptible section. (It was a bug with the current hotplug logic too i
> think.)
>
Ah, thanks. That looks like a bug in the code with the locking
we have now too...
> At a minimum i think we need the fix+comment below.
>
Well if we say "this is actually RCU", then yes. And we should
probably change the preempt_{dis|en}ables in other places to
rcu_read_lock.
OTOH, if we say we just want all running threads to process through
a preemption stage, then this would just be a preempt_disable/enable
pair.
In practice that makes no difference yet, but it looks like you and
Paul are working to distinguish these two cases in the RCU code, to
accommodate your low latency RCU stuff?
I'd prefer the latter (ie. just disable preempt, and use
synchronize_sched), but I'm not too sure what is going on with your
low-latency RCU work...?
> Ingo
>
> Signed-off-by: Ingo Molnar <[email protected]>
>
Thanks for catching that. I may just push it through first as a fix
to the current 2.6 schedstats code (using preempt_disable), and
afterwards we can change it to rcu_read_lock if that is required.
--
SUSE Labs, Novell Inc.
Ingo Molnar wrote:
> * Nick Piggin <[email protected]> wrote:
>
>
>>5/5
>>
>>Any ideas about what to do with schedstats?
>>Do we really need balance on exec and fork as separate
>>statistics?
>
>
>>Consolidate balance-on-exec with balance-on-fork. This is made easy
>>by the sched-domains RCU patches.
>>
>>As well as the general goodness of code reduction, this allows
>>the runqueues to be unlocked during balance-on-fork.
>>
>>schedstats is a problem. Maybe just have balance-on-event instead
>>of distinguishing fork and exec?
>>
>>Signed-off-by: Nick Piggin <[email protected]>
>
>
> looks good.
>
One problem I just noticed, sorry. This is doing set_cpus_allowed
without holding the runqueue lock and without checking the hard
affinity mask either.
We could just do a set_cpus_allowed, or take the lock, set_cpus_allowed,
and take the new lock, but that's probably a bit heavy if we can avoid it.
In the interests of speed in this fast path, do you think we can do this
in sched_fork, before the task has even been put on the tasklist?
That would avoid all locking problems. Passing clone_flags into sched_fork
would not be a problem if we want to distinguish fork() and clone(CLONE_VM).
Yes? I'll cut a new patch to do just that.
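Something along these lines, perhaps (just a sketch of the idea, not the
actual patch):

void fastcall sched_fork(task_t *p, int clone_flags)
{
	/* get_cpu() disables preemption, which sched_balance_self() requires */
	int cpu = get_cpu();

#ifdef CONFIG_SMP
	/*
	 * The child is not yet on the tasklist and cannot run, so its
	 * initial CPU can be picked without any runqueue locking.
	 */
	cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
#endif
	set_task_cpu(p, cpu);

	/* ... the rest of the existing sched_fork() setup ... */

	put_cpu();
}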
> Acked-by: Ingo Molnar <[email protected]>
>
> while the code is now consolidated, i think we still need the separate
> fork/exec stats for schedstat.
This makes it a bit harder, then, to get good stats in the sched-domain
(which is really what we want). It would basically mean doing

	if (balance fork)
		schedstat_inc(sd, sbf_cnt);
	else if (balance exec)
		schedstat_inc(sd, sbe_cnt);
	etc.

That should all get optimised out by the compiler, but still a bit ugly.
Any ideas?
--
SUSE Labs, Novell Inc.
Ingo Molnar wrote:
> * Siddha, Suresh B <[email protected]> wrote:
>
>
>>Similarly I am working on adding a new core domain for dual-core
>>systems! All these domains are unnecessary and cause performance
>>issues on non Multi-threading/Multi-core capable cpus! Agreed that
>>performance impact will be minor but still...
>
>
> ok, lets keep it then. It may in fact simplify the domain setup code: we
> could generate the 'most generic' layout for a given arch all the time,
> and then optimize it automatically. I.e. in theory we could have just a
> single domain-setup routine, which would e.g. generate the NUMA domains
> on SMP too, which would then be optimized away.
>
Yep, exactly. Even so, Andrew: please ignore this patch series
and I'll redo it for you when we all agree on everything.
Thanks.
--
SUSE Labs, Novell Inc.
Nick Piggin wrote:
>
> One problem I just noticed, sorry. This is doing set_cpus_allowed
> without holding the runqueue lock and without checking the hard
> affinity mask either.
>
Err, that is to say set_task_cpu, not set_cpus_allowed.
--
SUSE Labs, Novell Inc.
* Nick Piggin <[email protected]> wrote:
> [...] Although I'd imagine it may be something distros may want. For
> example, a generic x86-64 kernel for both AMD and Intel systems could
> easily have SMT and NUMA turned on.
yes, that's true - in fact reducing the number of separate kernel
packages is of utmost importance to all distributions. (I'm not sure we
are there yet with CONFIG_NUMA, but small steps won't hurt.)
> I agree with the downside of exercising fewer code paths though.
if we make CONFIG_NUMA good enough on small boxes so that distributors
can turn it on, then in the long run the loss could be offset by the win
the extra QA gives.
> >is there any case where we'd want to simplify the domain tree? One more
> >domain level is just one (and very minor) aspect of CONFIG_NUMA - i'd
> >not want to run a CONFIG_NUMA kernel on a non-NUMA box, even if the
> >domain tree got optimized. Hm?
>
> I guess there is the SMT issue too, and even booting an SMP kernel on
> a UP system. Also small ia64 NUMA systems will probably have one
> redundant NUMA level.
i think most of the reasons for not running an SMP kernel on a UP box are
not due to scheduler overhead: the biggest cost is spinlock overhead. Someone
should try a little prototype: use the 'alternate instructions'
framework to patch out calls to spinlock functions to NOPs, and
benchmark the resulting kernel against UP. If it's "good enough",
distros will use it. Having just a single binary kernel RPM that
supports everything from NUMA through SMP to UP is the holy grail of
distros. (especially the ones that offer commercial support and
services.)
this is probably not possible on x86 - e.g. it would probably be
expensive (in terms of runtime cost) to make the PAE/non-PAE decision at
runtime (the distro boot kernel needs to be non-PAE). But for newer
arches like x64 it should be easier.
> If/when topologies get more complex (for example, the recent Altix
> discussions we had with Paul), it will be generally easier to set up
> all levels in a generic way, then weed them out using something like
> this, rather than put the logic in the domain setup code.
ok. That should also make it easier to put more of the arch domain setup
code into sched.c. E.g. i'm still uneasy about having so much
scheduler code in arch/ia64/kernel/domain.c, and all the ripple effects.
(the #ifdefs, include file impact, etc.)
Ingo
* Nick Piggin <[email protected]> wrote:
> > At a minimum i think we need the fix+comment below.
>
> Well if we say "this is actually RCU", then yes. And we should
> probably change the preempt_{dis|en}ables in other places to
> rcu_read_lock.
>
> OTOH, if we say we just want all running threads to process through a
> preemption stage, then this would just be a preempt_disable/enable
> pair.
>
> In practice that makes no difference yet, but it looks like you and
> Paul are working to distinguish these two cases in the RCU code, to
> accommodate your low latency RCU stuff?
it doesn't impact PREEMPT_RCU/PREEMPT_RT directly, because the scheduler
itself always needs to be non-preemptible.
those few places where we currently do preempt_disable(), which should
thus be rcu_read_lock(), are never in codepaths that can take a lot of
time.
but yes, in principle you are right, but in this particular (and
special) case it's not a big issue. We should document the RCU read-lock
dependencies cleanly and make all rcu-read-lock cases truly
rcu_read_lock(), but it's not a pressing issue even considering possible
future features like PREEMPT_RT.
the only danger in this area is to PREEMPT_RT: it is a bug on PREEMPT_RT
if kernel code has an implicit 'spinlock means preempt-off and thus
RCU-read-lock' assumption. Most of the time these get discovered via
PREEMPT_DEBUG. (preempt_disable() disables preemption on PREEMPT_RT too,
so that is not a problem either.)
Ingo
* Nick Piggin <[email protected]> wrote:
> Nick Piggin wrote:
>
> >
> >One problem I just noticed, sorry. This is doing set_cpus_allowed
> >without holding the runqueue lock and without checking the hard
> >affinity mask either.
> >
>
> Err, that is to say set_task_cpu, not set_cpus_allowed.
yes. The whole cpus_allowed+set_task_cpu() section in copy_process()
should move into sched_fork().
Ingo
* Nick Piggin <[email protected]> wrote:
> We could just do a set_cpus_allowed, or take the lock,
> set_cpus_allowed, and take the new lock, but that's probably a bit
> heavy if we can avoid it. In the interests of speed in this fast path,
> do you think we can do this in sched_fork, before the task has even
> been put on the tasklist?
yeah, that shouldn't be a problem. Technically we set cpus_allowed up
under the tasklist lock just to be non-preemptible and to copy the
parent's _current_ affinity to the child. But sched_fork() is called
just before that, and if the parent's affinity changed between the two
calls, so what? I'd move all of this code into sched_fork().
> That would avoid all locking problems. Passing clone_flags into
> sched_fork would not be a problem if we want to distinguish fork() and
> clone(CLONE_VM).
sure, that was the plan all along with sched_fork() anyway. (i think the
initial versions had the flag)
> Yes? I'll cut a new patch to do just that.
sure, fine by me.
Ingo
Ingo Molnar wrote:
> * Nick Piggin <[email protected]> wrote:
>
>
>>>At a minimum i think we need the fix+comment below.
>>
>>Well if we say "this is actually RCU", then yes. And we should
>>probably change the preempt_{dis|en}ables in other places to
>>rcu_read_lock.
>>
>>OTOH, if we say we just want all running threads to process through a
>>preemption stage, then this would just be a preempt_disable/enable
>>pair.
>>
>>In practice that makes no difference yet, but it looks like you and
>>Paul are working to distinguish these two cases in the RCU code, to
>>accommodate your low latency RCU stuff?
>
>
> it doesn't impact PREEMPT_RCU/PREEMPT_RT directly, because the scheduler
> itself always needs to be non-preemptible.
>
> those few places where we currently do preempt_disable(), which should
> thus be rcu_read_lock(), are never in codepaths that can take a lot of
> time.
>
> but yes, in principle you are right, but in this particular (and
> special) case it's not a big issue. We should document the RCU read-lock
> dependencies cleanly and make all rcu-read-lock cases truly
> rcu_read_lock(), but it's not a pressing issue even considering possible
> future features like PREEMPT_RT.
>
> the only danger in this area is to PREEMPT_RT: it is a bug on PREEMPT_RT
> if kernel code has an implicit 'spinlock means preempt-off and thus
> RCU-read-lock' assumption. Most of the time these get discovered via
> PREEMPT_DEBUG. (preempt_disable() disables preemption on PREEMPT_RT too,
> so that is not a problem either.)
>
OK thanks for the good explanation. So I'll keep it as is for now,
and whatever needs cleaning up later can be worked out as it comes
up.
--
SUSE Labs, Novell Inc.
On Thu, Apr 07, 2005 at 05:58:40PM +1000, Nick Piggin wrote:
> Ingo Molnar wrote:
> >* Nick Piggin <[email protected]> wrote:
> >
> >
> >>>At a minimum i think we need the fix+comment below.
> >>
> >>Well if we say "this is actually RCU", then yes. And we should
> >>probably change the preempt_{dis|en}ables in other places to
> >>rcu_read_lock.
> >>
> >>OTOH, if we say we just want all running threads to process through a
> >>preemption stage, then this would just be a preempt_disable/enable
> >>pair.
> >>
> >>In practice that makes no difference yet, but it looks like you and
> >>Paul are working to distinguish these two cases in the RCU code, to
> >>accommodate your low latency RCU stuff?
> >
> >
> >it doesn't impact PREEMPT_RCU/PREEMPT_RT directly, because the scheduler
> >itself always needs to be non-preemptible.
> >
> >those few places where we currently do preempt_disable(), which should
> >thus be rcu_read_lock(), are never in codepaths that can take a lot of
> >time.
> >
> >but yes, in principle you are right, but in this particular (and
> >special) case it's not a big issue. We should document the RCU read-lock
> >dependencies cleanly and make all rcu-read-lock cases truly
> >rcu_read_lock(), but it's not a pressing issue even considering possible
> >future features like PREEMPT_RT.
> >
> >the only danger in this area is to PREEMPT_RT: it is a bug on PREEMPT_RT
> >if kernel code has an implicit 'spinlock means preempt-off and thus
> >RCU-read-lock' assumption. Most of the time these get discovered via
> >PREEMPT_DEBUG. (preempt_disable() disables preemption on PREEMPT_RT too,
> >so that is not a problem either.)
> >
>
> OK thanks for the good explanation. So I'll keep it as is for now,
> and whatever needs cleaning up later can be worked out as it comes
> up.
Looking forward to the split of synchronize_kernel() into synchronize_rcu()
and synchronize_sched(), the two choices are:
o Use synchronize_rcu(), but insert rcu_read_lock()/rcu_read_unlock()
pairs on the read side.
o Use synchronize_sched(), and make sure all read-side code is
under preempt_disable().
Either way, there may also need to be some rcu_dereference()s when picking
up the pointers and rcu_assign_pointer()s when updating them.
For example, if traversing the domain parent list is to be RCU protected,
the for_each_domain() macro should change to something like:
#define for_each_domain(cpu, domain) \
for (domain = cpu_rq(cpu)->sd; domain; domain = rcu_dereference(domain->parent))
Thanx, Paul
Paul E. McKenney wrote:
> On Thu, Apr 07, 2005 at 05:58:40PM +1000, Nick Piggin wrote:
>
>>
>>OK thanks for the good explanation. So I'll keep it as is for now,
>>and whatever needs cleaning up later can be worked out as it comes
>>up.
>
>
> Looking forward to the split of synchronize_kernel() into synchronize_rcu()
> and synchronize_sched(), the two choices are:
>
> o Use synchronize_rcu(), but insert rcu_read_lock()/rcu_read_unlock()
> pairs on the read side.
>
> o Use synchronize_sched(), and make sure all read-side code is
> under preempt_disable().
>
Yep, I think we'll go for the second option initially (because that
pretty closely matches the homebrew locking scheme that it used to
use).
> Either way, there may also need to be some rcu_dereference()s when picking
> up the pointers and rcu_assign_pointer()s when updating them.
> For example, if traversing the domain parent list is to be RCU protected,
> the for_each_domain() macro should change to something like:
>
Yes, I think you're right, because there are no barriers or synchronisation
when attaching a new domain. Just a small point though:
> #define for_each_domain(cpu, domain) \
> for (domain = cpu_rq(cpu)->sd; domain; domain = rcu_dereference(domain->parent))
>
This should probably be done like so?
#define for_each_domain(cpu, domain) \
for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)
And I think it would be wise to use rcu_assign_pointer in the update too.
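i.e. on the attach side, something like this (sketch only, inside
cpu_attach_domain()):

	/*
	 * Publish the new domain tree only after it is fully set up, so
	 * that concurrent for_each_domain() walkers never see a
	 * half-initialised hierarchy (cpu_attach_domain() currently does a
	 * plain "rq->sd = sd" assignment).
	 */
	rcu_assign_pointer(rq->sd, sd);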
Thanks Paul.
--
SUSE Labs, Novell Inc.