Register a cpu hotplug notifier which reinitializes the scheduler
domains hierarchy. The notifier temporarily attaches all running cpus
to a "dummy" domain (like we currently do during boot) to avoid
balancing. It then calls arch_init_sched_domains which rebuilds the
"real" domains and reattaches the cpus to them.
Also change __init attributes to __devinit where necessary.
Signed-off-by: Nathan Lynch <[email protected]>
---
diff -puN kernel/sched.c~sched-domains-cpu-hotplug-notifier kernel/sched.c
--- 2.6.9-rc1-bk14/kernel/sched.c~sched-domains-cpu-hotplug-notifier 2004-09-07 13:32:50.000000000 -0500
+++ 2.6.9-rc1-bk14-nathanl/kernel/sched.c 2004-09-07 13:34:38.000000000 -0500
@@ -4191,7 +4191,10 @@ spinlock_t kernel_flag __cacheline_align
EXPORT_SYMBOL(kernel_flag);
#ifdef CONFIG_SMP
-/* Attach the domain 'sd' to 'cpu' as its base domain */
+/*
+ * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
+ * hold the cpucontrol semaphore.
+ */
static void cpu_attach_domain(struct sched_domain *sd, int cpu)
{
migration_req_t req;
@@ -4199,8 +4202,6 @@ static void cpu_attach_domain(struct sch
runqueue_t *rq = cpu_rq(cpu);
int local = 1;
- lock_cpu_hotplug();
-
spin_lock_irqsave(&rq->lock, flags);
if (cpu == smp_processor_id() || !cpu_online(cpu)) {
@@ -4219,8 +4220,6 @@ static void cpu_attach_domain(struct sch
wake_up_process(rq->migration_thread);
wait_for_completion(&req.done);
}
-
- unlock_cpu_hotplug();
}
#ifdef CONFIG_NUMA
@@ -4234,7 +4233,7 @@ static void cpu_attach_domain(struct sch
*
* Should use nodemask_t.
*/
-static int __init find_next_best_node(int node, unsigned long *used_nodes)
+static int __devinit find_next_best_node(int node, unsigned long *used_nodes)
{
int i, n, val, min_val, best_node = 0;
@@ -4270,7 +4269,7 @@ static int __init find_next_best_node(in
* should be one that prevents unnecessary balancing, but also spreads tasks
* out optimally.
*/
-cpumask_t __init sched_domain_node_span(int node, int size)
+cpumask_t __devinit sched_domain_node_span(int node, int size)
{
int i;
cpumask_t span;
@@ -4294,7 +4293,7 @@ cpumask_t __init sched_domain_node_span(
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static struct sched_group sched_group_cpus[NR_CPUS];
-__init static int cpu_to_cpu_group(int cpu)
+static int __devinit cpu_to_cpu_group(int cpu)
{
return cpu;
}
@@ -4302,7 +4301,7 @@ __init static int cpu_to_cpu_group(int c
static DEFINE_PER_CPU(struct sched_domain, phys_domains);
static struct sched_group sched_group_phys[NR_CPUS];
-__init static int cpu_to_phys_group(int cpu)
+static int __devinit cpu_to_phys_group(int cpu)
{
#ifdef CONFIG_SCHED_SMT
return first_cpu(cpu_sibling_map[cpu]);
@@ -4318,7 +4317,7 @@ __init static int cpu_to_phys_group(int
static DEFINE_PER_CPU(struct sched_domain, node_domains);
static struct sched_group sched_group_nodes[MAX_NUMNODES];
-__init static int cpu_to_node_group(int cpu)
+static int __devinit cpu_to_node_group(int cpu)
{
return cpu_to_node(cpu);
}
@@ -4328,9 +4327,9 @@ __init static int cpu_to_node_group(int
static struct sched_group sched_group_isolated[NR_CPUS];
/* cpus with isolated domains */
-cpumask_t __initdata cpu_isolated_map = CPU_MASK_NONE;
+cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
-__init static int cpu_to_isolated_group(int cpu)
+static int __devinit cpu_to_isolated_group(int cpu)
{
return cpu;
}
@@ -4360,7 +4359,7 @@ __setup ("isolcpus=", isolated_cpu_setup
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_power to 0.
*/
-__init static void init_sched_build_groups(struct sched_group groups[],
+static void __devinit init_sched_build_groups(struct sched_group groups[],
cpumask_t span, int (*group_fn)(int cpu))
{
struct sched_group *first = NULL, *last = NULL;
@@ -4394,7 +4393,11 @@ __init static void init_sched_build_grou
last->next = first;
}
-__init static void arch_init_sched_domains(cpumask_t cpu_default_map)
+/*
+ * Set up scheduler domains and groups. Callers must hold the cpucontrol
+ * semaphore.
+ */
+static void __devinit arch_init_sched_domains(cpumask_t cpu_default_map)
{
int i;
/*
@@ -4630,10 +4633,61 @@ void sched_domain_debug(void)
#define sched_domain_debug() {}
#endif
+#ifdef CONFIG_SMP
+/* Initial dummy domain for early boot and for hotplug cpu */
+static struct sched_domain sched_domain_dummy;
+static struct sched_group sched_group_dummy;
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Force a reinitialization of the sched domains hierarchy. The domains
+ * and groups cannot be updated in place without racing with the balancing
+ * code, so we temporarily attach all running cpus to a "dummy" domain
+ * which will prevent rebalancing while the sched domains are recalculated.
+ */
+static int update_sched_domains(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ cpumask_t new_cpu_map = cpu_online_map;
+ int i, cpu = (long)hcpu;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ cpu_set(cpu, new_cpu_map);
+ break;
+ case CPU_UP_CANCELED: /* fall through */
+ case CPU_DEAD:
+ /*
+ * No need to clear the dead cpu's bit, it has
+ * already been marked offline.
+ */
+ cpu_attach_domain(&sched_domain_dummy, cpu);
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+
+ for_each_online_cpu(i)
+ cpu_attach_domain(&sched_domain_dummy, i);
+
+ /* The cpucontrol semaphore is already held by cpu_up/cpu_down */
+ arch_init_sched_domains(new_cpu_map);
+
+ sched_domain_debug();
+
+ return NOTIFY_OK;
+}
+#endif
+
void __init sched_init_smp(void)
{
+ lock_cpu_hotplug();
arch_init_sched_domains(cpu_online_map);
sched_domain_debug();
+ unlock_cpu_hotplug();
+
+ hotcpu_notifier(update_sched_domains, 0);
}
#else
void __init sched_init_smp(void)
@@ -4656,20 +4710,18 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
/* Set up an initial dummy domain for early boot */
- static struct sched_domain sched_domain_init;
- static struct sched_group sched_group_init;
- memset(&sched_domain_init, 0, sizeof(struct sched_domain));
- sched_domain_init.span = CPU_MASK_ALL;
- sched_domain_init.groups = &sched_group_init;
- sched_domain_init.last_balance = jiffies;
- sched_domain_init.balance_interval = INT_MAX; /* Don't balance */
- sched_domain_init.busy_factor = 1;
-
- memset(&sched_group_init, 0, sizeof(struct sched_group));
- sched_group_init.cpumask = CPU_MASK_ALL;
- sched_group_init.next = &sched_group_init;
- sched_group_init.cpu_power = SCHED_LOAD_SCALE;
+ memset(&sched_domain_dummy, 0, sizeof(struct sched_domain));
+ sched_domain_dummy.span = CPU_MASK_ALL;
+ sched_domain_dummy.groups = &sched_group_dummy;
+ sched_domain_dummy.last_balance = jiffies;
+ sched_domain_dummy.balance_interval = INT_MAX; /* Don't balance */
+ sched_domain_dummy.busy_factor = 1;
+
+ memset(&sched_group_dummy, 0, sizeof(struct sched_group));
+ sched_group_dummy.cpumask = CPU_MASK_ALL;
+ sched_group_dummy.next = &sched_group_dummy;
+ sched_group_dummy.cpu_power = SCHED_LOAD_SCALE;
#endif
for (i = 0; i < NR_CPUS; i++) {
@@ -4682,7 +4734,7 @@ void __init sched_init(void)
rq->best_expired_prio = MAX_PRIO;
#ifdef CONFIG_SMP
- rq->sd = &sched_domain_init;
+ rq->sd = &sched_domain_dummy;
rq->cpu_load = 0;
rq->active_balance = 0;
rq->push_cpu = 0;
_
[email protected] wrote:
>Register a cpu hotplug notifier which reinitializes the scheduler
>domains hierarchy. The notifier temporarily attaches all running cpus
>to a "dummy" domain (like we currently do during boot) to avoid
>balancing. It then calls arch_init_sched_domains which rebuilds the
>"real" domains and reattaches the cpus to them.
>
>Also change __init attributes to __devinit where necessary.
>
>Signed-off-by: Nathan Lynch <[email protected]>
>
Thanks Nathan, this looks great.
I think the next step is to now make the setup code only use cpu_online_map
and get rid of everywhere I had been doing cpus_and(tmp, ...,
cpu_online_map).
This may also make your patch 1/2 unnecessary? What do you think?
Nick
On Tue, 2004-09-07 at 19:44, Nick Piggin wrote:
> I think the next step is to now make the setup code only use cpu_online_map
> and get rid of everywhere I had been doing cpus_and(tmp, ...,
> cpu_online_map).
> This may also make your patch 1/2 unnecessary? What do you think?
Well, we have to "lie" to arch_init_sched_domains a little bit when
bringing a cpu online, by setting the soon-to-be-online cpu's bit in the
argument mask. So I think the first patch is still necessary.
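That is, the notifier builds the mask it hands to arch_init_sched_domains
by hand (this is the relevant fragment of update_sched_domains from the
patch, with the other cases trimmed):

        cpumask_t new_cpu_map = cpu_online_map;
        int cpu = (long)hcpu;

        switch (action) {
        case CPU_UP_PREPARE:
                /* cpu isn't in cpu_online_map yet; pretend it is */
                cpu_set(cpu, new_cpu_map);
                break;
        }

so arch_init_sched_domains sees a map that already includes the
incoming cpu.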
Nathan
Nathan Lynch wrote:
>On Tue, 2004-09-07 at 19:44, Nick Piggin wrote:
>
>>I think the next step is to now make the setup code only use cpu_online_map
>>
^^^ OK, I see you've already done that. Sorry, I should have looked at the
patches a bit more closely. Your implementation looks very nice.
>>and get rid of everywhere I had been doing cpus_and(tmp, ...,
>>cpu_online_map).
>>
^^^ This should still be done, of course. That can come later.
>>This may also make your patch 1/2 unnecessary? What do you think?
>>
>
>Well, we have to "lie" to arch_init_sched_domains a little bit when
>bringing a cpu online, by setting the soon-to-be-online cpu's bit in the
>argument mask. So I think the first patch is still necessary.
>
>
Can't we do everything in the CPU_ONLINE case though?
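Maybe something like this (completely untested sketch)? By the time
CPU_ONLINE fires, cpu_online_map already has the new cpu's bit set, so
there would be no need to doctor the mask at all:

        static int update_sched_domains(struct notifier_block *nfb,
                                        unsigned long action, void *hcpu)
        {
                int i, cpu = (long)hcpu;

                switch (action) {
                case CPU_DEAD:
                        /* the dead cpu is already offline, so
                         * for_each_online_cpu below skips it; park it
                         * on the dummy domain by hand */
                        cpu_attach_domain(&sched_domain_dummy, cpu);
                        /* fall through */
                case CPU_ONLINE:
                        for_each_online_cpu(i)
                                cpu_attach_domain(&sched_domain_dummy, i);
                        arch_init_sched_domains(cpu_online_map);
                        sched_domain_debug();
                        return NOTIFY_OK;
                }
                return NOTIFY_DONE;
        }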
One other thing:
void __init sched_init_smp(void)
{
+ lock_cpu_hotplug();
arch_init_sched_domains(cpu_online_map);
sched_domain_debug();
+ unlock_cpu_hotplug();
+
+ hotcpu_notifier(update_sched_domains, 0);
}
Do you have a theoretical race here? Can we hotplug a CPU before the notifier
is registered? (I know we *can't* because it is still earlyish boot).
Can you move hotcpu_notifier under the cpu_hotplug lock? Seems not, because
register_cpu_notifier takes the lock itself. That seems like a flaw in the API to
me. Probably the notifier chain should be protected by a lock that nests
inside the cpucontrol lock. Rusty?
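Something like this, perhaps (untested; cpu_chain_sem would be a new
semaphore guarding only the chain, and cpu_up/cpu_down would take it
inside cpucontrol around their notifier_call_chain calls):

        static DECLARE_MUTEX(cpu_chain_sem);

        int register_cpu_notifier(struct notifier_block *nb)
        {
                int ret;

                /* chain lock only, not cpucontrol, so callers may
                 * already hold cpucontrol when registering */
                down(&cpu_chain_sem);
                ret = notifier_chain_register(&cpu_chain, nb);
                up(&cpu_chain_sem);
                return ret;
        }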
On Tue, 2004-09-07 at 21:19, Nick Piggin wrote:
> Nathan Lynch wrote:
>
> >On Tue, 2004-09-07 at 19:44, Nick Piggin wrote:
> >>and get rid of everywhere I had been doing cpus_and(tmp, ...,
> >>cpu_online_map).
> >>
>
> ^^^ This should still be done, of course. That can come later.
>
> >>This may also make your patch 1/2 unnecessary? What do you think?
> >>
> >
> >Well, we have to "lie" to arch_init_sched_domains a little bit when
> >bringing a cpu online, by setting the soon-to-be-online cpu's bit in the
> >argument mask. So I think the first patch is still necessary.
> >
> >
>
> Can't we do everything in the CPU_ONLINE case though?
I guess I overlooked that possibility. I'll give it a try. If it works
ok, the first patch can go.
Nathan
On Wed, 2004-09-08 at 12:19, Nick Piggin wrote:
> Nathan Lynch wrote:
> >Well, we have to "lie" to arch_init_sched_domains a little bit when
> >bringing a cpu online, by setting the soon-to-be-online cpu's bit in the
> >argument mask. So I think the first patch is still necessary.
I'm still a little surprised that you don't change the domains while the
machine is frozen (i.e. in take_cpu_down()). This should avoid any races,
since no one can be looking at the domains at this time.
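Roughly (hand-waving sketch; sched_detach_cpu() is a made-up sched.c
helper that would point the dying cpu's rq->sd back at the dummy domain):

        /* runs on the dying cpu with the whole machine stopped */
        static int take_cpu_down(void *unused)
        {
                int err = __cpu_disable();

                if (err < 0)
                        return err;
                /* nothing can be walking the domains now, so this
                 * needs no locking at all */
                sched_detach_cpu(smp_processor_id());
                return 0;
        }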
> Do you have a theoretical race here? Can we hotplug a CPU before the notifier
> is registered? (I know we *can't* because it is still earlyish boot).
No, init has so many serial assumptions that this is the least of our
worries.
Rusty.
--
Anyone who quotes me in their signature is an idiot -- Rusty Russell
Rusty Russell wrote:
> On Wed, 2004-09-08 at 12:19, Nick Piggin wrote:
>>Do you have a theoretical race here? Can we hotplug a CPU before the notifier
>>is registered? (I know we *can't* because it is still earlyish boot).
>
>
> No, init has so many serial assumptions that this is the least of our
> worries.
>
No. But the point is, you cannot (easily) set something up like this:
"do some setup with this specific, valid cpu_online_map";
"register notifier so we can keep everything valid";
without a race between them. Protecting the notifier chain under a
different lock is the trivial fix. I'll send the patch if you'd
like?
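i.e. with the chain under its own lock, the boot-time setup could simply
become (assuming register_cpu_notifier no longer takes cpucontrol itself):

        void __init sched_init_smp(void)
        {
                lock_cpu_hotplug();
                arch_init_sched_domains(cpu_online_map);
                sched_domain_debug();
                /* registered while still holding the hotplug lock, so
                 * there is no window for a cpu to come or go between
                 * the setup and the notifier going live */
                hotcpu_notifier(update_sched_domains, 0);
                unlock_cpu_hotplug();
        }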