2004-04-05 12:17:48

by Srivatsa Vaddagiri

Subject: [Experimental CPU Hotplug PATCH] - Move migrate_all_tasks to CPU_DEAD handling

Hi Rusty,
migrate_all_tasks is currently run with the rest of the machine stopped.
It iterates through the complete task table, turning off the cpu affinity of
any task it finds affine to the dying cpu. Depending on the task table size,
this can take considerable time. All this time the machine is stopped, doing
nothing.

I think Nick was working on reducing the time spent migrating tasks
by concentrating only on the tasks in the runqueue and catching up with
sleeping tasks as and when they wake up (in try_to_wake_up). But this can
still take considerable time, depending on the number of tasks in the dying
CPU's runqueue.

Other solutions that we thought of were ruled out either by the changes
required in the hot path (schedule, for example) or by the complexity of
implementation.

I was tempted to think of an alternate _simple_ solution that:

a. avoids doing migrate tasks with machine stopped
b. avoids any changes in core code (especially hot path)

Point a) leads to a more scalable solution, where the amount of time the
machine is stopped is _independent_ of the runqueue length or task table length.

The solution that I came up with (which can be shot down if you think it's
not correct/good :-), and which meets both the above goals, is to put the
idle task at the _front_ of the dying CPU's runqueue at the highest priority
possible. This causes the idle thread to run _immediately_ after the
kstopmachine thread yields. The idle thread notices that its cpu is offline
and dies quickly. Task migration can then be done at leisure in the CPU_DEAD
notification, when the rest of the CPUs are running.
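
In outline, the sequence the patch implements (my summary of the code below) is:

  take_cpu_down()  [stop_machine context; cpu already cleared from the
                    online map]
    -> sched_idle_next(): boost rq->idle to SCHED_FIFO, MAX_RT_PRIO-1 and
       queue it at the _front_ of the active array
  kstopmachine yields -> the idle task runs immediately, sees that its cpu
       is offline and dies
  later, at CPU_DEAD  -> migrate_all_tasks(cpu) runs, with the rest of the
       machine working normally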

Some advantages I see with this approach are:

- More scalable. Predictable amount of time that the machine is stopped.
- No changes to core code. We are just exploiting the scheduler rule of
running the next highest-priority task on the runqueue. Also,
since I put the idle task at the _front_ of the runqueue, there
are no races when an equally high priority task is woken up
and added to the runqueue. It goes in at the back of the runqueue,
_after_ the idle task!
- No changes in the hot path/core code (schedule/try_to_wake_up)
- The cpu_is_offline check that is presently required in load_balance
can be removed, thus speeding it up a bit
- Probably the cpu_is_offline check that is present in try_to_wake_up
(sync case) can also be removed. I don't think that check is required
even without this patch?

The patch below is (stress) tested on a 4-way PPC64 box against 2.6.5-ames. To
test it, you probably need to also apply the "fix __cpu_up race" patch that I
posted earlier.

Please let me know if this topic is worth pursuing!



---

linux-2.6.5-ames-vatsa/include/linux/sched.h | 5 -
linux-2.6.5-ames-vatsa/kernel/cpu.c | 28 +++++-
linux-2.6.5-ames-vatsa/kernel/sched.c | 109 ++++++++++++++++++++-------
3 files changed, 109 insertions(+), 33 deletions(-)

diff -puN include/linux/sched.h~migrate_all_tasks_in_CPU_DEAD include/linux/sched.h
--- linux-2.6.5-ames/include/linux/sched.h~migrate_all_tasks_in_CPU_DEAD 2004-04-05 16:44:50.000000000 +0530
+++ linux-2.6.5-ames-vatsa/include/linux/sched.h 2004-04-05 16:46:01.000000000 +0530
@@ -549,8 +549,9 @@ extern void node_nr_running_init(void);
#define node_nr_running_init() {}
#endif

-/* Move tasks off this (offline) CPU onto another. */
-extern void migrate_all_tasks(void);
+/* Move tasks off dead CPU onto another. */
+extern void migrate_all_tasks(int cpu);
+extern void sched_idle_next(void);
extern void set_user_nice(task_t *p, long nice);
extern int task_prio(task_t *p);
extern int task_nice(task_t *p);
diff -puN kernel/sched.c~migrate_all_tasks_in_CPU_DEAD kernel/sched.c
--- linux-2.6.5-ames/kernel/sched.c~migrate_all_tasks_in_CPU_DEAD 2004-04-05 16:44:50.000000000 +0530
+++ linux-2.6.5-ames-vatsa/kernel/sched.c 2004-04-05 16:45:12.000000000 +0530
@@ -342,6 +342,15 @@ static inline void enqueue_task(struct t
p->array = array;
}

+/* Add task at the _front_ of it's priority queue - Used by CPU offline code */
+static inline void __enqueue_task(struct task_struct *p, prio_array_t *array)
+{
+ list_add(&p->run_list, array->queue + p->prio);
+ __set_bit(p->prio, array->bitmap);
+ array->nr_active++;
+ p->array = array;
+}
+
/*
* effective_prio - return the priority that is based on the static
* priority but is modified by bonuses/penalties.
@@ -382,6 +391,15 @@ static inline void __activate_task(task_
nr_running_inc(rq);
}

+/*
+ * __activate_idle_task - move idle task to the _front_ of runqueue.
+ */
+static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
+{
+ __enqueue_task(p, rq->active);
+ nr_running_inc(rq);
+}
+
static void recalc_task_prio(task_t *p, unsigned long long now)
{
unsigned long long __sleep_time = now - p->timestamp;
@@ -666,8 +684,7 @@ repeat_lock_task:
if (unlikely(sync && !task_running(rq, p) &&
(task_cpu(p) != smp_processor_id()) &&
cpu_isset(smp_processor_id(),
- p->cpus_allowed) &&
- !cpu_is_offline(smp_processor_id()))) {
+ p->cpus_allowed))) {
set_task_cpu(p, smp_processor_id());
task_rq_unlock(rq, &flags);
goto repeat_lock_task;
@@ -1301,9 +1318,6 @@ static void load_balance(runqueue_t *thi
struct list_head *head, *curr;
task_t *tmp;

- if (cpu_is_offline(this_cpu))
- goto out;
-
busiest = find_busiest_queue(this_rq, this_cpu, idle,
&imbalance, cpumask);
if (!busiest)
@@ -2657,7 +2671,7 @@ void show_state(void)
read_unlock(&tasklist_lock);
}

-void __init init_idle(task_t *idle, int cpu)
+void __devinit init_idle(task_t *idle, int cpu)
{
runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle));
unsigned long flags;
@@ -2736,20 +2750,21 @@ out:

EXPORT_SYMBOL_GPL(set_cpus_allowed);

-/* Move (not current) task off this cpu, onto dest cpu. */
-static void move_task_away(struct task_struct *p, int dest_cpu)
+/* Move (not current) task off src cpu, onto dest cpu. */
+static void move_task_away(struct task_struct *p, int src_cpu, int dest_cpu)
{
- runqueue_t *rq_dest;
+ runqueue_t *rq_dest, *rq_src;

+ rq_src = cpu_rq(src_cpu);
rq_dest = cpu_rq(dest_cpu);

- double_rq_lock(this_rq(), rq_dest);
- if (task_cpu(p) != smp_processor_id())
+ double_rq_lock(rq_src, rq_dest);
+ if (task_cpu(p) != src_cpu)
goto out; /* Already moved */

set_task_cpu(p, dest_cpu);
if (p->array) {
- deactivate_task(p, this_rq());
+ deactivate_task(p, rq_src);
activate_task(p, rq_dest);
if (p->prio < rq_dest->curr->prio)
resched_task(rq_dest->curr);
@@ -2757,7 +2772,7 @@ static void move_task_away(struct task_s
p->timestamp = rq_dest->timestamp_last_tick;

out:
- double_rq_unlock(this_rq(), rq_dest);
+ double_rq_unlock(rq_src, rq_dest);
}

/*
@@ -2792,7 +2807,7 @@ static int migration_thread(void * data)
list_del_init(head->next);
spin_unlock(&rq->lock);

- move_task_away(req->task,
+ move_task_away(req->task, smp_processor_id(),
any_online_cpu(req->task->cpus_allowed));
local_irq_enable();
complete(&req->done);
@@ -2801,19 +2816,17 @@ static int migration_thread(void * data)
}

#ifdef CONFIG_HOTPLUG_CPU
-/* migrate_all_tasks - function to migrate all the tasks from the
- * current cpu caller must have already scheduled this to the target
- * cpu via set_cpus_allowed. Machine is stopped. */
-void migrate_all_tasks(void)
+/* migrate_all_tasks - function to migrate all the tasks from
+ * (dead) cpu.
+ */
+void migrate_all_tasks(int cpu)
{
struct task_struct *tsk, *t;
int dest_cpu, src_cpu;
unsigned int node;

- /* We're nailed to this CPU. */
- src_cpu = smp_processor_id();
+ src_cpu = cpu;

- /* Not required, but here for neatness. */
write_lock(&tasklist_lock);

/* watch out for per node tasks, let's stay on this node */
@@ -2850,11 +2863,40 @@ void migrate_all_tasks(void)
tsk->pid, tsk->comm, src_cpu);
}

- move_task_away(tsk, dest_cpu);
+ local_irq_disable();
+ move_task_away(tsk, src_cpu, dest_cpu);
+ local_irq_enable();
} while_each_thread(t, tsk);

write_unlock(&tasklist_lock);
}
+
+/* Schedules idle task to be the next runnable task on current CPU.
+ * It does so by boosting its priority to highest possible and adding it to
+ * the _front_ of runqueue. Used by CPU offline code.
+ */
+
+void sched_idle_next(void)
+{
+ int cpu = smp_processor_id();
+ runqueue_t *rq = this_rq();
+ struct task_struct *p = rq->idle;
+ unsigned long flags;
+
+ /* cpu has to be offline */
+ BUG_ON(cpu_online(cpu));
+
+ /* Strictly not necessary since rest of the CPUs are stopped by now
+ * and interrupts disabled on current cpu.
+ */
+ spin_lock_irqsave(&rq->lock, flags);
+
+ __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
+ /* Add idle task to _front_ of it's priority queue */
+ __activate_idle_task(p, rq);
+
+ spin_unlock_irqrestore(&rq->lock, flags);
+}
#endif /* CONFIG_HOTPLUG_CPU */

/*
@@ -2889,11 +2931,28 @@ static int migration_call(struct notifie
case CPU_UP_CANCELED:
/* Unbind it from offline cpu so it can run. Fall thru. */
kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
- case CPU_DEAD:
kthread_stop(cpu_rq(cpu)->migration_thread);
cpu_rq(cpu)->migration_thread = NULL;
- BUG_ON(cpu_rq(cpu)->nr_running != 0);
- break;
+ break;
+ case CPU_DEAD:
+ migrate_all_tasks(cpu);
+ rq = cpu_rq(cpu);
+ kthread_stop(rq->migration_thread);
+ rq->migration_thread = NULL;
+ /* Take idle task off runqueue and restore it's
+ * policy/priority
+ */
+ rq = task_rq_lock(rq->idle, &flags);
+
+ /* Call init_idle instead ?? init_idle doesn't restore the
+ * policy though for us ..
+ */
+ deactivate_task(rq->idle, rq);
+ __setscheduler(rq->idle, SCHED_NORMAL, MAX_PRIO);
+
+ task_rq_unlock(rq, &flags);
+ BUG_ON(rq->nr_running != 0);
+ break;
#endif
}
return NOTIFY_OK;
diff -puN kernel/cpu.c~migrate_all_tasks_in_CPU_DEAD kernel/cpu.c
--- linux-2.6.5-ames/kernel/cpu.c~migrate_all_tasks_in_CPU_DEAD 2004-04-05 16:44:50.000000000 +0530
+++ linux-2.6.5-ames-vatsa/kernel/cpu.c 2004-04-05 16:45:12.000000000 +0530
@@ -43,13 +43,13 @@ void unregister_cpu_notifier(struct noti
EXPORT_SYMBOL(unregister_cpu_notifier);

#ifdef CONFIG_HOTPLUG_CPU
-static inline void check_for_tasks(int cpu, struct task_struct *k)
+static inline void check_for_tasks(int cpu)
{
struct task_struct *p;

write_lock_irq(&tasklist_lock);
for_each_process(p) {
- if (task_cpu(p) == cpu && p != k)
+ if (task_cpu(p) == cpu)
printk(KERN_WARNING "Task %s is on cpu %d\n",
p->comm, cpu);
}
@@ -96,8 +96,14 @@ static int take_cpu_down(void *unused)
if (err < 0)
cpu_set(smp_processor_id(), cpu_online_map);
else
- /* Everyone else gets kicked off. */
- migrate_all_tasks();
+ /* Force scheduler to switch to idle task when we yield.
+ * We expect idle task to _immediately_ notice that it's cpu
+ * is offline and die quickly.
+ *
+ * This allows us to defer calling mirate_all_tasks until
+ * CPU_DEAD notification time.
+ */
+ sched_idle_next();

return err;
}
@@ -106,6 +112,7 @@ int cpu_down(unsigned int cpu)
{
int err;
struct task_struct *p;
+ cpumask_t old_allowed, tmp;

if ((err = lock_cpu_hotplug_interruptible()) != 0)
return err;
@@ -120,16 +127,21 @@ int cpu_down(unsigned int cpu)
goto out;
}

+ /* Ensure that we are not runnable on dying cpu */
+ old_allowed = current->cpus_allowed;
+ tmp = CPU_MASK_ALL;
+ cpu_clear(cpu, tmp);
+ set_cpus_allowed(current, tmp);
+
p = __stop_machine_run(take_cpu_down, NULL, cpu);
if (IS_ERR(p)) {
err = PTR_ERR(p);
- goto out;
+ goto out_allowed;
}

if (cpu_online(cpu))
goto out_thread;

- check_for_tasks(cpu, p);

/* Wait for it to sleep (leaving idle task). */
while (!idle_cpu(cpu))
@@ -146,10 +158,14 @@ int cpu_down(unsigned int cpu)
== NOTIFY_BAD)
BUG();

+ check_for_tasks(cpu);
+
cpu_run_sbin_hotplug(cpu, "offline");

out_thread:
err = kthread_stop(p);
+out_allowed:
+ set_cpus_allowed(current, old_allowed);
out:
unlock_cpu_hotplug();
return err;

_







--


Thanks and Regards,
Srivatsa Vaddagiri,
Linux Technology Center,
IBM Software Labs,
Bangalore, INDIA - 560017


2004-04-06 00:28:58

by Nick Piggin

Subject: Re: [Experimental CPU Hotplug PATCH] - Move migrate_all_tasks to CPU_DEAD handling

Srivatsa Vaddagiri wrote:
> Hi Rusty,
> migrate_all_tasks is currently run with rest of the machine stopped.
> It iterates thr' the complete task table, turning off cpu affinity of any task
> that it finds affine to the dying cpu. Depending on the task table
> size this can take considerable time. All this time machine is stopped, doing
> nothing.
>
> I think Nick was working on reducing this time spent in migrating tasks
> by concentrating only on the tasks in the runqueue and catch up with sleeping
> tasks as and when they wake up (in try_to_wake_up). But this still can be
> considerable time spent depending on the number of tasks in the dying CPU's
> runqueue.

Hi Srivatsa,
First of all, if you're proposing this stuff for inclusion, you
should port it to the -mm tree, because I don't think Andrew
will want any other scheduler work going in just now. It wouldn't
be too hard.

I think my stuff is a bit orthogonal to what you're attempting.
And they should probably work well together. My "lazy migrate"
patch means the tasklist lock does not need to be held at all,
only the dying runqueue's lock.

2004-04-06 01:15:07

by Srivatsa Vaddagiri

Subject: Re: [Experimental CPU Hotplug PATCH] - Move migrate_all_tasks to CPU_DEAD handling

On Tue, Apr 06, 2004 at 10:28:53AM +1000, Nick Piggin wrote:
> First of all, if you're proposing this stuff for inclusion, you
> should port it to the -mm tree, because I don't think Andrew
> will want any other scheduler work going in just now. It wouldn't
> be too hard.

Will send out today a patch against latest -mm tree!

> I think my stuff is a bit orthogonal to what you're attempting.
> And they should probably work well together. My "lazy migrate"
> patch means the tasklist lock does not need to be held at all,
> only the dying runqueue's lock.

Is there some place where I can download your patch (or is it in -mm tree)?


--


Thanks and Regards,
Srivatsa Vaddagiri,
Linux Technology Center,
IBM Software Labs,
Bangalore, INDIA - 560017

2004-04-06 01:27:47

by Nick Piggin

Subject: Re: [Experimental CPU Hotplug PATCH] - Move migrate_all_tasks to CPU_DEAD handling

Srivatsa Vaddagiri wrote:
> On Tue, Apr 06, 2004 at 10:28:53AM +1000, Nick Piggin wrote:
>
>>First of all, if you're proposing this stuff for inclusion, you
>>should port it to the -mm tree, because I don't think Andrew
>>will want any other scheduler work going in just now. It wouldn't
>>be too hard.
>
>
> Will send out today a patch against latest -mm tree!
>
>
>>I think my stuff is a bit orthogonal to what you're attempting.
>>And they should probably work well together. My "lazy migrate"
>>patch means the tasklist lock does not need to be held at all,
>>only the dying runqueue's lock.
>
>
> Is there some place where I can download your patch (or is it in -mm tree)?
>
>

I have attached it (against 2.6.5-mm1). I haven't actually tested it
yet because I haven't got around to finding and using the i386 test
code yet.

It also contains a couple of cleanups: rename double_lock_balance to
second_rq_lock, make migrate_all_tasks static, and have the hotplug code
call sched_cpu_stop.

Comments would be welcome.

Nick


Attachments:
hotplugcpu-lazy-migrate.patch (9.53 kB)

2004-04-06 01:31:11

by Nick Piggin

Subject: Re: [Experimental CPU Hotplug PATCH] - Move migrate_all_tasks to CPU_DEAD handling

Nick Piggin wrote:
> Srivatsa Vaddagiri wrote:
>
>> On Tue, Apr 06, 2004 at 10:28:53AM +1000, Nick Piggin wrote:
>>
>>> First of all, if you're proposing this stuff for inclusion, you
>>> should port it to the -mm tree, because I don't think Andrew
>>> will want any other scheduler work going in just now. It wouldn't
>>> be too hard.
>>
>>
>>
>> Will send out today a patch against latest -mm tree!
>>
>>
>>> I think my stuff is a bit orthogonal to what you're attempting.
>>> And they should probably work well together. My "lazy migrate"
>>> patch means the tasklist lock does not need to be held at all,
>>> only the dying runqueue's lock.
>>
>>
>>
>> Is there some place where I can download your patch (or is it in -mm
>> tree)?
>>
>>
>
> I have attached it (against 2.6.5-mm1). I haven't actually tested it
> #ifdef CONFIG_SMP
> - if (unlikely(task_running(rq, p) || cpu_is_offline(this_cpu)))
> + if (unlikely(task_running(rq, p))
> goto out_activate;
>
> new_cpu = cpu;
>
> +#ifdef CONFIG_HOTPLUG_CPU
> + if (unlikely(cpu_is_offline(cpu))) {
> + /* Must lazy-migrate off this CPU */
> + goto out_set_cpu;
> + }
> +#endif
> +

Err, that should be:
#ifdef CONFIG_HOTPLUG_CPU
	if (unlikely(cpu_is_offline(cpu))) {
		/* Must lazy-migrate off this CPU */
		new_cpu = go_away(p);
		goto out_set_cpu;
	}
#endif

2004-04-06 07:25:33

by Ingo Molnar

Subject: Re: [Experimental CPU Hotplug PATCH] - Move migrate_all_tasks to CPU_DEAD handling


* Srivatsa Vaddagiri <[email protected]> wrote:

> Hi Rusty,
> migrate_all_tasks is currently run with rest of the machine
> stopped. It iterates thr' the complete task table, turning off cpu
> affinity of any task that it finds affine to the dying cpu. Depending
> on the task table size this can take considerable time. All this time
> machine is stopped, doing nothing.

this is being done to keep things simple and fast - to avoid the
special-cases (hotplug hooks) in various bits of scheduler code.

> The solution that I came up with (which can be shot down if you think
> its not correct/good :-) which meets both the above goals was to have
> idle task put to the _front_ of the dying CPU's runqueue at the
> highest priority possible. This cause idle thread to run _immediately_
> after kstopmachine thread yields. Idle thread notices that its cpu is
> offline and dies quickly. Task migration can then be done at leisure
> in CPU_DEAD notification, when rest of the CPUs are running.

nice. The best thing is that your patch also removes a special-case from
the hot path.

> +/* Add task at the _front_ of it's priority queue - Used by CPU offline code */
> +static inline void __enqueue_task(struct task_struct *p, prio_array_t *array)

btw., there's now an enqueue_task_head() function in the -mm scheduler
[to do cache-cold migration], so if you build on top of -mm you'll have one
less infrastructure bit to worry about.

the question is, how much actual latency does the current 'freeze
everything' solution cause? We should prefer simplicity and
debuggability over cleverness of implementation - it's not like we'll
have hotplug systems on everyone's desk in the next year or so.

also, even assuming a hotplug CPU system, CPU replacement events are not
that common, so the performance of the CPU-down op should not be a big
issue. The function depends on the # of tasks only linearly, and we have
tons of other code that is linear on the # of tasks - in fact we just
finished removing all the quadratic functions.

Ingo

2004-04-06 08:36:30

by Srivatsa Vaddagiri

Subject: Re: [Experimental CPU Hotplug PATCH] - Move migrate_all_tasks to CPU_DEAD handling

On Tue, Apr 06, 2004 at 10:28:53AM +1000, Nick Piggin wrote:
> I think my stuff is a bit orthogonal to what you're attempting.
> And they should probably work well together. My "lazy migrate"
> patch means the tasklist lock does not need to be held at all,
> only the dying runqueue's lock.

Hi Nick,
I went through your patch and have some comments:

1. The benefit I see in your patch (over the solution present today)
is that you immediately migrate only the tasks in the runqueue and don't
bother about sleeping tasks; you catch up with them as and when they wake up.

However, by doing so are we not adding overhead to the wake-up
path? CPU offline should be a (very) rare event, yet to support it we
have to check a cpu's offline status on _every_ wakeup call.

IMHO it is best if we migrate _all_ tasks in one shot during the
rare offline event and thus avoid the need for a cpu_is_offline check
in the (much) hotter wake_up path.

2. Also note that migrate_all_tasks is currently run with the
rest of the machine frozen, so holding or not holding the tasklist
lock during that period does not make a difference!

My patch avoids having to migrate even the tasks present in the runqueue
_immediately_, so the amount of time the machine is frozen is greatly
reduced.


--


Thanks and Regards,
Srivatsa Vaddagiri,
Linux Technology Center,
IBM Software Labs,
Bangalore, INDIA - 560017

2004-04-06 09:26:13

by Nick Piggin

Subject: Re: [Experimental CPU Hotplug PATCH] - Move migrate_all_tasks to CPU_DEAD handling

Srivatsa Vaddagiri wrote:
> On Tue, Apr 06, 2004 at 10:28:53AM +1000, Nick Piggin wrote:
>
>>I think my stuff is a bit orthogonal to what you're attempting.
>>And they should probably work well together. My "lazy migrate"
>>patch means the tasklist lock does not need to be held at all,
>>only the dying runqueue's lock.
>
>
> Hi Nick,
> I went thr' your patch and have some comments:
>

Hi, thanks for the comments.

> 1. The benefit I see in your patch (over the solution present today)
> is you migrate immediately only tasks in the runqueue and don't bother abt
> sleeping tasks. You catch up with them as and when they wake up.
>

That is correct, yes.

> However by doing so, are we not adding an overhead in the wake-up
> path? CPU offline should be a (very) rare event and to support that we
> have to check a cpu's offline status _every_ wakeup call.
>

Note there was already that check in the wakeup path, although
I suspect it was someone being overly cautious and isn't required.

Also in my patch, the offline check should probably be done below
the check for if (cpu == this_cpu... because that should be a common
route.

> IMHO it is best if we migrate _all_ tasks in one shot during the
> rare offline event and thus avoid the necessity of cpu_is_offline check
> during the (more) hotter wake_up path.
>

OK, whatever you like. At least you have my alternative if
you ever run into a problem here.

> 2. Also note that, migrate_all_tasks is being currently run with
> rest of the machine frozen. So holding/not-holding tasklist
> lock during that period does not make a difference!
>

Yeah, so Rusty pointed out. It still potentially moves a lot fewer
tasks, but I guess it was really waiting for a patch like
yours ;)

> My patch avoids having to migrate _immediately_ even the tasks present
> in the runqueue. So the amout of time machine is frozen is greatly
> reduced.
>

I really don't have much interest in the code so it is up to you.
If you ever have some realtime system that does cpu hotplugging
then give me a call!

Nick

2004-04-06 14:53:34

by Srivatsa Vaddagiri

Subject: Re: [Experimental CPU Hotplug PATCH] - Move migrate_all_tasks to CPU_DEAD handling

On Tue, Apr 06, 2004 at 09:25:43AM +0200, Ingo Molnar wrote:
> the question is, how much actual latency does the current 'freeze
> everything' solution cause? We should prefer simplicity and debuggability
> over cleverness of implementation - it's not like we'll have hotplug systems
> on everyone's desk in the next year or so.
>
> also, even assuming a hotplug CPU system, CPU replacement events are not
> that common, so the performance of the CPU-down op should not be a big
> issue. The function depends on the # of tasks only linearly, and we have
> tons of other code that is linear on the # of tasks - in fact we just
> finished removing all the quadratic functions.

Ingo,
I obtained some latency measurements of migrate_all_tasks() on
a 4-way 1.2 GHz Power4 PPC64 (p630) box. They are as below:

Number of tasks      Cycles (get_cycles) spent in migrate_all_tasks (ms)
=========================================================================

      10536                 803244   (5.3 ms)
      30072                2587940   (17 ms)


Extrapolating this to 100000 tasks puts the stoppage time at roughly
8 million cycles (~50 ms).
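
(For reference: the two data points above work out to roughly
(2587940 - 803244) / (30072 - 10536) ~= 90 cycles per task, and the
cycles/ms pairs imply a tick rate of about 150 MHz (the PPC64 timebase,
not the 1.2 GHz CPU clock). Assuming the per-task cost stays linear,
100000 tasks come to roughly 9 million cycles, i.e. 50-60 ms, which is
where the figure above comes from.)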

My main concern about stopping the machine for so much time
was not performance, but rather the effect it may have on the functioning
of the system. The fact that we freeze the machine for (possibly) tons of
cycles doing nothing but migration made me uncomfortable, _and_ the fact
that it can very well be avoided :)

Can we rule out any side effects because of this stoppage? Watchdog timers,
cluster heartbeats, jiffies, ..? Not sure ..

It just felt much "safer" and more efficient to delegate migration to the
safer time of the CPU_DEAD notification, when the rest of the machine is
running. Plus this avoids the cpu_is_offline check in the hotter paths
(load_balance/try_to_wake_up)!!


--


Thanks and Regards,
Srivatsa Vaddagiri,
Linux Technology Center,
IBM Software Labs,
Bangalore, INDIA - 560017

2004-04-06 14:55:34

by Srivatsa Vaddagiri

Subject: Re: [Experimental CPU Hotplug PATCH] - Move migrate_all_tasks to CPU_DEAD handling

On Tue, Apr 06, 2004 at 07:26:06PM +1000, Nick Piggin wrote:
> Also in my patch, the offline check should probably be done below
> the check for if (cpu == this_cpu... because that should be a common
> route.

Will this be true for wakeups that are triggered from
expiring timers as well? The timers on the dead CPU are migrated to other CPUs.
When they fire, the timer fn runs on a different CPU and can try to wake up a
task and add it to the dead cpu! So we probably need an unconditional
cpu_is_offline check in try_to_wake_up?



--


Thanks and Regards,
Srivatsa Vaddagiri,
Linux Technology Center,
IBM Software Labs,
Bangalore, INDIA - 560017

2004-04-06 15:02:13

by Srivatsa Vaddagiri

Subject: Re: [Experimental CPU Hotplug PATCH] - Move migrate_all_tasks to CPU_DEAD handling

On Tue, Apr 06, 2004 at 09:25:43AM +0200, Ingo Molnar wrote:
> We should prefer simplicity and debuggability over cleverness of
> implementation - it's not like we'll
> have hotplug systems on everyone's desk in the next year or so.

I felt that adding the idle task to the front of the runqueue, and thereby
avoiding most of the stop-machine overhead, is simple! Are there any
complications you see arising out of this?

--


Thanks and Regards,
Srivatsa Vaddagiri,
Linux Technology Center,
IBM Software Labs,
Bangalore, INDIA - 560017

2004-04-06 15:04:39

by Nick Piggin

Subject: Re: [Experimental CPU Hotplug PATCH] - Move migrate_all_tasks to CPU_DEAD handling

Srivatsa Vaddagiri wrote:
> On Tue, Apr 06, 2004 at 07:26:06PM +1000, Nick Piggin wrote:
>
>>Also in my patch, the offline check should probably be done below
>>the check for if (cpu == this_cpu... because that should be a common
>>route.
>
>
> Will this be true for wakeups which are triggered from
> expiring timers also? The timers on the dead CPU are migrated to other CPUs.
> When they fire, the timer fn runs on a different CPU and can try to wake up a
> task 'n add it to dead cpu! So we probably need a unconditional cpu_is_offline
> check in try_to_wake_up?
>

AFAIKS, no.

If this happens before migrate_all_tasks, there shouldn't be a
problem because migrate_all_tasks will move the woken task anyway.

It can't happen after migrate_all_tasks, because there is nothing
on the offline CPU to be woken up.

If you do need the check there, then my lazy migrate method is
unquestionably better, because this is the only thing it would
otherwise have to add to a fastpath. Right?

2004-04-06 15:20:02

by Srivatsa Vaddagiri

Subject: Re: [Experimental CPU Hotplug PATCH] - Move migrate_all_tasks to CPU_DEAD handling

On Wed, Apr 07, 2004 at 01:04:10AM +1000, Nick Piggin wrote:
> AFAIKS, no.
>
> If this happens before migrate_all_tasks, there shouldn't be a
> problem because migrate_all_tasks will move the woken task anyway.
>
> It can't happen after migrate_all_tasks, because there is nothing
> on the offline CPU to be woken up.

Hmm ..I was thinking of this scenario. Let's say task A uses
schedule_timeout on CPU3:


schedule_timeout(10ms);

A timer is added on CPU3, meant to fire after (at most) 10 ms.
The task is then put to sleep.

During this sleep, CPU3 can go down. migrate_all_tasks,
not finding A in the runqueue, won't bother about it.

As part of CPU_DEAD processing, migrate_timers will move the timer
that was added on CPU3 to CPU2 (say).

After 10 ms, when the timer fires on CPU2, it will do a wakeup on
task A. At that point, won't task A still be affine to CPU3? Won't
try_to_wake_up attempt to add it to CPU3? At that point 'this_cpu'
is 2 and 'cpu' is 3 (in try_to_wake_up)?
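
To picture it as a timeline (the same scenario laid out side by side):

  CPU3 (being offlined)                    CPU2
  ---------------------                    ----
  task A: schedule_timeout(10ms)
    -> timer queued on CPU3, A sleeps
  CPU3 goes down; A is not on the
  runqueue, so migrate_all_tasks()
  never touches it (task_cpu(A) == 3)
                                           CPU_DEAD: migrate_timers()
                                           moves A's timer to CPU2
                                           ~10ms later the timer fires:
                                           try_to_wake_up(A) with
                                           'this_cpu' == 2 and
                                           'cpu' == task_cpu(A) == 3 (dead)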

> If you do need the check there, then my lazy migrate method is
> unquestionably better, because this is the only thing it would
> otherwise have to add to a fastpath. Right?

I don't think we strictly need the cpu_is_offline check in try_to_wake_up
if we were to migrate _all_ (running and sleeping) tasks in one shot
(with the tasklist lock held) when a CPU goes down :-)

Sorry, I did not mean to compare our patches like this; I'm just trying to
work out which will be the right thing to do!!

--


Thanks and Regards,
Srivatsa Vaddagiri,
Linux Technology Center,
IBM Software Labs,
Bangalore, INDIA - 560017

2004-04-06 16:43:47

by Srivatsa Vaddagiri

Subject: Re: [lhcs-devel] Re: [Experimental CPU Hotplug PATCH] - Move migrate_all_tasks to CPU_DEAD handling

On Tue, Apr 06, 2004 at 06:45:08AM +0530, Srivatsa Vaddagiri wrote:
> Will send out today a patch against latest -mm tree!

And here's the patch against 2.6.5-mm1 (I did some minimal testing on a
4-way Pentium box - I will run stress tests tomorrow and update the
patch if necessary!)



---

linux-2.6.5-mm1-vatsa/include/linux/sched.h | 3
linux-2.6.5-mm1-vatsa/kernel/cpu.c | 29 ++++++--
linux-2.6.5-mm1-vatsa/kernel/sched.c | 92 +++++++++++++++++++++-------
3 files changed, 92 insertions(+), 32 deletions(-)

diff -puN include/linux/sched.h~migrate_all_tasks_in_CPU_DEAD include/linux/sched.h
--- linux-2.6.5-mm1/include/linux/sched.h~migrate_all_tasks_in_CPU_DEAD 2004-04-06 22:04:27.000000000 +0530
+++ linux-2.6.5-mm1-vatsa/include/linux/sched.h 2004-04-06 22:04:44.000000000 +0530
@@ -664,8 +664,7 @@ extern void sched_balance_exec(void);
#define sched_balance_exec() {}
#endif

-/* Move tasks off this (offline) CPU onto another. */
-extern void migrate_all_tasks(void);
+extern void sched_idle_next(void);
extern void set_user_nice(task_t *p, long nice);
extern int task_prio(task_t *p);
extern int task_nice(task_t *p);
diff -puN kernel/sched.c~migrate_all_tasks_in_CPU_DEAD kernel/sched.c
--- linux-2.6.5-mm1/kernel/sched.c~migrate_all_tasks_in_CPU_DEAD 2004-04-06 22:04:27.000000000 +0530
+++ linux-2.6.5-mm1-vatsa/kernel/sched.c 2004-04-06 22:05:16.000000000 +0530
@@ -385,6 +385,15 @@ static inline void __activate_task(task_
rq->nr_running++;
}

+/*
+ * __activate_idle_task - move idle task to the _front_ of runqueue.
+ */
+static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
+{
+ enqueue_task_head(p, rq->active);
+ rq->nr_running++;
+}
+
static void recalc_task_prio(task_t *p, unsigned long long now)
{
unsigned long long __sleep_time = now - p->timestamp;
@@ -748,7 +757,7 @@ static int try_to_wake_up(task_t * p, un
this_cpu = smp_processor_id();

#ifdef CONFIG_SMP
- if (unlikely(task_running(rq, p) || cpu_is_offline(this_cpu)))
+ if (unlikely(task_running(rq, p)))
goto out_activate;

new_cpu = cpu;
@@ -1681,9 +1690,6 @@ static inline void idle_balance(int this
{
struct sched_domain *sd;

- if (unlikely(cpu_is_offline(this_cpu)))
- return;
-
for_each_domain(this_cpu, sd) {
if (sd->flags & SD_BALANCE_NEWIDLE) {
if (load_balance_newidle(this_cpu, this_rq, sd)) {
@@ -1771,9 +1777,6 @@ static void rebalance_tick(int this_cpu,
unsigned long j = jiffies + CPU_OFFSET(this_cpu);
struct sched_domain *sd;

- if (unlikely(cpu_is_offline(this_cpu)))
- return;
-
/* Update our load */
old_load = this_rq->cpu_load;
this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
@@ -3222,15 +3225,16 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
* So we race with normal scheduler movements, but that's OK, as long
* as the task is no longer on this CPU.
*/
-static void __migrate_task(struct task_struct *p, int dest_cpu)
+static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
- runqueue_t *rq_dest;
+ runqueue_t *rq_dest, *rq_src;

+ rq_src = cpu_rq(src_cpu);
rq_dest = cpu_rq(dest_cpu);

- double_rq_lock(this_rq(), rq_dest);
+ double_rq_lock(rq_src, rq_dest);
/* Already moved. */
- if (task_cpu(p) != smp_processor_id())
+ if (task_cpu(p) != src_cpu)
goto out;
/* Affinity changed (again). */
if (!cpu_isset(dest_cpu, p->cpus_allowed))
@@ -3238,7 +3242,7 @@ static void __migrate_task(struct task_s

set_task_cpu(p, dest_cpu);
if (p->array) {
- deactivate_task(p, this_rq());
+ deactivate_task(p, rq_src);
activate_task(p, rq_dest);
if (TASK_PREEMPTS_CURR(p, rq_dest))
resched_task(rq_dest->curr);
@@ -3246,7 +3250,7 @@ static void __migrate_task(struct task_s
p->timestamp = rq_dest->timestamp_last_tick;

out:
- double_rq_unlock(this_rq(), rq_dest);
+ double_rq_unlock(rq_src, rq_dest);
}

/*
@@ -3289,7 +3293,7 @@ static int migration_thread(void * data)
spin_unlock(&rq->lock);

if (req->type == REQ_MOVE_TASK) {
- __migrate_task(req->task, req->dest_cpu);
+ __migrate_task(req->task, smp_processor_id(), req->dest_cpu);
} else if (req->type == REQ_SET_DOMAIN) {
rq->sd = req->sd;
} else {
@@ -3304,19 +3308,16 @@ static int migration_thread(void * data)
}

#ifdef CONFIG_HOTPLUG_CPU
-/* migrate_all_tasks - function to migrate all the tasks from the
- * current cpu caller must have already scheduled this to the target
- * cpu via set_cpus_allowed. Machine is stopped. */
-void migrate_all_tasks(void)
+/* migrate_all_tasks - function to migrate all the tasks from the dead cpu. */
+static void migrate_all_tasks(int cpu)
{
struct task_struct *tsk, *t;
int dest_cpu, src_cpu;
unsigned int node;

/* We're nailed to this CPU. */
- src_cpu = smp_processor_id();
+ src_cpu = cpu;

- /* Not required, but here for neatness. */
write_lock(&tasklist_lock);

/* watch out for per node tasks, let's stay on this node */
@@ -3353,11 +3354,39 @@ void migrate_all_tasks(void)
tsk->pid, tsk->comm, src_cpu);
}

- __migrate_task(tsk, dest_cpu);
+ local_irq_disable();
+ __migrate_task(tsk, src_cpu, dest_cpu);
+ local_irq_enable();
} while_each_thread(t, tsk);

write_unlock(&tasklist_lock);
}
+
+/* Schedules idle task to be the next runnable task on current CPU.
+ * It does so by boosting its priority to highest possible and adding it to
+ * the _front_ of runqueue. Used by CPU offline code.
+ */
+void sched_idle_next(void)
+{
+ int cpu = smp_processor_id();
+ runqueue_t *rq = this_rq();
+ struct task_struct *p = rq->idle;
+ unsigned long flags;
+
+ /* cpu has to be offline */
+ BUG_ON(cpu_online(cpu));
+
+ /* Strictly not necessary since rest of the CPUs are stopped by now
+ * and interrupts disabled on current cpu.
+ */
+ spin_lock_irqsave(&rq->lock, flags);
+
+ __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
+ /* Add idle task to _front_ of it's priority queue */
+ __activate_idle_task(p, rq);
+
+ spin_unlock_irqrestore(&rq->lock, flags);
+}
#endif /* CONFIG_HOTPLUG_CPU */

/*
@@ -3392,10 +3421,27 @@ static int migration_call(struct notifie
case CPU_UP_CANCELED:
/* Unbind it from offline cpu so it can run. Fall thru. */
kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
- case CPU_DEAD:
kthread_stop(cpu_rq(cpu)->migration_thread);
cpu_rq(cpu)->migration_thread = NULL;
- BUG_ON(cpu_rq(cpu)->nr_running != 0);
+ break;
+ case CPU_DEAD:
+ migrate_all_tasks(cpu);
+ rq = cpu_rq(cpu);
+ kthread_stop(rq->migration_thread);
+ rq->migration_thread = NULL;
+ /* Take idle task off runqueue and restore it's
+ * policy/priority
+ */
+ rq = task_rq_lock(rq->idle, &flags);
+
+ /* Call init_idle instead ?? init_idle doesn't restore the
+ * policy though for us ..
+ */
+ deactivate_task(rq->idle, rq);
+ __setscheduler(rq->idle, SCHED_NORMAL, MAX_PRIO);
+
+ task_rq_unlock(rq, &flags);
+ BUG_ON(rq->nr_running != 0);
break;
#endif
}
diff -puN kernel/cpu.c~migrate_all_tasks_in_CPU_DEAD kernel/cpu.c
--- linux-2.6.5-mm1/kernel/cpu.c~migrate_all_tasks_in_CPU_DEAD 2004-04-06 22:04:27.000000000 +0530
+++ linux-2.6.5-mm1-vatsa/kernel/cpu.c 2004-04-06 22:05:04.000000000 +0530
@@ -43,13 +43,13 @@ void unregister_cpu_notifier(struct noti
EXPORT_SYMBOL(unregister_cpu_notifier);

#ifdef CONFIG_HOTPLUG_CPU
-static inline void check_for_tasks(int cpu, struct task_struct *k)
+static inline void check_for_tasks(int cpu)
{
struct task_struct *p;

write_lock_irq(&tasklist_lock);
for_each_process(p) {
- if (task_cpu(p) == cpu && p != k)
+ if (task_cpu(p) == cpu)
printk(KERN_WARNING "Task %s is on cpu %d\n",
p->comm, cpu);
}
@@ -96,8 +96,14 @@ static int take_cpu_down(void *unused)
if (err < 0)
cpu_set(smp_processor_id(), cpu_online_map);
else
- /* Everyone else gets kicked off. */
- migrate_all_tasks();
+ /* Force scheduler to switch to idle task when we yield.
+ * We expect idle task to _immediately_ notice that it's cpu
+ * is offline and die quickly.
+ *
+ * This allows us to defer calling mirate_all_tasks until
+ * CPU_DEAD notification time.
+ */
+ sched_idle_next();

return err;
}
@@ -106,6 +112,7 @@ int cpu_down(unsigned int cpu)
{
int err;
struct task_struct *p;
+ cpumask_t old_allowed, tmp;

if ((err = lock_cpu_hotplug_interruptible()) != 0)
return err;
@@ -120,17 +127,21 @@ int cpu_down(unsigned int cpu)
goto out;
}

+ /* Ensure that we are not runnable on dying cpu */
+ old_allowed = current->cpus_allowed;
+ tmp = CPU_MASK_ALL;
+ cpu_clear(cpu, tmp);
+ set_cpus_allowed(current, tmp);
+
p = __stop_machine_run(take_cpu_down, NULL, cpu);
if (IS_ERR(p)) {
err = PTR_ERR(p);
- goto out;
+ goto out_allowed;
}

if (cpu_online(cpu))
goto out_thread;

- check_for_tasks(cpu, p);
-
/* Wait for it to sleep (leaving idle task). */
while (!idle_cpu(cpu))
yield();
@@ -146,10 +157,14 @@ int cpu_down(unsigned int cpu)
== NOTIFY_BAD)
BUG();

+ check_for_tasks(cpu);
+
cpu_run_sbin_hotplug(cpu, "offline");

out_thread:
err = kthread_stop(p);
+out_allowed:
+ set_cpus_allowed(current, old_allowed);
out:
unlock_cpu_hotplug();
return err;

_



--


Thanks and Regards,
Srivatsa Vaddagiri,
Linux Technology Center,
IBM Software Labs,
Bangalore, INDIA - 560017

2004-04-07 03:55:22

by Rusty Russell

Subject: Re: [Experimental CPU Hotplug PATCH] - Move migrate_all_tasks to CPU_DEAD handling

On Tue, 2004-04-06 at 19:26, Nick Piggin wrote:
> Note there was already that check in the wakeup path, although
> I suspect it was someone being overly cautious and isn't required.

No, it really is required.

The stop_machine thread runs on the cpu, then kicks everyone else off,
then does a complete() (in stop_machine.c:do_stop()). Without this
check, the complete() drags the sleeping process (which called
stop_machine) onto the dying CPU.
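
Laid out as a timeline (my paraphrase of the above, not verbatim code):

  task calling stop_machine()            dying CPU (stopmachine thread)
  ---------------------------            ------------------------------
  __stop_machine_run()
    sleeps waiting for completion
                                         kicks every other task off the cpu
                                         do_stop(): complete()
                                           -> try_to_wake_up(caller) runs
                                              on the dying cpu; without the
                                              offline check it can pick this
                                              (dying) cpu to run the caller on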

Hmm, maybe we could explicitly exclude the downed cpu from the calling task's
mask. Kind of hacky: theoretically you should never modify a task's
affinity unless the user actually asked for it (since anyone can ask for
another task's affinity).

Cheers!
Rusty.
--
Anyone who quotes me in their signature is an idiot -- Rusty Russell

2004-04-07 04:11:11

by Nick Piggin

Subject: Re: [Experimental CPU Hotplug PATCH] - Move migrate_all_tasks to CPU_DEAD handling

Rusty Russell wrote:
> On Tue, 2004-04-06 at 19:26, Nick Piggin wrote:
>
>>Note there was already that check in the wakeup path, although
>>I suspect it was someone being overly cautious and isn't required.
>
>
> No, it really is required.
>
> The stop_machine thread runs on the cpu, then kicks everyone else off,
> then does a complete() (in stop_machine.c:do_stop()). Without this
> check, the complete() drags the sleeping process (which called
> stop_machine) onto the dying CPU.
>
> Hmm, maybe we could explicitly exclude downed cpu from calling task's
> mask. Kind of hacky: theoretically you should never modify a task's
> affinity unless the user actually asked for it (since anyone can ask for
> another tasks affinity).
>

Ahh yeah, ok. Well, it seems this isn't required with Srivatsa's
patch, although it would be required again if my lazy
migrate patch were to go on top of that.

In that case, I'm happy with moving migrate_all_tasks to
CPU_DEAD handling, and keeping the lazy migrate patch out of
the tree.

2004-04-07 05:00:33

by Srivatsa Vaddagiri

Subject: Re: [Experimental CPU Hotplug PATCH] - Move migrate_all_tasks to CPU_DEAD handling

On Wed, Apr 07, 2004 at 01:54:34PM +1000, Rusty Russell wrote:
> No, it really is required.
>
> The stop_machine thread runs on the cpu, then kicks everyone else off,
> then does a complete() (in stop_machine.c:do_stop()). Without this
> check, the complete() drags the sleeping process (which called
> stop_machine) onto the dying CPU.

Precisely. That's why I ended up adding this in cpu_down!!


+ /* Ensure that we are not runnable on dying cpu */
+ old_allowed = current->cpus_allowed;
+ tmp = CPU_MASK_ALL;
+ cpu_clear(cpu, tmp);
+ set_cpus_allowed(current, tmp);

I restore the mask, though (under cover of lock_cpu_hotplug), before
returning from cpu_down. The task should never see this violated affinity.

Rusty,
What do you think about the whole patch? It has withstood
my stress-test harness :-)


--


Thanks and Regards,
Srivatsa Vaddagiri,
Linux Technology Center,
IBM Software Labs,
Bangalore, INDIA - 560017

2004-04-07 05:32:57

by Rusty Russell

Subject: Re: [Experimental CPU Hotplug PATCH] - Move migrate_all_tasks to CPU_DEAD handling

On Wed, 2004-04-07 at 15:01, Srivatsa Vaddagiri wrote:
> I restore the mask though (under covers of lock_cpu_hotplug) before
> returning from cpu_down. Task should never see this violated affinity.

But other tasks can do a getaffinity() on it and see the wrong affinity.
Probably not a big issue.

> Rusty,
> What do you think abt the whole patch? It has withstood
> my stress-test harness :-)

I agree with Ingo: it's clever, well done. Minor nitpicks:

+void migrate_all_tasks(int cpu)
{
struct task_struct *tsk, *t;
int dest_cpu, src_cpu;
unsigned int node;

- /* We're nailed to this CPU. */
- src_cpu = smp_processor_id();
+ src_cpu = cpu;

Just make the parameter name "src_cpu"?


+ /* Take idle task off runqueue and restore it's
+ * policy/priority
+ */
+ rq = task_rq_lock(rq->idle, &flags);
+
+ /* Call init_idle instead ?? init_idle doesn't restore the
+ * policy though for us ..
+ */
+ deactivate_task(rq->idle, rq);
+ __setscheduler(rq->idle, SCHED_NORMAL, MAX_PRIO);
+
+ task_rq_unlock(rq, &flags);

One-line comments and compact code good:

/* Idle task back to normal (off runqueue, low prio) */
rq = task_rq_lock(rq->idle, &flags);
deactivate_task(rq->idle, rq);
__setscheduler(rq->idle, SCHED_NORMAL, MAX_PRIO);
task_rq_unlock(rq, &flags);



+ /* Force scheduler to switch to idle task when we yield.
+ * We expect idle task to _immediately_ notice that it's cpu
+ * is offline and die quickly.
+ *
+ * This allows us to defer calling mirate_all_tasks until
+ * CPU_DEAD notification time.
+ */
+ sched_idle_next();

This comment's very big. They don't need to know all the things we
don't do. I'd prefer:

/* Force idle task to run as soon as we yield: it should
immediately notice cpu is offline and die quickly. */

I'm happy for this to go in..
Rusty
--
Anyone who quotes me in their signature is an idiot -- Rusty Russell

2004-04-07 14:16:35

by Srivatsa Vaddagiri

Subject: Re: [Experimental CPU Hotplug PATCH] - Move migrate_all_tasks to CPU_DEAD handling

On Wed, Apr 07, 2004 at 03:32:12PM +1000, Rusty Russell wrote:
> But other tasks can do a getaffinity() on it and see the wrong affinity.
> Probably not a big issue.

Hmm .. the fact that getaffinity reads the cpus_allowed mask without taking
lock_cpu_hotplug makes it already racy wrt setaffinity?

Maybe it needs to take CPU hotplug sem before it reads the mask?

> I agree with Ingo: it's clever, well done. Minor nitpicks:
>
> +void migrate_all_tasks(int cpu)
> {
> struct task_struct *tsk, *t;
> int dest_cpu, src_cpu;
> unsigned int node;
>
> - /* We're nailed to this CPU. */
> - src_cpu = smp_processor_id();
> + src_cpu = cpu;
>
> Just make the parameter name "src_cpu"?

[snip]

> This comment's very big. They don't need to know all the things we
> don't do. I'd prefer:
>
> /* Force idle task to run as soon as we yield: it should
> immediately notice cpu is offline and die quickly. */

Sure, I will change as per your comments.

I would like to run my stress tests for a longer time before I send it
for inclusion (I will be on vacation till next Tuesday, so maybe I will
send in the patch after that!)

--


Thanks and Regards,
Srivatsa Vaddagiri,
Linux Technology Center,
IBM Software Labs,
Bangalore, INDIA - 560017

2004-04-07 22:57:28

by Rusty Russell

Subject: Re: [Experimental CPU Hotplug PATCH] - Move migrate_all_tasks to CPU_DEAD handling

On Thu, 2004-04-08 at 00:17, Srivatsa Vaddagiri wrote:
> On Wed, Apr 07, 2004 at 03:32:12PM +1000, Rusty Russell wrote:
> > But other tasks can do a getaffinity() on it and see the wrong affinity.
> > Probably not a big issue.
>
> hmm .. the fact that getaffinity reads the cpus_allowed mask w/o doing
> lock_cpu_hotplug makes it already racy wrt setaffinity?

But that's OK: that's a user race. It's like reading a file at the same
time as writing it.

> Maybe it needs to take CPU hotplug sem before it reads the mask?

Yes, taking it in both syscalls would work, too.

> I would like to run my stress tests for longer time before I send it
> for inclusion (i would be on vacation till next tuesday ..so maybe i will send
> in the patch after that!)

Thanks. BTW, can you take a look at the FIXME: in xics.c and change it
to be dynamic. Joel is having trouble proving it's a problem, and yet
Anton assures me that ioremap is needed, so CPUs not present at boot
which are brought in should be failing...

Thanks!
Rusty.
--
Anyone who quotes me in their signature is an idiot -- Rusty Russell

2004-04-12 16:08:48

by Srivatsa Vaddagiri

Subject: Re: [lhcs-devel] Re: [Experimental CPU Hotplug PATCH] - Move migrate_all_tasks to CPU_DEAD handling

On Wed, Apr 07, 2004 at 07:47:21PM +0530, Srivatsa Vaddagiri wrote:
> I would like to run my stress tests for longer time before I send it
> for inclusion

I had kept my stress tests running over the weekend and here's an
updated patch. Changes since last time:

- Register scheduler's callback at highest priority.
Task migration needs to happen before anything else.
(I have also been running with timer/softirq callbacks
at lowest prio of -10).

- Do write_lock_irq on the tasklist_lock (in migrate_all_tasks) instead of
 just write_lock. This protects against any code (signal handling?) that
 can attempt to take a read lock in interrupt context.

- In the preemption case, there is a narrow window where check_for_tasks
 will warn of a task still bound to the dead cpu. That task happens
 to be a newly created child: copy_process created the new task
 structure for the child and initialized it, but the parent was preempted
 before the child could be added to the task list. migrate_all_tasks won't
 find it. However, check_for_tasks _can_ find it and warn if the
 parent has not yet done wake_up_forked_process on the child. This
 is a false warning, hence I added some logic not to warn
 in this special case (although I don't think the logic is 100%
 correct - comments to fix this welcome!). See the rough timeline
 after these notes.

- Analyzing the above led to another task "leak", where a task
 can be woken up on a dead CPU. If CLONE_STOPPED is set, then do_fork
 won't wake up the child; it will leave it in a stopped state. In such
 a case, the newly created task can be affine to the dead CPU and
 migrate_all_tasks may not migrate it (since it didn't find it in the
 task table - see the copy_process preemption race explained above).
 When the stopped task is continued later, it can be added to the
 dead cpu's runqueue (?). Fixed this special case in do_fork.

Note: I think the above task leak would have been true in the old
scenario as well, where migrate_all_tasks was run with the rest of the
machine frozen.
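
Rough timeline of the two fork races described above (my reading of them):

  parent                                   cpu_down path
  ------                                   -------------
  do_fork() -> copy_process(): child's
  task_cpu set, child not yet on the
  task list; parent preempted here
                                           CPU_DEAD: migrate_all_tasks()
                                           walks the task list and never
                                           sees the child
  parent resumes, child is linked into
  the task list
                                           check_for_tasks() now sees a task
                                           bound to the dead cpu -> warning
                                           (false for a normal fork, which is
                                           fixed up by wake_up_forked_process)
  with CLONE_STOPPED there is no wakeup:
  the child stays TASK_STOPPED with
  task_cpu() pointing at the dead cpu,
  and a later wakeup could enqueue it
  there - hence the do_fork fix below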

Patches against both 2.6.5-mm4 and 2.6.5-ames follow. Rusty, please consider
them for inclusion.

Name : Defer migrate_all_tasks to CPU_DEAD handling
Author : Srivatsa Vaddagiri ([email protected])
Status : Tested on 2.6.5-mm4 on a 4-way Pentium box



---

linux-2.6.5-mm4-vatsa/include/linux/sched.h | 3
linux-2.6.5-mm4-vatsa/kernel/cpu.c | 29 +++++---
linux-2.6.5-mm4-vatsa/kernel/fork.c | 6 +
linux-2.6.5-mm4-vatsa/kernel/sched.c | 94 +++++++++++++++++++---------
4 files changed, 92 insertions(+), 40 deletions(-)

diff -puN include/linux/sched.h~migrate_all_tasks_in_CPU_DEAD include/linux/sched.h
--- linux-2.6.5-mm4/include/linux/sched.h~migrate_all_tasks_in_CPU_DEAD 2004-04-12 16:09:29.000000000 +0530
+++ linux-2.6.5-mm4-vatsa/include/linux/sched.h 2004-04-12 15:51:22.000000000 +0530
@@ -668,8 +668,7 @@ extern void sched_balance_exec(void);
#define sched_balance_exec() {}
#endif

-/* Move tasks off this (offline) CPU onto another. */
-extern void migrate_all_tasks(void);
+extern void sched_idle_next(void);
extern void set_user_nice(task_t *p, long nice);
extern int task_prio(task_t *p);
extern int task_nice(task_t *p);
diff -puN kernel/sched.c~migrate_all_tasks_in_CPU_DEAD kernel/sched.c
--- linux-2.6.5-mm4/kernel/sched.c~migrate_all_tasks_in_CPU_DEAD 2004-04-12 14:17:16.000000000 +0530
+++ linux-2.6.5-mm4-vatsa/kernel/sched.c 2004-04-12 16:33:29.000000000 +0530
@@ -386,6 +386,15 @@ static inline void __activate_task(task_
rq->nr_running++;
}

+/*
+ * __activate_idle_task - move idle task to the _front_ of runqueue.
+ */
+static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
+{
+ enqueue_task_head(p, rq->active);
+ rq->nr_running++;
+}
+
static void recalc_task_prio(task_t *p, unsigned long long now)
{
unsigned long long __sleep_time = now - p->timestamp;
@@ -749,7 +758,7 @@ static int try_to_wake_up(task_t * p, un
this_cpu = smp_processor_id();

#ifdef CONFIG_SMP
- if (unlikely(task_running(rq, p) || cpu_is_offline(this_cpu)))
+ if (unlikely(task_running(rq, p)))
goto out_activate;

new_cpu = cpu;
@@ -1682,9 +1691,6 @@ static inline void idle_balance(int this
{
struct sched_domain *sd;

- if (unlikely(cpu_is_offline(this_cpu)))
- return;
-
for_each_domain(this_cpu, sd) {
if (sd->flags & SD_BALANCE_NEWIDLE) {
if (load_balance_newidle(this_cpu, this_rq, sd)) {
@@ -1772,9 +1778,6 @@ static void rebalance_tick(int this_cpu,
unsigned long j = jiffies + CPU_OFFSET(this_cpu);
struct sched_domain *sd;

- if (unlikely(cpu_is_offline(this_cpu)))
- return;
-
/* Update our load */
old_load = this_rq->cpu_load;
this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
@@ -3223,15 +3226,16 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
* So we race with normal scheduler movements, but that's OK, as long
* as the task is no longer on this CPU.
*/
-static void __migrate_task(struct task_struct *p, int dest_cpu)
+static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
- runqueue_t *rq_dest;
+ runqueue_t *rq_dest, *rq_src;

+ rq_src = cpu_rq(src_cpu);
rq_dest = cpu_rq(dest_cpu);

- double_rq_lock(this_rq(), rq_dest);
+ double_rq_lock(rq_src, rq_dest);
/* Already moved. */
- if (task_cpu(p) != smp_processor_id())
+ if (task_cpu(p) != src_cpu)
goto out;
/* Affinity changed (again). */
if (!cpu_isset(dest_cpu, p->cpus_allowed))
@@ -3239,7 +3243,7 @@ static void __migrate_task(struct task_s

set_task_cpu(p, dest_cpu);
if (p->array) {
- deactivate_task(p, this_rq());
+ deactivate_task(p, rq_src);
activate_task(p, rq_dest);
if (TASK_PREEMPTS_CURR(p, rq_dest))
resched_task(rq_dest->curr);
@@ -3247,7 +3251,7 @@ static void __migrate_task(struct task_s
p->timestamp = rq_dest->timestamp_last_tick;

out:
- double_rq_unlock(this_rq(), rq_dest);
+ double_rq_unlock(rq_src, rq_dest);
}

/*
@@ -3290,7 +3294,7 @@ static int migration_thread(void * data)
spin_unlock(&rq->lock);

if (req->type == REQ_MOVE_TASK) {
- __migrate_task(req->task, req->dest_cpu);
+ __migrate_task(req->task, smp_processor_id(), req->dest_cpu);
} else if (req->type == REQ_SET_DOMAIN) {
rq->sd = req->sd;
} else {
@@ -3305,20 +3309,14 @@ static int migration_thread(void * data)
}

#ifdef CONFIG_HOTPLUG_CPU
-/* migrate_all_tasks - function to migrate all the tasks from the
- * current cpu caller must have already scheduled this to the target
- * cpu via set_cpus_allowed. Machine is stopped. */
-void migrate_all_tasks(void)
+/* migrate_all_tasks - function to migrate all tasks from the dead cpu. */
+static void migrate_all_tasks(int src_cpu)
{
struct task_struct *tsk, *t;
- int dest_cpu, src_cpu;
+ int dest_cpu;
unsigned int node;

- /* We're nailed to this CPU. */
- src_cpu = smp_processor_id();
-
- /* Not required, but here for neatness. */
- write_lock(&tasklist_lock);
+ write_lock_irq(&tasklist_lock);

/* watch out for per node tasks, let's stay on this node */
node = cpu_to_node(src_cpu);
@@ -3354,10 +3352,36 @@ void migrate_all_tasks(void)
tsk->pid, tsk->comm, src_cpu);
}

- __migrate_task(tsk, dest_cpu);
+ __migrate_task(tsk, src_cpu, dest_cpu);
} while_each_thread(t, tsk);

- write_unlock(&tasklist_lock);
+ write_unlock_irq(&tasklist_lock);
+}
+
+/* Schedules idle task to be the next runnable task on current CPU.
+ * It does so by boosting its priority to highest possible and adding it to
+ * the _front_ of runqueue. Used by CPU offline code.
+ */
+void sched_idle_next(void)
+{
+ int cpu = smp_processor_id();
+ runqueue_t *rq = this_rq();
+ struct task_struct *p = rq->idle;
+ unsigned long flags;
+
+ /* cpu has to be offline */
+ BUG_ON(cpu_online(cpu));
+
+ /* Strictly not necessary since rest of the CPUs are stopped by now
+ * and interrupts disabled on current cpu.
+ */
+ spin_lock_irqsave(&rq->lock, flags);
+
+ __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
+ /* Add idle task to _front_ of it's priority queue */
+ __activate_idle_task(p, rq);
+
+ spin_unlock_irqrestore(&rq->lock, flags);
}
#endif /* CONFIG_HOTPLUG_CPU */

@@ -3393,18 +3417,32 @@ static int migration_call(struct notifie
case CPU_UP_CANCELED:
/* Unbind it from offline cpu so it can run. Fall thru. */
kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
- case CPU_DEAD:
kthread_stop(cpu_rq(cpu)->migration_thread);
cpu_rq(cpu)->migration_thread = NULL;
- BUG_ON(cpu_rq(cpu)->nr_running != 0);
+ break;
+ case CPU_DEAD:
+ migrate_all_tasks(cpu);
+ rq = cpu_rq(cpu);
+ kthread_stop(rq->migration_thread);
+ rq->migration_thread = NULL;
+ /* Idle task back to normal (off runqueue, low prio) */
+ rq = task_rq_lock(rq->idle, &flags);
+ deactivate_task(rq->idle, rq);
+ __setscheduler(rq->idle, SCHED_NORMAL, MAX_PRIO);
+ task_rq_unlock(rq, &flags);
+ BUG_ON(rq->nr_running != 0);
break;
#endif
}
return NOTIFY_OK;
}

+/* Register at highest priority so that task migration (migrate_all_tasks)
+ * happens before anything else.
+ */
static struct notifier_block __devinitdata migration_notifier = {
.notifier_call = migration_call,
+ .priority = 10
};

int __init migration_init(void)
diff -puN kernel/cpu.c~migrate_all_tasks_in_CPU_DEAD kernel/cpu.c
--- linux-2.6.5-mm4/kernel/cpu.c~migrate_all_tasks_in_CPU_DEAD 2004-04-12 14:17:16.000000000 +0530
+++ linux-2.6.5-mm4-vatsa/kernel/cpu.c 2004-04-12 21:27:43.000000000 +0530
@@ -43,15 +43,16 @@ void unregister_cpu_notifier(struct noti
EXPORT_SYMBOL(unregister_cpu_notifier);

#ifdef CONFIG_HOTPLUG_CPU
-static inline void check_for_tasks(int cpu, struct task_struct *k)
+static inline void check_for_tasks(int cpu)
{
struct task_struct *p;

write_lock_irq(&tasklist_lock);
for_each_process(p) {
- if (task_cpu(p) == cpu && p != k)
- printk(KERN_WARNING "Task %s is on cpu %d\n",
- p->comm, cpu);
+ if (task_cpu(p) == cpu && (p->utime != 0 || p->stime != 0))
+ printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
+ (state = %ld, flags = %lx) \n",
+ p->comm, p->pid, cpu, p->state, p->flags);
}
write_unlock_irq(&tasklist_lock);
}
@@ -96,8 +97,9 @@ static int take_cpu_down(void *unused)
if (err < 0)
cpu_set(smp_processor_id(), cpu_online_map);
else
- /* Everyone else gets kicked off. */
- migrate_all_tasks();
+ /* Force idle task to run as soon as we yield: it should
+ immediately notice cpu is offline and die quickly. */
+ sched_idle_next();

return err;
}
@@ -106,6 +108,7 @@ int cpu_down(unsigned int cpu)
{
int err;
struct task_struct *p;
+ cpumask_t old_allowed, tmp;

if ((err = lock_cpu_hotplug_interruptible()) != 0)
return err;
@@ -120,17 +123,21 @@ int cpu_down(unsigned int cpu)
goto out;
}

+ /* Ensure that we are not runnable on dying cpu */
+ old_allowed = current->cpus_allowed;
+ tmp = CPU_MASK_ALL;
+ cpu_clear(cpu, tmp);
+ set_cpus_allowed(current, tmp);
+
p = __stop_machine_run(take_cpu_down, NULL, cpu);
if (IS_ERR(p)) {
err = PTR_ERR(p);
- goto out;
+ goto out_allowed;
}

if (cpu_online(cpu))
goto out_thread;

- check_for_tasks(cpu, p);
-
/* Wait for it to sleep (leaving idle task). */
while (!idle_cpu(cpu))
yield();
@@ -146,10 +153,14 @@ int cpu_down(unsigned int cpu)
== NOTIFY_BAD)
BUG();

+ check_for_tasks(cpu);
+
cpu_run_sbin_hotplug(cpu, "offline");

out_thread:
err = kthread_stop(p);
+out_allowed:
+ set_cpus_allowed(current, old_allowed);
out:
unlock_cpu_hotplug();
return err;
diff -puN kernel/fork.c~migrate_all_tasks_in_CPU_DEAD kernel/fork.c
--- linux-2.6.5-mm4/kernel/fork.c~migrate_all_tasks_in_CPU_DEAD 2004-04-12 14:17:16.000000000 +0530
+++ linux-2.6.5-mm4-vatsa/kernel/fork.c 2004-04-12 15:59:01.000000000 +0530
@@ -33,6 +33,7 @@
#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/audit.h>
+#include <linux/cpu.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -1198,8 +1199,11 @@ long do_fork(unsigned long clone_flags,

if (!(clone_flags & CLONE_STOPPED))
wake_up_forked_process(p); /* do this last */
- else
+ else {
p->state = TASK_STOPPED;
+ if (unlikely(cpu_is_offline(task_cpu(p))))
+ set_task_cpu(p, smp_processor_id());
+ }
++total_forks;

if (unlikely (trace)) {
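
A side note on the __enqueue_task()/__activate_idle_task() helpers in the
sched.c portion of the patch: they differ from the regular enqueue path only
in using list_add() instead of list_add_tail(), i.e. the idle task is inserted
at the head of its priority list rather than appended to the tail. The
userspace sketch below (simplified stand-ins for the <linux/list.h> helpers
plus a made-up 'struct task', not kernel code) shows the ordering this relies
on: anything queued later at the same priority ends up behind the entry that
was added with list_add().

/* Userspace sketch, not kernel code: minimal stand-ins for list_add() and
 * list_add_tail() from <linux/list.h>. */
#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void __list_add(struct list_head *new, struct list_head *prev,
		       struct list_head *next)
{
	next->prev = new;
	new->next = next;
	new->prev = prev;
	prev->next = new;
}

/* insert right after the head: front of the queue (__activate_idle_task) */
static void list_add(struct list_head *new, struct list_head *head)
{
	__list_add(new, head, head->next);
}

/* insert right before the head: back of the queue (normal enqueue path) */
static void list_add_tail(struct list_head *new, struct list_head *head)
{
	__list_add(new, head->prev, head);
}

#define list_entry(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct task { const char *comm; struct list_head run_list; };

int main(void)
{
	struct list_head queue;
	struct task idle  = { .comm = "idle/swapper" };
	struct task woken = { .comm = "task woken later at same prio" };

	INIT_LIST_HEAD(&queue);

	list_add(&idle.run_list, &queue);	/* front, as sched_idle_next() does */
	list_add_tail(&woken.run_list, &queue);	/* back, as a normal wakeup would */

	/* the scheduler dequeues from the front of the priority list */
	printf("runs first: %s\n",
	       list_entry(queue.next, struct task, run_list)->comm);
	return 0;
}

Built with plain gcc this prints "runs first: idle/swapper", which is the
ordering the patch depends on once the idle task has been queued at the front.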




Name : Defer migrate_all_tasks to CPU_DEAD handling
Author : Srivatsa Vaddagiri ([email protected])
Status : Tested on 2.6.5-ames on a 4-way PPC64 box (p630)


---

ameslab-vatsa/include/linux/sched.h | 3 -
ameslab-vatsa/kernel/cpu.c | 29 +++++++---
ameslab-vatsa/kernel/fork.c | 6 +-
ameslab-vatsa/kernel/sched.c | 101 ++++++++++++++++++++++++++----------
4 files changed, 101 insertions(+), 38 deletions(-)

diff -puN include/linux/sched.h~migrate_all_tasks_in_CPU_DEAD include/linux/sched.h
--- ameslab/include/linux/sched.h~migrate_all_tasks_in_CPU_DEAD 2004-04-12 16:16:57.000000000 +0530
+++ ameslab-vatsa/include/linux/sched.h 2004-04-12 16:18:01.000000000 +0530
@@ -549,8 +549,7 @@ extern void node_nr_running_init(void);
#define node_nr_running_init() {}
#endif

-/* Move tasks off this (offline) CPU onto another. */
-extern void migrate_all_tasks(void);
+extern void sched_idle_next(void);
extern void set_user_nice(task_t *p, long nice);
extern int task_prio(task_t *p);
extern int task_nice(task_t *p);
diff -puN kernel/sched.c~migrate_all_tasks_in_CPU_DEAD kernel/sched.c
--- ameslab/kernel/sched.c~migrate_all_tasks_in_CPU_DEAD 2004-04-12 16:16:57.000000000 +0530
+++ ameslab-vatsa/kernel/sched.c 2004-04-12 21:32:20.000000000 +0530
@@ -342,6 +342,14 @@ static inline void enqueue_task(struct t
p->array = array;
}

+static inline void __enqueue_task(struct task_struct *p, prio_array_t *array)
+{
+ list_add(&p->run_list, array->queue + p->prio);
+ __set_bit(p->prio, array->bitmap);
+ array->nr_active++;
+ p->array = array;
+}
+
/*
* effective_prio - return the priority that is based on the static
* priority but is modified by bonuses/penalties.
@@ -382,6 +390,15 @@ static inline void __activate_task(task_
nr_running_inc(rq);
}

+/*
+ * __activate_idle_task - move idle task to the _front_ of runqueue.
+ */
+static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
+{
+ __enqueue_task(p, rq->active);
+ nr_running_inc(rq);
+}
+
static void recalc_task_prio(task_t *p, unsigned long long now)
{
unsigned long long __sleep_time = now - p->timestamp;
@@ -666,8 +683,7 @@ repeat_lock_task:
if (unlikely(sync && !task_running(rq, p) &&
(task_cpu(p) != smp_processor_id()) &&
cpu_isset(smp_processor_id(),
- p->cpus_allowed) &&
- !cpu_is_offline(smp_processor_id()))) {
+ p->cpus_allowed))) {
set_task_cpu(p, smp_processor_id());
task_rq_unlock(rq, &flags);
goto repeat_lock_task;
@@ -1301,9 +1317,6 @@ static void load_balance(runqueue_t *thi
struct list_head *head, *curr;
task_t *tmp;

- if (cpu_is_offline(this_cpu))
- goto out;
-
busiest = find_busiest_queue(this_rq, this_cpu, idle,
&imbalance, cpumask);
if (!busiest)
@@ -2737,19 +2750,20 @@ out:
EXPORT_SYMBOL_GPL(set_cpus_allowed);

/* Move (not current) task off this cpu, onto dest cpu. */
-static void move_task_away(struct task_struct *p, int dest_cpu)
+static void move_task_away(struct task_struct *p, int src_cpu, int dest_cpu)
{
- runqueue_t *rq_dest;
+ runqueue_t *rq_dest, *rq_src;

+ rq_src = cpu_rq(src_cpu);
rq_dest = cpu_rq(dest_cpu);

- double_rq_lock(this_rq(), rq_dest);
- if (task_cpu(p) != smp_processor_id())
+ double_rq_lock(rq_src, rq_dest);
+ if (task_cpu(p) != src_cpu)
goto out; /* Already moved */

set_task_cpu(p, dest_cpu);
if (p->array) {
- deactivate_task(p, this_rq());
+ deactivate_task(p, rq_src);
activate_task(p, rq_dest);
if (p->prio < rq_dest->curr->prio)
resched_task(rq_dest->curr);
@@ -2757,7 +2771,7 @@ static void move_task_away(struct task_s
p->timestamp = rq_dest->timestamp_last_tick;

out:
- double_rq_unlock(this_rq(), rq_dest);
+ double_rq_unlock(rq_src, rq_dest);
}

/*
@@ -2792,7 +2806,7 @@ static int migration_thread(void * data)
list_del_init(head->next);
spin_unlock(&rq->lock);

- move_task_away(req->task,
+ move_task_away(req->task, smp_processor_id(),
any_online_cpu(req->task->cpus_allowed));
local_irq_enable();
complete(&req->done);
@@ -2801,20 +2815,14 @@ static int migration_thread(void * data)
}

#ifdef CONFIG_HOTPLUG_CPU
-/* migrate_all_tasks - function to migrate all the tasks from the
- * current cpu caller must have already scheduled this to the target
- * cpu via set_cpus_allowed. Machine is stopped. */
-void migrate_all_tasks(void)
+/* migrate_all_tasks - function to migrate all tasks from the dead cpu. */
+static void migrate_all_tasks(int src_cpu)
{
struct task_struct *tsk, *t;
- int dest_cpu, src_cpu;
+ int dest_cpu;
unsigned int node;

- /* We're nailed to this CPU. */
- src_cpu = smp_processor_id();
-
- /* Not required, but here for neatness. */
- write_lock(&tasklist_lock);
+ write_lock_irq(&tasklist_lock);

/* watch out for per node tasks, let's stay on this node */
node = cpu_to_node(src_cpu);
@@ -2850,10 +2858,37 @@ void migrate_all_tasks(void)
tsk->pid, tsk->comm, src_cpu);
}

- move_task_away(tsk, dest_cpu);
+ move_task_away(tsk, src_cpu, dest_cpu);
} while_each_thread(t, tsk);

- write_unlock(&tasklist_lock);
+ write_unlock_irq(&tasklist_lock);
+}
+
+/* Schedules idle task to be the next runnable task on current CPU.
+ * It does so by boosting its priority to highest possible and adding it to
+ * the _front_ of runqueue. Used by CPU offline code.
+ */
+
+void sched_idle_next(void)
+{
+ int cpu = smp_processor_id();
+ runqueue_t *rq = this_rq();
+ struct task_struct *p = rq->idle;
+ unsigned long flags;
+
+ /* cpu has to be offline */
+ BUG_ON(cpu_online(cpu));
+
+ /* Strictly not necessary since rest of the CPUs are stopped by now
+ * and interrupts disabled on current cpu.
+ */
+ spin_lock_irqsave(&rq->lock, flags);
+
+ __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
+ /* Add idle task to _front_ of its priority queue */
+ __activate_idle_task(p, rq);
+
+ spin_unlock_irqrestore(&rq->lock, flags);
}
#endif /* CONFIG_HOTPLUG_CPU */

@@ -2889,18 +2924,32 @@ static int migration_call(struct notifie
case CPU_UP_CANCELED:
/* Unbind it from offline cpu so it can run. Fall thru. */
kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
- case CPU_DEAD:
kthread_stop(cpu_rq(cpu)->migration_thread);
cpu_rq(cpu)->migration_thread = NULL;
- BUG_ON(cpu_rq(cpu)->nr_running != 0);
+ break;
+ case CPU_DEAD:
+ migrate_all_tasks(cpu);
+ rq = cpu_rq(cpu);
+ kthread_stop(rq->migration_thread);
+ rq->migration_thread = NULL;
+ /* Idle task back to normal (off runqueue, low prio) */
+ rq = task_rq_lock(rq->idle, &flags);
+ deactivate_task(rq->idle, rq);
+ __setscheduler(rq->idle, SCHED_NORMAL, MAX_PRIO);
+ task_rq_unlock(rq, &flags);
+ BUG_ON(rq->nr_running != 0);
break;
#endif
}
return NOTIFY_OK;
}

+/* Register at highest priority so that task migration (migrate_all_tasks)
+ * happens before anything else.
+ */
static struct notifier_block __devinitdata migration_notifier = {
.notifier_call = migration_call,
+ .priority = 10
};

int __init migration_init(void)
diff -puN kernel/fork.c~migrate_all_tasks_in_CPU_DEAD kernel/fork.c
--- ameslab/kernel/fork.c~migrate_all_tasks_in_CPU_DEAD 2004-04-12 16:16:57.000000000 +0530
+++ ameslab-vatsa/kernel/fork.c 2004-04-12 16:18:31.000000000 +0530
@@ -31,6 +31,7 @@
#include <linux/futex.h>
#include <linux/ptrace.h>
#include <linux/mount.h>
+#include <linux/cpu.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -1168,8 +1169,11 @@ long do_fork(unsigned long clone_flags,

if (!(clone_flags & CLONE_STOPPED))
wake_up_forked_process(p); /* do this last */
- else
+ else {
p->state = TASK_STOPPED;
+ if (unlikely(cpu_is_offline(task_cpu(p))))
+ set_task_cpu(p, smp_processor_id());
+ }
++total_forks;

if (unlikely (trace)) {
diff -puN kernel/cpu.c~migrate_all_tasks_in_CPU_DEAD kernel/cpu.c
--- ameslab/kernel/cpu.c~migrate_all_tasks_in_CPU_DEAD 2004-04-12 16:16:57.000000000 +0530
+++ ameslab-vatsa/kernel/cpu.c 2004-04-12 16:24:58.000000000 +0530
@@ -43,15 +43,16 @@ void unregister_cpu_notifier(struct noti
EXPORT_SYMBOL(unregister_cpu_notifier);

#ifdef CONFIG_HOTPLUG_CPU
-static inline void check_for_tasks(int cpu, struct task_struct *k)
+static inline void check_for_tasks(int cpu)
{
struct task_struct *p;

write_lock_irq(&tasklist_lock);
for_each_process(p) {
- if (task_cpu(p) == cpu && p != k)
- printk(KERN_WARNING "Task %s is on cpu %d\n",
- p->comm, cpu);
+ if (task_cpu(p) == cpu && (p->utime != 0 || p->stime != 0))
+ printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
+ (state = %ld, flags = %lx) \n",
+ p->comm, p->pid, cpu, p->state, p->flags);
}
write_unlock_irq(&tasklist_lock);
}
@@ -96,8 +97,9 @@ static int take_cpu_down(void *unused)
if (err < 0)
cpu_set(smp_processor_id(), cpu_online_map);
else
- /* Everyone else gets kicked off. */
- migrate_all_tasks();
+ /* Force idle task to run as soon as we yield: it should
+ immediately notice cpu is offline and die quickly. */
+ sched_idle_next();

return err;
}
@@ -106,6 +108,7 @@ int cpu_down(unsigned int cpu)
{
int err;
struct task_struct *p;
+ cpumask_t old_allowed, tmp;

if ((err = lock_cpu_hotplug_interruptible()) != 0)
return err;
@@ -120,17 +123,21 @@ int cpu_down(unsigned int cpu)
goto out;
}

+ /* Ensure that we are not runnable on dying cpu */
+ old_allowed = current->cpus_allowed;
+ tmp = CPU_MASK_ALL;
+ cpu_clear(cpu, tmp);
+ set_cpus_allowed(current, tmp);
+
p = __stop_machine_run(take_cpu_down, NULL, cpu);
if (IS_ERR(p)) {
err = PTR_ERR(p);
- goto out;
+ goto out_allowed;
}

if (cpu_online(cpu))
goto out_thread;

- check_for_tasks(cpu, p);
-
/* Wait for it to sleep (leaving idle task). */
while (!idle_cpu(cpu))
yield();
@@ -146,10 +153,14 @@ int cpu_down(unsigned int cpu)
== NOTIFY_BAD)
BUG();

+ check_for_tasks(cpu);
+
cpu_run_sbin_hotplug(cpu, "offline");

out_thread:
err = kthread_stop(p);
+out_allowed:
+ set_cpus_allowed(current, old_allowed);
out:
unlock_cpu_hotplug();
return err;
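
One more remark on the migration_notifier change: notifier chains are kept
sorted by descending .priority and called in that order, so bumping the
priority to 10 is what makes migrate_all_tasks() run on CPU_DEAD before any
default-priority (0) callbacks see the event. Below is a rough userspace
sketch of that ordering; the chain_register()/chain_call() helpers are
made-up stand-ins for notifier_chain_register()/notifier_call_chain(), not
the kernel implementation.

/* Userspace sketch, not kernel code. */
#include <stdio.h>

struct notifier_block {
	int (*notifier_call)(int cpu);
	int priority;
	struct notifier_block *next;
};

/* keep the chain sorted by descending priority, as the kernel does */
static void chain_register(struct notifier_block **chain,
			   struct notifier_block *nb)
{
	while (*chain && (*chain)->priority >= nb->priority)
		chain = &(*chain)->next;
	nb->next = *chain;
	*chain = nb;
}

static void chain_call(struct notifier_block *chain, int cpu)
{
	for (; chain; chain = chain->next)
		chain->notifier_call(cpu);
}

static int migration_cb(int cpu)
{
	printf("migration notifier (prio 10): migrate tasks off cpu %d\n", cpu);
	return 0;
}

static int other_cb(int cpu)
{
	printf("other subsystem (prio 0): CPU_DEAD cleanup for cpu %d\n", cpu);
	return 0;
}

int main(void)
{
	struct notifier_block *chain = NULL;
	struct notifier_block migration = { migration_cb, 10, NULL };
	struct notifier_block other     = { other_cb,      0, NULL };

	/* registration order does not matter; priority decides call order */
	chain_register(&chain, &other);
	chain_register(&chain, &migration);

	chain_call(chain, 2);	/* migration line is printed first */
	return 0;
}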

--


Thanks and Regards,
Srivatsa Vaddagiri,
Linux Technology Center,
IBM Software Labs,
Bangalore, INDIA - 560017