2011-06-07 20:16:07

by Suresh Siddha

Subject: [patch v2 1/2] stop_machine: enable __stop_machine() to be called from the cpu online path

Currently the stop machine infrastructure can be called only from a cpu
that is online. But for !CONFIG_SMP, we do allow calling __stop_machine()
before the cpu is online.

x86, for example, requires the stop machine infrastructure in the cpu
online path and currently implements its own stop machine (using
stop_one_cpu_nowait()) for MTRR initialization in that path.

Enhance __stop_machine() so that it can be called before the cpu is
onlined. This will pave the way for code consolidation and address
potential deadlocks caused by multiple mechanisms of doing system-wide
rendezvous.

This will also address the behavioral differences of __stop_machine()
between SMP and UP builds.

Signed-off-by: Suresh Siddha <[email protected]>
Cc: [email protected] # v2.6.35+
---
kernel/stop_machine.c | 62 +++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 57 insertions(+), 5 deletions(-)

Index: linux-2.6-tip/kernel/stop_machine.c
===================================================================
--- linux-2.6-tip.orig/kernel/stop_machine.c
+++ linux-2.6-tip/kernel/stop_machine.c
@@ -136,12 +136,35 @@ void stop_one_cpu_nowait(unsigned int cp
static DEFINE_MUTEX(stop_cpus_mutex);
static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);

+/**
+ * __stop_cpus - stop multiple cpus
+ * @cpumask: cpus to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Execute @fn(@arg) on online cpus in @cpumask. If @cpumask is NULL, @fn
+ * is run on all the online cpus including the current cpu (even if it
+ * is not online).
+ * On each target cpu, @fn is run in a process context (except when run on
+ * the cpu that is in the process of coming online, in which case @fn is run
+ * in the same context as __stop_cpus()) with the highest priority
+ * preempting any task on the cpu and monopolizing it. This function
+ * returns after all executions are complete.
+ */
int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
{
+ int online = percpu_read(cpu_stopper.enabled);
+ int include_this_offline = 0;
struct cpu_stop_work *work;
struct cpu_stop_done done;
+ unsigned int weight;
unsigned int cpu;

+ if (!cpumask) {
+ cpumask = cpu_online_mask;
+ include_this_offline = 1;
+ }
+
/* initialize works and done */
for_each_cpu(cpu, cpumask) {
work = &per_cpu(stop_cpus_work, cpu);
@@ -149,7 +172,12 @@ int __stop_cpus(const struct cpumask *cp
work->arg = arg;
work->done = &done;
}
- cpu_stop_init_done(&done, cpumask_weight(cpumask));
+
+ weight = cpumask_weight(cpumask);
+ if (!online && include_this_offline)
+ weight++;
+
+ cpu_stop_init_done(&done, weight);

/*
* Disable preemption while queueing to avoid getting
@@ -162,7 +190,20 @@ int __stop_cpus(const struct cpumask *cp
&per_cpu(stop_cpus_work, cpu));
preempt_enable();

- wait_for_completion(&done.completion);
+ if (online)
+ wait_for_completion(&done.completion);
+ else {
+ /*
+ * This cpu is not yet online. If @fn needs to be run on this
+ * cpu, run it now. Also, we can't afford to sleep here,
+ * so poll till the work is completed on all the cpu's.
+ */
+ if (include_this_offline)
+ fn(arg);
+ while (atomic_read(&done.nr_todo) > 1)
+ cpu_relax();
+ }
+
return done.executed ? done.ret : -ENOENT;
}

@@ -431,6 +472,7 @@ static int stop_machine_cpu_stop(void *d
struct stop_machine_data *smdata = data;
enum stopmachine_state curstate = STOPMACHINE_NONE;
int cpu = smp_processor_id(), err = 0;
+ unsigned long flags = 0;
bool is_active;

if (!smdata->active_cpus)
@@ -446,7 +488,7 @@ static int stop_machine_cpu_stop(void *d
curstate = smdata->state;
switch (curstate) {
case STOPMACHINE_DISABLE_IRQ:
- local_irq_disable();
+ local_irq_save(flags);
hard_irq_disable();
break;
case STOPMACHINE_RUN:
@@ -460,7 +502,7 @@ static int stop_machine_cpu_stop(void *d
}
} while (curstate != STOPMACHINE_EXIT);

- local_irq_enable();
+ local_irq_restore(flags);
return err;
}

@@ -470,9 +512,19 @@ int __stop_machine(int (*fn)(void *), vo
.num_threads = num_online_cpus(),
.active_cpus = cpus };

+ /* Include the calling cpu that might not be online yet. */
+ if (!percpu_read(cpu_stopper.enabled))
+ smdata.num_threads++;
+
/* Set the initial state and stop all online cpus. */
set_state(&smdata, STOPMACHINE_PREPARE);
- return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
+
+ if (percpu_read(cpu_stopper.enabled))
+ return stop_cpus(cpu_online_mask, stop_machine_cpu_stop,
+ &smdata);
+ else
+ return __stop_cpus(NULL, stop_machine_cpu_stop,
+ &smdata);
}

int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
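
For readers following along, the nr_todo busy-wait above can be
illustrated with a minimal, self-contained userspace analogue (plain C11
with pthreads; the names here are made up and this is only a sketch of
the idea, not kernel code):

/*
 * N "online" workers run the rendezvous function in their own context
 * and signal a shared countdown; the "coming-online" caller runs it
 * inline and polls instead of sleeping, mirroring the
 * atomic_read(&done.nr_todo) > 1 loop in the patch.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NWORKERS 4

/* NWORKERS online participants plus one slot for the offline caller */
static atomic_int nr_todo = NWORKERS + 1;

static void *worker(void *arg)
{
	printf("online participant %ld ran fn()\n", (long)arg);
	atomic_fetch_sub(&nr_todo, 1);	/* cpu_stop_signal_done() analogue */
	return NULL;
}

int main(void)
{
	pthread_t tid[NWORKERS];
	long i;

	for (i = 0; i < NWORKERS; i++)	/* cpu_stop_queue_work() analogue */
		pthread_create(&tid[i], NULL, worker, (void *)i);

	/*
	 * The offline caller cannot sleep: run fn() inline, then poll
	 * until only its own slot remains -- hence "> 1", as in the patch.
	 */
	printf("offline caller ran fn() inline\n");
	while (atomic_load(&nr_todo) > 1)
		;			/* cpu_relax() analogue */

	for (i = 0; i < NWORKERS; i++)
		pthread_join(tid[i], NULL);
	return 0;
}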


2011-06-08 19:21:08

by Ingo Molnar

Subject: Re: [patch v2 1/2] stop_machine: enable __stop_machine() to be called from the cpu online path


Rusty, Tejun, what do you think about the patch below?

Thanks,

Ingo

* Suresh Siddha <[email protected]> wrote:

> Currently the stop machine infrastructure can be called only from a cpu
> that is online. But for !CONFIG_SMP, we do allow calling __stop_machine()
> before the cpu is online.
>
> x86, for example, requires the stop machine infrastructure in the cpu
> online path and currently implements its own stop machine (using
> stop_one_cpu_nowait()) for MTRR initialization in that path.
>
> Enhance __stop_machine() so that it can be called before the cpu is
> onlined. This will pave the way for code consolidation and address
> potential deadlocks caused by multiple mechanisms of doing system-wide
> rendezvous.
>
> This will also address the behavioral differences of __stop_machine()
> between SMP and UP builds.
>
> Signed-off-by: Suresh Siddha <[email protected]>
> Cc: [email protected] # v2.6.35+
> ---
> kernel/stop_machine.c | 62 +++++++++++++++++++++++++++++++++++++++++++++-----
> 1 file changed, 57 insertions(+), 5 deletions(-)
>
> Index: linux-2.6-tip/kernel/stop_machine.c
> ===================================================================
> --- linux-2.6-tip.orig/kernel/stop_machine.c
> +++ linux-2.6-tip/kernel/stop_machine.c
> @@ -136,12 +136,35 @@ void stop_one_cpu_nowait(unsigned int cp
> static DEFINE_MUTEX(stop_cpus_mutex);
> static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
>
> +/**
> + * __stop_cpus - stop multiple cpus
> + * @cpumask: cpus to stop
> + * @fn: function to execute
> + * @arg: argument to @fn
> + *
> + * Execute @fn(@arg) on online cpus in @cpumask. If @cpumask is NULL, @fn
> + * is run on all the online cpus including the current cpu (even if it
> + * is not online).
> + * On each target cpu, @fn is run in a process context (except when run on
> + * the cpu that is in the process of coming online, in which case @fn is run
> + * in the same context as __stop_cpus()) with the highest priority
> + * preempting any task on the cpu and monopolizing it. This function
> + * returns after all executions are complete.
> + */
> int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
> {
> + int online = percpu_read(cpu_stopper.enabled);
> + int include_this_offline = 0;
> struct cpu_stop_work *work;
> struct cpu_stop_done done;
> + unsigned int weight;
> unsigned int cpu;
>
> + if (!cpumask) {
> + cpumask = cpu_online_mask;
> + include_this_offline = 1;
> + }
> +
> /* initialize works and done */
> for_each_cpu(cpu, cpumask) {
> work = &per_cpu(stop_cpus_work, cpu);
> @@ -149,7 +172,12 @@ int __stop_cpus(const struct cpumask *cp
> work->arg = arg;
> work->done = &done;
> }
> - cpu_stop_init_done(&done, cpumask_weight(cpumask));
> +
> + weight = cpumask_weight(cpumask);
> + if (!online && include_this_offline)
> + weight++;
> +
> + cpu_stop_init_done(&done, weight);
>
> /*
> * Disable preemption while queueing to avoid getting
> @@ -162,7 +190,20 @@ int __stop_cpus(const struct cpumask *cp
> &per_cpu(stop_cpus_work, cpu));
> preempt_enable();
>
> - wait_for_completion(&done.completion);
> + if (online)
> + wait_for_completion(&done.completion);
> + else {
> + /*
> + * This cpu is not yet online. If @fn needs to be run on this
> + * cpu, run it now. Also, we can't afford to sleep here,
> + * so poll till the work is completed on all the cpu's.
> + */
> + if (include_this_offline)
> + fn(arg);
> + while (atomic_read(&done.nr_todo) > 1)
> + cpu_relax();
> + }
> +
> return done.executed ? done.ret : -ENOENT;
> }
>
> @@ -431,6 +472,7 @@ static int stop_machine_cpu_stop(void *d
> struct stop_machine_data *smdata = data;
> enum stopmachine_state curstate = STOPMACHINE_NONE;
> int cpu = smp_processor_id(), err = 0;
> + unsigned long flags = 0;
> bool is_active;
>
> if (!smdata->active_cpus)
> @@ -446,7 +488,7 @@ static int stop_machine_cpu_stop(void *d
> curstate = smdata->state;
> switch (curstate) {
> case STOPMACHINE_DISABLE_IRQ:
> - local_irq_disable();
> + local_irq_save(flags);
> hard_irq_disable();
> break;
> case STOPMACHINE_RUN:
> @@ -460,7 +502,7 @@ static int stop_machine_cpu_stop(void *d
> }
> } while (curstate != STOPMACHINE_EXIT);
>
> - local_irq_enable();
> + local_irq_restore(flags);
> return err;
> }
>
> @@ -470,9 +512,19 @@ int __stop_machine(int (*fn)(void *), vo
> .num_threads = num_online_cpus(),
> .active_cpus = cpus };
>
> + /* Include the calling cpu that might not be online yet. */
> + if (!percpu_read(cpu_stopper.enabled))
> + smdata.num_threads++;
> +
> /* Set the initial state and stop all online cpus. */
> set_state(&smdata, STOPMACHINE_PREPARE);
> - return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
> +
> + if (percpu_read(cpu_stopper.enabled))
> + return stop_cpus(cpu_online_mask, stop_machine_cpu_stop,
> + &smdata);
> + else
> + return __stop_cpus(NULL, stop_machine_cpu_stop,
> + &smdata);
> }
>
> int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
>

--
Thanks,

Ingo

2011-06-09 09:54:58

by Tejun Heo

Subject: Re: [patch v2 1/2] stop_machine: enable __stop_machine() to be called from the cpu online path

Hello, Suresh, Ingo.

On Tue, Jun 07, 2011 at 01:14:12PM -0700, Suresh Siddha wrote:
> Currently the stop machine infrastructure can be called only from a cpu
> that is online. But for !CONFIG_SMP, we do allow calling __stop_machine()
> before the cpu is online.
>
> x86, for example, requires the stop machine infrastructure in the cpu
> online path and currently implements its own stop machine (using
> stop_one_cpu_nowait()) for MTRR initialization in that path.
>
> Enhance __stop_machine() so that it can be called before the cpu is
> onlined. This will pave the way for code consolidation and address
> potential deadlocks caused by multiple mechanisms of doing system-wide
> rendezvous.
>
> This will also address the behavioral differences of __stop_machine()
> between SMP and UP builds.
>
> Signed-off-by: Suresh Siddha <[email protected]>
> Cc: [email protected] # v2.6.35+

First of all, I agree this is the correct thing to do, but I don't
agree with some of the details. Also, this is slightly scary for
-stable. Maybe we should opt for something less intrusive for
-stable?

> +/**
> + * __stop_cpus - stop multiple cpus
> + * @cpumask: cpus to stop
> + * @fn: function to execute
> + * @arg: argument to @fn
> + *
> + * Execute @fn(@arg) on online cpus in @cpumask. If @cpumask is NULL, @fn
> + * is run on all the online cpus including the current cpu (even if it
> + * is not online).
> + * On each target cpu, @fn is run in a process context (except when run on
> + * the cpu that is in the process of coming online, in which case @fn is run
> + * in the same context as __stop_cpus()) with the highest priority
> + * preempting any task on the cpu and monopolizing it. This function
> + * returns after all executions are complete.
> + */

It's minor, but can you please follow the comment style in the same
file? I.e., a blank line between paragraphs and separate CONTEXT: and
RETURNS: sections. On second thought, why is this function even
exported? It doesn't seem to have any out-of-file users left. Maybe
it's best to make it static?
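
For reference, the CONTEXT:/RETURNS: convention used elsewhere in
kernel/stop_machine.c looks roughly like this (a hypothetical restyling
of the comment above, not the final v3 text):

/**
 * __stop_cpus - stop multiple cpus
 * @cpumask: cpus to stop
 * @fn: function to execute
 * @arg: argument to @fn
 *
 * Execute @fn(@arg) on the online cpus in @cpumask.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * -ENOENT if @fn(@arg) was not executed at all, otherwise @fn's return
 * value.
 */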

> int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
> {
> + int online = percpu_read(cpu_stopper.enabled);
> + int include_this_offline = 0;
> struct cpu_stop_work *work;
> struct cpu_stop_done done;
> + unsigned int weight;
> unsigned int cpu;
>
> + if (!cpumask) {
> + cpumask = cpu_online_mask;
> + include_this_offline = 1;
> + }

This seems a bit too subtle. I would much prefer if it were much more
explicit, rather than using a non-obvious combination of conditions to
trigger this special behavior.

Hmm... Wouldn't it be better to change cpu_stop_queue_work() to consider
whether the local CPU is offline, and then add a cpu_stop_wait_done()
which does wait_for_completion() if the local CPU is online, and
otherwise executes fn locally, calls cpu_stop_signal_done() and waits in
a busy loop?

That would make the whole thing much more generic and easier to
describe. The current implementation seems quite hacky/subtle and
doesn't fit well with the rest. It would be much better if we could
just state "if used from a local CPU which is not online and the target
@cpumask includes the local CPU, the work item is executed on-stack
and completion is waited for in a busy loop" for all cpu_stop functions.
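
A rough sketch of the shape being suggested (hypothetical --
cpu_stop_wait_done() does not exist in this patch; field and helper
names follow those in kernel/stop_machine.c of this era):

/*
 * Hypothetical helper: wait for queued cpu_stop work in a way that is
 * safe even when the local cpu is not yet online.
 */
static void cpu_stop_wait_done(struct cpu_stop_done *done,
			       cpu_stop_fn_t fn, void *arg)
{
	int err;

	if (percpu_read(cpu_stopper.enabled)) {
		/* Local cpu is online: safe to sleep. */
		wait_for_completion(&done->completion);
		return;
	}

	/*
	 * Local cpu is coming online: execute the work on-stack, then
	 * busy-wait for the remote cpus, since sleeping is not allowed.
	 */
	err = fn(arg);
	if (err)
		done->ret = err;
	cpu_stop_signal_done(done, true);
	while (atomic_read(&done->nr_todo) > 0)
		cpu_relax();
}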

Also, it would be better to factor out work item execution and
completion from cpu_stopper_thread() and call that instead of invoking
fn(arg) directly.

Thank you.

--
tejun

2011-06-09 22:49:57

by Suresh Siddha

Subject: Re: [patch v2 1/2] stop_machine: enable __stop_machine() to be called from the cpu online path

On Thu, 2011-06-09 at 02:54 -0700, Tejun Heo wrote:
> First of all, I agree this is the correct thing to do but I don't
> agree with some of the details. Also, this is slightly scary for
> -stable. Maybe we should opt for something less intrusive for
> -stable?

You probably noticed on the SUSE bug that I first proposed a simple,
non-intrusive patch to address this problem before actually posting
these two complete patches. I am ok with pushing that simple patch
(which is appended below to this thread) for -stable and considering
these two patches for mainline. Ingo, HPA, if we go that route, drop the
-stable tag from the v3 version of the patches.

But I do think these two patches are not that complex, and if they are
good for mainline, they should be good for -stable as well. Anyway, I am
open either way (and have appended the stable-only patch at the end).

>
> > +/**
> > + * __stop_cpus - stop multiple cpus
> > + * @cpumask: cpus to stop
> > + * @fn: function to execute
> > + * @arg: argument to @fn
> > + *
> > + * Execute @fn(@arg) on online cpus in @cpumask. If @cpumask is NULL, @fn
> > + * is run on all the online cpus including the current cpu (even if it
> > + * is not online).
> > + * On each target cpu, @fn is run in a process context (except when run on
> > + * the cpu that is in the process of coming online, in which case @fn is run
> > + * in the same context as __stop_cpus()) with the highest priority
> > + * preempting any task on the cpu and monopolizing it. This function
> > + * returns after all executions are complete.
> > + */
>
> It's minor but can you please follow the comment style in the same
> file? ie. Blank line between paragraphs, separate CONTEXT: and
> RETURNS: sections. At the second thought, why is this function even
> exported? It doesn't seem to have any out-of-file user left. Maybe
> best to make it static?

Made it static in v3.


> > int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
> > {
> > + int online = percpu_read(cpu_stopper.enabled);
> > + int include_this_offline = 0;
> > struct cpu_stop_work *work;
> > struct cpu_stop_done done;
> > + unsigned int weight;
> > unsigned int cpu;
> >
> > + if (!cpumask) {
> > + cpumask = cpu_online_mask;
> > + include_this_offline = 1;
> > + }
>
> This seems a bit too subtle. I would much prefer if it were much more
> explicit than using non-obvious combination of conditions to trigger
> this special behavior.

I first used cpu_callout_mask when I did the patches, but that is not
available in generic code, so I ended up using a NULL mask.

The v3 code uses cpu_all_mask instead, which should clean up all of this.

>
> Hmm... Wouldn't it be better to change cpu_stop_queue_work() consider
> whether the local CPU is offline and then add cpu_stop_wait_done()
> which does wait_for_completion() if local is online and otherwise
> execute fn locally, call cpu_stop_signal_done() and wait in busy loop?
>
> That would make the whole thing much more generic and easier to
> describe. The current implementation seems quite hacky/subtle and
> doesn't fit well with the rest. It would be much better if we can
> just state "if used from local CPU which is not online and the target
> @cpumask includes the local CPU, the work item is executed on-stack
> and completion is waited in busy-loop" for all cpu_stop functions.

Did these changes in v3.

>
> Also, it would be better to factor out work item execution and
> completion from cpu_stopper_thread() and call that instead of invoking
> fn(arg) directly.
>

I was not sure it was the right thing to call cpu_stopper_thread(),
which does set_current_state() etc., from an arbitrary context. I
don't see the point. Please review v3 and let me know if I missed
anything.
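
For context, factoring the execution out of cpu_stopper_thread() as
suggested could look roughly like this (hypothetical helper; v3 may
differ):

/*
 * Hypothetical: the work execution and completion signalling from
 * cpu_stopper_thread(), minus the kthread bookkeeping such as
 * set_current_state(), so that a not-yet-online cpu could call it
 * directly.
 */
static void cpu_stop_execute_work(struct cpu_stop_work *work)
{
	cpu_stop_fn_t fn = work->fn;
	void *arg = work->arg;
	struct cpu_stop_done *done = work->done;
	int err;

	err = fn(arg);
	if (err)
		done->ret = err;
	cpu_stop_signal_done(done, true);
}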

thanks,
suresh
---

From: Suresh Siddha <[email protected]>
To: [email protected]
Subject: [stable only 2.6.35 - 2.6.39] x86, mtrr: lock stop machine during MTRR rendezvous sequence

The MTRR rendezvous sequence using stop_one_cpu_nowait() can potentially
happen in parallel with another system-wide rendezvous using
stop_machine(). This can lead to deadlock: the order in which the works
are queued can be different on different cpus, so some cpus will be
running the first rendezvous handler while others run the second one,
with each set waiting for the other set to join its system-wide
rendezvous.
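
The failure mode is the classic crossed-rendezvous pattern. A
self-contained userspace demonstration (plain C11 with pthreads;
hypothetical names -- running it simply hangs):

/*
 * Two system-wide "rendezvous" operations are queued in opposite order
 * on two workers: each worker enters a different rendezvous first and
 * spins waiting for the other, so neither ever completes.
 */
#include <pthread.h>
#include <stdatomic.h>

static atomic_int rendezvous_a = 2, rendezvous_b = 2;

static void join_rendezvous(atomic_int *r)
{
	atomic_fetch_sub(r, 1);
	while (atomic_load(r) > 0)	/* wait for all participants */
		;
}

static void *worker1(void *unused)
{
	join_rendezvous(&rendezvous_a);	/* saw rendezvous A's work first */
	join_rendezvous(&rendezvous_b);
	return NULL;
}

static void *worker2(void *unused)
{
	join_rendezvous(&rendezvous_b);	/* saw rendezvous B's work first */
	join_rendezvous(&rendezvous_a);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, worker1, NULL);
	pthread_create(&t2, NULL, worker2, NULL);
	pthread_join(t1, NULL);		/* never returns: deadlock */
	pthread_join(t2, NULL);
	return 0;
}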

The MTRR rendezvous sequence is not implemented using stop_machine()
because it gets called both from process context as well as from the
cpu online path (where the cpu has not yet come online and interrupts
are disabled, etc.). stop_machine() works only with online cpus.

For now, take the stop_machine mutex in the MTRR rendezvous sequence
that gets called from an online cpu (here we are in process context and
can potentially sleep while taking the mutex). The MTRR rendezvous that
gets triggered during cpu online doesn't need to take this stop_machine
lock, as stop_machine() already ensures that there is no cpu hotplug
going on in parallel by doing get_online_cpus().

For mainline, we will pursue a cleaner solution of extending the
stop_machine() infrastructure to handle the case where the calling cpu
is not yet online, and use that for the MTRR rendezvous sequence.

fixes: https://bugzilla.novell.com/show_bug.cgi?id=672008

Signed-off-by: Suresh Siddha <[email protected]>
Cc: [email protected] # v2.6.35 - v2.6.39
---
arch/x86/kernel/cpu/mtrr/main.c | 16 +++++++++++++++-
include/linux/stop_machine.h | 11 +++++++++++
kernel/stop_machine.c | 10 ++++++++++
3 files changed, 36 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 929739a..eb06b5f 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -244,9 +244,21 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
static void
set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
{
+ int cpu = raw_smp_processor_id();
+ int online = cpu_online(cpu);
struct set_mtrr_data data;
unsigned long flags;
- int cpu;
+
+ /*
+ * If we are not yet online, then there can be no stop_machine() in
+ * parallel. Stop machine ensures that there is no CPU hotplug
+ * happening in parallel by using get_online_cpus().
+ *
+ * Otherwise, we need to prevent a stop_machine() happening in parallel
+ * by taking this lock.
+ */
+ if (online)
+ lock_stop_cpus();

preempt_disable();

@@ -330,6 +342,8 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ

local_irq_restore(flags);
preempt_enable();
+ if (online)
+ unlock_stop_cpus();
}

/**
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 092dc9b..caacea9 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -33,6 +33,9 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);

+void lock_stop_cpus(void);
+void unlock_stop_cpus(void);
+
#else /* CONFIG_SMP */

#include <linux/workqueue.h>
@@ -88,6 +91,14 @@ static inline int try_stop_cpus(const struct cpumask *cpumask,
return stop_cpus(cpumask, fn, arg);
}

+static inline void lock_stop_cpus(void)
+{
+}
+
+static inline void unlock_stop_cpus(void)
+{
+}
+
#endif /* CONFIG_SMP */

/*
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index e3516b2..abbe6e7 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -136,6 +136,16 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
static DEFINE_MUTEX(stop_cpus_mutex);
static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);

+void lock_stop_cpus(void)
+{
+ mutex_lock(&stop_cpus_mutex);
+}
+
+void unlock_stop_cpus(void)
+{
+ mutex_unlock(&stop_cpus_mutex);
+}
+
int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
{
struct cpu_stop_work *work;