2011-06-09 22:38:42

by Suresh Siddha

Subject: [patch v3 1/2] stop_machine: enable __stop_machine() to be called from the cpu online path

Currently the stop machine infrastructure can be called only from a cpu
that is online. But for !CONFIG_SMP, we do allow calling __stop_machine()
before the cpu is online.

x86, for example, requires the stop machine infrastructure in the cpu
online path and currently implements its own stop machine (using
stop_one_cpu_nowait()) for MTRR initialization there.

Enhance __stop_machine() so that it can be called before the cpu is
brought online. This will pave the way for code consolidation and address
potential deadlocks caused by multiple mechanisms for doing a system-wide
rendezvous.

This will also address the behavioral differences of __stop_machine()
between SMP and UP builds.
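For illustration, the intended call pattern from the cpu online path looks
roughly like the sketch below (only a sketch: mtrr_rendezvous_handler and
mtrr_ap_init stand in for what the second patch actually does with the x86
MTRR code):

/* hypothetical rendezvous handler; runs on every cpu with irqs disabled */
static int mtrr_rendezvous_handler(void *info)
{
	/* program the MTRRs identically on all cpus */
	return 0;
}

/* runs on the cpu that is coming online, before it is marked online */
void mtrr_ap_init(void)
{
	__stop_machine(mtrr_rendezvous_handler, NULL, cpu_online_mask);
}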

Also mark __stop_cpus() static, as no one else uses it.

Signed-off-by: Suresh Siddha <[email protected]>
Cc: [email protected] # v2.6.35+
---
kernel/stop_machine.c | 58 ++++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 49 insertions(+), 9 deletions(-)

Index: linux-2.6-tip/kernel/stop_machine.c
===================================================================
--- linux-2.6-tip.orig/kernel/stop_machine.c
+++ linux-2.6-tip/kernel/stop_machine.c
@@ -28,6 +28,7 @@
struct cpu_stop_done {
atomic_t nr_todo; /* nr left to execute */
bool executed; /* actually executed? */
+ bool offline_ctxt; /* stop_cpu from offline ctxt */
int ret; /* collected return value */
struct completion completion; /* fired if nr_todo reaches 0 */
};
@@ -47,15 +48,32 @@ static void cpu_stop_init_done(struct cp
memset(done, 0, sizeof(*done));
atomic_set(&done->nr_todo, nr_todo);
init_completion(&done->completion);
+ done->offline_ctxt = !percpu_read(cpu_stopper.enabled);
+}
+
+static inline void cpu_stop_wait_for_completion(struct cpu_stop_done *done)
+{
+ if (!done->offline_ctxt)
+ wait_for_completion(&done->completion);
+ else {
+ /*
+ * If the calling cpu is not online, then we can't afford to
+ * sleep, so poll till the work is completed on the target
+ * cpus.
+ */
+ while (atomic_read(&done->nr_todo))
+ cpu_relax();
+ }
}

/* signal completion unless @done is NULL */
static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
{
if (done) {
+ bool offline_ctxt = done->offline_ctxt;
if (executed)
done->executed = true;
- if (atomic_dec_and_test(&done->nr_todo))
+ if (atomic_dec_and_test(&done->nr_todo) && !offline_ctxt)
complete(&done->completion);
}
}
@@ -108,7 +126,7 @@ int stop_one_cpu(unsigned int cpu, cpu_s

cpu_stop_init_done(&done, 1);
cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work);
- wait_for_completion(&done.completion);
+ cpu_stop_wait_for_completion(&done);
return done.executed ? done.ret : -ENOENT;
}

@@ -136,20 +154,24 @@ void stop_one_cpu_nowait(unsigned int cp
static DEFINE_MUTEX(stop_cpus_mutex);
static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);

+static
int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
{
+ int online = percpu_read(cpu_stopper.enabled);
struct cpu_stop_work *work;
struct cpu_stop_done done;
+ unsigned int weight = 0;
unsigned int cpu;

/* initialize works and done */
- for_each_cpu(cpu, cpumask) {
+ for_each_cpu_and(cpu, cpumask, cpu_online_mask) {
work = &per_cpu(stop_cpus_work, cpu);
work->fn = fn;
work->arg = arg;
work->done = &done;
+ weight++;
}
- cpu_stop_init_done(&done, cpumask_weight(cpumask));
+ cpu_stop_init_done(&done, weight);

/*
* Disable preemption while queueing to avoid getting
@@ -157,12 +179,19 @@ int __stop_cpus(const struct cpumask *cp
* to enter @fn which can lead to deadlock.
*/
preempt_disable();
- for_each_cpu(cpu, cpumask)
+ for_each_cpu_and(cpu, cpumask, cpu_online_mask)
cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
&per_cpu(stop_cpus_work, cpu));
+
+ /*
+ * This cpu is not yet online. If @fn needs to be run on this
+ * cpu, run it now.
+ */
+ if (!online && cpu_isset(smp_processor_id(), *cpumask))
+ fn(arg);
preempt_enable();

- wait_for_completion(&done.completion);
+ cpu_stop_wait_for_completion(&done);
return done.executed ? done.ret : -ENOENT;
}

@@ -431,6 +460,7 @@ static int stop_machine_cpu_stop(void *d
struct stop_machine_data *smdata = data;
enum stopmachine_state curstate = STOPMACHINE_NONE;
int cpu = smp_processor_id(), err = 0;
+ unsigned long flags = 0;
bool is_active;

if (!smdata->active_cpus)
@@ -446,7 +476,7 @@ static int stop_machine_cpu_stop(void *d
curstate = smdata->state;
switch (curstate) {
case STOPMACHINE_DISABLE_IRQ:
- local_irq_disable();
+ local_irq_save(flags);
hard_irq_disable();
break;
case STOPMACHINE_RUN:
@@ -460,7 +490,7 @@ static int stop_machine_cpu_stop(void *d
}
} while (curstate != STOPMACHINE_EXIT);

- local_irq_enable();
+ local_irq_restore(flags);
return err;
}

@@ -470,9 +500,19 @@ int __stop_machine(int (*fn)(void *), vo
.num_threads = num_online_cpus(),
.active_cpus = cpus };

+ /* Include the calling cpu that might not be online yet. */
+ if (!percpu_read(cpu_stopper.enabled))
+ smdata.num_threads++;
+
/* Set the initial state and stop all online cpus. */
set_state(&smdata, STOPMACHINE_PREPARE);
- return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
+
+ if (percpu_read(cpu_stopper.enabled))
+ return stop_cpus(cpu_online_mask, stop_machine_cpu_stop,
+ &smdata);
+ else
+ return __stop_cpus(cpu_all_mask, stop_machine_cpu_stop,
+ &smdata);
}

int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)


2011-06-10 13:06:00

by Tejun Heo

Subject: Re: [patch v3 1/2] stop_machine: enable __stop_machine() to be called from the cpu online path

Hello, Suresh.

On Thu, Jun 09, 2011 at 03:32:12PM -0700, Suresh Siddha wrote:
> Index: linux-2.6-tip/kernel/stop_machine.c
> ===================================================================
> --- linux-2.6-tip.orig/kernel/stop_machine.c
> +++ linux-2.6-tip/kernel/stop_machine.c
> @@ -28,6 +28,7 @@
> struct cpu_stop_done {
> atomic_t nr_todo; /* nr left to execute */
> bool executed; /* actually executed? */
> + bool offline_ctxt; /* stop_cpu from offline ctxt */

Maybe something a bit more explicit is better? Say, from_offline_cpu?

> int ret; /* collected return value */
> struct completion completion; /* fired if nr_todo reaches 0 */
> };
> @@ -47,15 +48,32 @@ static void cpu_stop_init_done(struct cp
> memset(done, 0, sizeof(*done));
> atomic_set(&done->nr_todo, nr_todo);
> init_completion(&done->completion);
> + done->offline_ctxt = !percpu_read(cpu_stopper.enabled);
> +}

I'm not sure the above test is safe. CPU_ONLINE notification is not
guaranteed to execute on the CPU being brought online. It's likely to
happen on the CPU which is bringing up the target CPU, so
cpu_stopper.enabled may change asynchronously. I think the correct
test to perform is querying the local CPU's onlineness, with an
accompanying WARN_ON_ONCE() on the preemption enable state.
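
Something like the following is what I have in mind (just a sketch,
untested):

	/*
	 * Preemption must be disabled here; otherwise neither
	 * smp_processor_id() nor the onlineness test is stable.
	 */
	WARN_ON_ONCE(preemptible());
	done->from_offline_cpu = !cpu_online(smp_processor_id());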

> +static inline void cpu_stop_wait_for_completion(struct cpu_stop_done *done)
> +{
> + if (!done->offline_ctxt)
> + wait_for_completion(&done->completion);

Missing {}'s.

> + else {

Shouldn't this function execute the function on the local CPU?

> + /*
> + * If the calling cpu is not online, then we can't afford to
> + * sleep, so poll till the work is completed on the target
> + * cpus.
> + */
> + while (atomic_read(&done->nr_todo))
> + cpu_relax();
> + }
> }
>
> /* signal completion unless @done is NULL */
> static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
> {
> if (done) {
> + bool offline_ctxt = done->offline_ctxt;
> if (executed)
> done->executed = true;
> - if (atomic_dec_and_test(&done->nr_todo))
> + if (atomic_dec_and_test(&done->nr_todo) && !offline_ctxt)
> complete(&done->completion);

What does the local variable achieve? Also, an extra space.

> +static
> int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)

Please put static on the same line.

> {
> + int online = percpu_read(cpu_stopper.enabled);
> struct cpu_stop_work *work;
> struct cpu_stop_done done;
> + unsigned int weight = 0;
> unsigned int cpu;
>
> /* initialize works and done */
> - for_each_cpu(cpu, cpumask) {
> + for_each_cpu_and(cpu, cpumask, cpu_online_mask) {
> work = &per_cpu(stop_cpus_work, cpu);
> work->fn = fn;
> work->arg = arg;
> work->done = &done;
> + weight++;

This seems a bit dangerous. Up to now, whether to execute or not is
solely determined by testing stopper->enabled while holding
stopper->lock. There's no reason to change that behavior for this
feature, and even if it were necessary it should be done in a separate
patch with ample explanation of why that's necessary and why it's safe.
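
For reference, the current queueing path is, modulo details (quoting from
memory, so take the exact shape with a grain of salt):

	spin_lock_irqsave(&stopper->lock, flags);

	if (stopper->enabled) {
		list_add_tail(&work->list, &stopper->works);
		wake_up_process(stopper->thread);
	} else
		cpu_stop_signal_done(work->done, false);

	spin_unlock_irqrestore(&stopper->lock, flags);

i.e. the ->enabled test and the queueing are atomic with respect to the
cpu hotplug callback flipping ->enabled under the same lock.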

> preempt_disable();
> - for_each_cpu(cpu, cpumask)
> + for_each_cpu_and(cpu, cpumask, cpu_online_mask)
> cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
> &per_cpu(stop_cpus_work, cpu));

Ditto.

> + /*
> + * This cpu is not yet online. If @fn needs to be run on this
> + * cpu, run it now.
> + */
> + if (!online && cpu_isset(smp_processor_id(), *cpumask))
> + fn(arg);

Can't we put this in cpu_stop_wait_for_completion()? And please
factor out fn() execution from cpu_stopper_thread() and call that. I
know you objected to that in the other reply, but please do it that
way. All that's necessary is using a different context and avoiding
sleeping. It's best to deviate only there. Please note that the
above change already breaks the return value handling.
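
IOW, something like the following, with cpu_stopper_thread() converted to
use it too (cpu_stop_execute() is a made-up name and the details are
omitted, so treat this as a sketch):

	/* run one cpu_stop work item in the current context */
	static void cpu_stop_execute(struct cpu_stop_work *work)
	{
		struct cpu_stop_done *done = work->done;
		int ret;

		ret = work->fn(work->arg);
		if (ret && done)
			done->ret = ret;
		cpu_stop_signal_done(done, true);
	}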

> @@ -431,6 +460,7 @@ static int stop_machine_cpu_stop(void *d
> struct stop_machine_data *smdata = data;
> enum stopmachine_state curstate = STOPMACHINE_NONE;
> int cpu = smp_processor_id(), err = 0;
> + unsigned long flags = 0;
> bool is_active;
>
> if (!smdata->active_cpus)
> @@ -446,7 +476,7 @@ static int stop_machine_cpu_stop(void *d
> curstate = smdata->state;
> switch (curstate) {
> case STOPMACHINE_DISABLE_IRQ:
> - local_irq_disable();
> + local_irq_save(flags);
> hard_irq_disable();
> break;
> case STOPMACHINE_RUN:
> @@ -460,7 +490,7 @@ static int stop_machine_cpu_stop(void *d
> }
> } while (curstate != STOPMACHINE_EXIT);
>
> - local_irq_enable();
> + local_irq_restore(flags);

It would be nice to explain how the function may be called with irqs
disabled. It would also be nice to have the special offline case
explained in the docbook comments as well.

> @@ -470,9 +500,19 @@ int __stop_machine(int (*fn)(void *), vo
> .num_threads = num_online_cpus(),
> .active_cpus = cpus };
>
> + /* Include the calling cpu that might not be online yet. */
> + if (!percpu_read(cpu_stopper.enabled))
> + smdata.num_threads++;
> +
> /* Set the initial state and stop all online cpus. */
> set_state(&smdata, STOPMACHINE_PREPARE);
> - return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
> +
> + if (percpu_read(cpu_stopper.enabled))
> + return stop_cpus(cpu_online_mask, stop_machine_cpu_stop,
> + &smdata);
> + else
> + return __stop_cpus(cpu_all_mask, stop_machine_cpu_stop,
> + &smdata);

Ummm... I'm completely lost here. How is this supposed to work? As
the local CPU can't sleep, we skip synchronization altogether? What if
another stop machine is already in progress? Shouldn't you be busy
looping w/ trylock instead?

Thanks.

--
tejun

2011-06-13 17:58:20

by Suresh Siddha

Subject: Re: [patch v3 1/2] stop_machine: enable __stop_machine() to be called from the cpu online path

On Fri, 2011-06-10 at 06:05 -0700, Tejun Heo wrote:
> > @@ -47,15 +48,32 @@ static void cpu_stop_init_done(struct cp
> > memset(done, 0, sizeof(*done));
> > atomic_set(&done->nr_todo, nr_todo);
> > init_completion(&done->completion);
> > + done->offline_ctxt = !percpu_read(cpu_stopper.enabled);
> > +}
>
> I'm not sure the above test is safe. CPU_ONLINE notification is not
> guaranteed to execute on the CPU being brought online. It's likely to
> happen on the CPU which is bringing up the target CPU, so
> cpu_stopper.enabled may change asynchronously. I think the correct
> test to perform is querying the local CPU's onlineness, with an
> accompanying WARN_ON_ONCE() on the preemption enable state.

Thinking about it a bit more, I think using raw_smp_processor_id() is
safe and easier to understand, so I went back to cpu_online() checks
here. We don't need to worry about the preemption state, as we are only
checking whether the cpu is online or offline, and no load balancing
happens between an online and an offline cpu.
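
IOW, the check simply becomes (what v4 will do, modulo naming):

	done->offline_ctxt = !cpu_online(raw_smp_processor_id());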

> > {
> > if (done) {
> > + bool offline_ctxt = done->offline_ctxt;
> > if (executed)
> > done->executed = true;
> > - if (atomic_dec_and_test(&done->nr_todo))
> > + if (atomic_dec_and_test(&done->nr_todo) && !offline_ctxt)
> > complete(&done->completion);
>
> What does the local variable achieve? Also, an extra space.

If the caller is polling on nr_todo, we need to read the offline status
before decrementing nr_todo, as the caller's stack holding 'done' may no
longer be active (the caller can return as soon as it sees nr_todo reach
zero). The memory barrier in atomic_dec_and_test() ensures that the
offline status is read before the decrement.
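
In code terms the ordering requirement looks like this (an annotated
version of the hunk above; the comments are just for this discussion):

	bool offline_ctxt = done->offline_ctxt;	/* read before the dec */

	if (executed)
		done->executed = true;
	/*
	 * Once nr_todo drops to zero, an offline-context caller polling in
	 * cpu_stop_wait_for_completion() may return, and the stack holding
	 * 'done' may be reused. So 'done' must not be touched after the
	 * atomic_dec_and_test().
	 */
	if (atomic_dec_and_test(&done->nr_todo) && !offline_ctxt)
		complete(&done->completion);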

> > /* initialize works and done */
> > - for_each_cpu(cpu, cpumask) {
> > + for_each_cpu_and(cpu, cpumask, cpu_online_mask) {
> > work = &per_cpu(stop_cpus_work, cpu);
> > work->fn = fn;
> > work->arg = arg;
> > work->done = &done;
> > + weight++;
>
> This seems a bit dangerous. Up to now, whether to execute or not is
> solely determined by testing stopper->enabled while holding
> stopper->lock. There's no reason to change that behavior for this
> feature, and even if it were necessary it should be done in a separate
> patch with ample explanation of why that's necessary and why it's safe.

Making stop_cpus() work in the offline case requires a bit more work for
both the CONFIG_SMP and !CONFIG_SMP cases, and there is currently no user
for it.

So in the new version, I made only __stop_machine() callable from the cpu
online path, with no changes to stop_cpus(), which can still be called
only from a cpu that is already online.

If there is a need, we can make stop_cpus() behave similarly in the
future. But for now, only __stop_machine() is to be used from the cpu
online path (as done in the second patch by the x86 MTRR code).

> > + /*
> > + * This cpu is not yet online. If @fn needs to be run on this
> > + * cpu, run it now.
> > + */
> > + if (!online && cpu_isset(smp_processor_id(), *cpumask))
> > + fn(arg);
>
> Can't we put this in cpu_stop_wait_for_completion()? And please
> factor out fn() execution from cpu_stopper_thread() and call that. I
> know you objected to that in the other reply, but please do it that
> way. All that's necessary is using a different context and avoiding
> sleeping.

The calling cpu is not yet online (it runs with irqs disabled, etc.), so
it can't do any context switches.

I fixed the return value checking you mentioned.

> > @@ -470,9 +500,19 @@ int __stop_machine(int (*fn)(void *), vo
> > .num_threads = num_online_cpus(),
> > .active_cpus = cpus };
> >
> > + /* Include the calling cpu that might not be online yet. */
> > + if (!percpu_read(cpu_stopper.enabled))
> > + smdata.num_threads++;
> > +
> > /* Set the initial state and stop all online cpus. */
> > set_state(&smdata, STOPMACHINE_PREPARE);
> > - return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
> > +
> > + if (percpu_read(cpu_stopper.enabled))
> > + return stop_cpus(cpu_online_mask, stop_machine_cpu_stop,
> > + &smdata);
> > + else
> > + return __stop_cpus(cpu_all_mask, stop_machine_cpu_stop,
> > + &smdata);
>
> Ummm... I'm completely lost here. How is this supposed to work? As
> local CPU can't sleep, we skip synchronization altogether? What if
> another stop machine is already in progress?

There can't be another stop machine in parallel, as stop_machine() does
get_online_cpus() and will wait on it if a cpu that is coming online is
executing __stop_machine().

> Shouldn't you be busy
> looping w/ trylock instead?

Reviewing more closely, stop_cpus() can come in parallel with a
__stop_machine(). So I ended up busy looping with trylock in this path
as well.
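
i.e. roughly (a sketch of the v4 approach, not the final code):

	/*
	 * An offline cpu can't sleep on stop_cpus_mutex, so spin with
	 * trylock until any stop_cpus() in flight has drained.
	 */
	while (!mutex_trylock(&stop_cpus_mutex))
		cpu_relax();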

v4 patchset will follow shortly.

thanks,
suresh