2015-02-16 13:12:09

by Peter Zijlstra

Subject: [PATCH 32/35] clockevents: Fix cpu down race for hrtimer based broadcasting

From: Thomas Gleixner <[email protected]>

Preeti reported a cpu down race with hrtimer based broadcasting:

Assume CPU1 is the CPU which holds the hrtimer broadcasting duty
before it is taken down.

CPU0                                CPU1
cpu_down()
                                    takedown_cpu()
                                    disable_interrupts()
cpu_die()
 while (CPU1 != DEAD) {
  msleep(100);
   switch_to_idle()
    stop_cpu_timer()
     schedule_broadcast()
 }

tick_cleanup_dead_cpu()
 take_over_broadcast()

So after CPU1 disabled interrupts it cannot handle the broadcast
hrtimer anymore, so CPU0 will be stuck forever.

Doing a "while (CPU1 != DEAD) msleep(100);" periodic poll is silly at
best, but we need to fix that nevertheless.
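
[ For illustration only, not part of the original changelog: a rough
  sketch of the "wait for the dying cpu" pattern referred to above,
  loosely modelled on existing arch implementations of __cpu_die(). ]

void __cpu_die(unsigned int cpu)
{
	/*
	 * Each msleep() sends CPU0 through the idle path, which stops
	 * its local tick and arms the broadcast hrtimer -- owned by the
	 * very CPU1 that has just disabled interrupts, hence the hang.
	 */
	while (per_cpu(cpu_state, cpu) != CPU_DEAD)
		msleep(100);
}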

Split the tick cleanup into two pieces:

1) Shutdown and remove all per cpu clockevent devices from
takedown_cpu()

This is done carefully with respect to existing arch code which
works around the shortcoming of the clockevents core code in
interesting ways. We really want a separate callback for this to
cleanup the workarounds, but that's not scope of this patch

2) Take over the broadcast duty explicitly before calling cpu_die()

This is a temporary workaround as well. What we really want is a
callback in the clockevent device which allows us to do that from
the dying CPU by pushing the hrtimer onto a different cpu. That
might involve an IPI and is definitely more complex than this
immediate fix.

Reported-by: Preeti U Murthy <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
---
include/linux/tick.h | 9 +++++----
kernel/cpu.c | 6 +++---
kernel/time/clockevents.c | 30 ++++++++++++++++++------------
kernel/time/tick-broadcast.c | 32 ++++++++++++++++++++++----------
kernel/time/tick-common.c | 34 ++++++++++++----------------------
kernel/time/tick-internal.h | 6 +++---
6 files changed, 63 insertions(+), 54 deletions(-)

Index: linux/include/linux/tick.h
===================================================================
--- linux.orig/include/linux/tick.h
+++ linux/include/linux/tick.h
@@ -29,13 +29,12 @@ extern struct tick_device *tick_get_devi
extern void __init tick_init(void);
/* Should be core only, but XEN resume magic requires this */
extern void tick_resume_local(void);
-extern void tick_handover_do_timer(void);
-extern void tick_cleanup_dead_cpu(int cpu);
+/* CPU hotplug */
+extern void tick_shutdown_local(void);
#else /* CONFIG_GENERIC_CLOCKEVENTS */
static inline void tick_init(void) { }
static inline void tick_resume_local(void) { }
-static inline void tick_handover_do_timer(void) { }
-static inline void tick_cleanup_dead_cpu(int cpu) { }
+static inline void tick_shutdown_local(void) { }
#endif /* !CONFIG_GENERIC_CLOCKEVENTS */

#ifdef CONFIG_TICK_ONESHOT
@@ -66,8 +65,10 @@ static inline void tick_broadcast_contro

#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
extern int tick_broadcast_oneshot_control(enum tick_broadcast_state state);
+extern void tick_takeover(int deadcpu);
#else
static inline int tick_broadcast_oneshot_control(enum tick_broadcast_state state) { return 0; }
+static inline void tick_takeover(int deadcpu) { }
#endif

static inline void tick_broadcast_enable(void)
Index: linux/kernel/cpu.c
===================================================================
--- linux.orig/kernel/cpu.c
+++ linux/kernel/cpu.c
@@ -349,8 +349,8 @@ static int __ref take_cpu_down(void *_pa
return err;

cpu_notify(CPU_DYING | param->mod, param->hcpu);
- /* Give up timekeeping duties */
- tick_handover_do_timer();
+ /* Shutdown the per cpu tick */
+ tick_shutdown_local();
/* Park the stopper thread */
kthread_park(current);
return 0;
@@ -428,7 +428,7 @@ static int __ref _cpu_down(unsigned int
__cpu_die(cpu);

/* CPU is completely dead: tell everyone. Too late to complain. */
- tick_cleanup_dead_cpu(cpu);
+ tick_takeover(cpu);
cpu_notify_nofail(CPU_DEAD | mod, hcpu);

check_for_tasks(cpu);
Index: linux/kernel/time/clockevents.c
===================================================================
--- linux.orig/kernel/time/clockevents.c
+++ linux/kernel/time/clockevents.c
@@ -541,26 +541,32 @@ void clockevents_resume(void)
#endif

#ifdef CONFIG_HOTPLUG_CPU
-/**
- * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu
+/*
+ * Cleanup the clock events devices on the dying cpu. curdev is the
+ * current installed tick device on that cpu
*/
-void tick_cleanup_dead_cpu(int cpu)
+void clockevents_cleanup_dying_cpu(struct clock_event_device *curdev)
{
struct clock_event_device *dev, *tmp;
unsigned long flags;
+ int cpu;

raw_spin_lock_irqsave(&clockevents_lock, flags);
-
- tick_shutdown(cpu);
- /*
- * Unregister the clock event devices which were
- * released from the users in the notify chain.
- */
- list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
- list_del(&dev->list);
+ if (!curdev)
+ goto cleanup;
/*
- * Now check whether the CPU has left unused per cpu devices
+ * We cannot call the set mode function here at the moment
+ * because existing architecture cpu down code shuts down
+ * stuff already and we cannot interfere with that. So we just
+ * set the mode to unused for now.
*/
+ curdev->mode = CLOCK_EVT_MODE_UNUSED;
+ list_del(&curdev->list);
+ module_put(curdev->owner);
+
+cleanup:
+ /* Remove the unused percpu devices from the list */
+ cpu = smp_processor_id();
list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
if (cpumask_test_cpu(cpu, dev->cpumask) &&
cpumask_weight(dev->cpumask) == 1 &&
Index: linux/kernel/time/tick-broadcast.c
===================================================================
--- linux.orig/kernel/time/tick-broadcast.c
+++ linux/kernel/time/tick-broadcast.c
@@ -421,15 +421,17 @@ void tick_set_periodic_handler(struct cl

#ifdef CONFIG_HOTPLUG_CPU
/*
- * Remove a CPU from broadcasting
+ * Remove a CPU from broadcasting. Called from the dying cpu.
*/
-void tick_shutdown_broadcast(unsigned int cpu)
+void tick_shutdown_broadcast(void)
{
struct clock_event_device *bc;
unsigned long flags;
+ int cpu;

raw_spin_lock_irqsave(&tick_broadcast_lock, flags);

+ cpu = smp_processor_id();
cpumask_clear_cpu(cpu, tick_broadcast_mask);
cpumask_clear_cpu(cpu, tick_broadcast_on);

@@ -906,14 +908,26 @@ void tick_broadcast_switch_to_oneshot(vo
}

#ifdef CONFIG_HOTPLUG_CPU
-static void broadcast_move_bc(int deadcpu)
+/*
+ * Called from the cpu hotplug code after a cpu is dead. This ensures
+ * that a hrtimer based broadcast device is taken over.
+ *
+ * FIXME: This should go away. We should replace this by a mechanism
+ * which pushes the hrtimer over to a different cpu from
+ * tick_shutdown_broadcast_oneshot()
+ */
+void tick_broadcast_takeover_bc(int deadcpu)
{
- struct clock_event_device *bc = tick_broadcast_device.evtdev;
+ struct clock_event_device *bc;
+ unsigned long flags;

- if (!bc || !broadcast_needs_cpu(bc, deadcpu))
- return;
- /* This moves the broadcast assignment to this cpu */
- clockevents_program_event(bc, bc->next_event, 1);
+ raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+ bc = tick_broadcast_device.evtdev;
+ if (bc && broadcast_needs_cpu(bc, deadcpu)) {
+ /* This moves the broadcast assignment to this cpu */
+ clockevents_program_event(bc, bc->next_event, 1);
+ }
+ raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}

/*
@@ -929,8 +943,6 @@ static void tick_shutdown_broadcast_ones
cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
-
- broadcast_move_bc(cpu);
}
#endif

Index: linux/kernel/time/tick-common.c
===================================================================
--- linux.orig/kernel/time/tick-common.c
+++ linux/kernel/time/tick-common.c
@@ -336,10 +336,10 @@ out_bc:
/*
* Transfer the do_timer job away from a dying cpu.
*
- * Called with interrupts disabled. Not locking required. If
- * tick_do_timer_cpu is owned by this cpu, nothing can change it.
+ * No locking required. If tick_do_timer_cpu is owned by this cpu,
+ * nothing can change it.
*/
-void tick_handover_do_timer(void)
+static void tick_handover_do_timer(void)
{
if (tick_do_timer_cpu == smp_processor_id()) {
int cpu = cpumask_first(cpu_online_mask);
@@ -349,32 +349,22 @@ void tick_handover_do_timer(void)
}
}

-/*
- * Shutdown an event device on a given cpu:
+/**
+ * tick_shutdown_local - Shutdown the tick related functions on a cpu
*
- * This is called on a life CPU, when a CPU is dead. So we cannot
- * access the hardware device itself.
- * We just set the mode and remove it from the lists.
+ * This is called from the dying cpu.
*/
-void tick_shutdown(unsigned int cpu)
+void tick_shutdown_local(void)
{
- struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
- struct clock_event_device *dev = td->evtdev;
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);

/* Remove the CPU from the broadcast machinery */
- tick_shutdown_broadcast(cpu);
+ tick_shutdown_broadcast();

+ clockevents_cleanup_dying_cpu(td->evtdev);
td->mode = TICKDEV_MODE_PERIODIC;
- if (dev) {
- /*
- * Prevent that the clock events layer tries to call
- * the set mode function!
- */
- dev->mode = CLOCK_EVT_MODE_UNUSED;
- clockevents_exchange_device(dev, NULL);
- dev->event_handler = clockevents_handle_noop;
- td->evtdev = NULL;
- }
+
+ tick_handover_do_timer();
}
#endif

Index: linux/kernel/time/tick-internal.h
===================================================================
--- linux.orig/kernel/time/tick-internal.h
+++ linux/kernel/time/tick-internal.h
@@ -20,7 +20,6 @@ extern int tick_do_timer_cpu __read_most
extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
extern void tick_handle_periodic(struct clock_event_device *dev);
extern void tick_check_new_device(struct clock_event_device *dev);
-extern void tick_shutdown(unsigned int cpu);
extern void tick_suspend(void);
extern void tick_resume(void);
extern bool tick_check_replacement(struct clock_event_device *curdev,
@@ -38,6 +37,7 @@ extern void clockevents_shutdown(struct
extern void clockevents_exchange_device(struct clock_event_device *old,
struct clock_event_device *new);
extern void clockevents_handle_noop(struct clock_event_device *dev);
+extern void clockevents_cleanup_dying_cpu(struct clock_event_device *dev);
extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
extern void clockevents_suspend(void);
extern void clockevents_resume(void);
@@ -82,7 +82,7 @@ static inline int tick_check_oneshot_cha
extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
extern void tick_install_broadcast_device(struct clock_event_device *dev);
extern int tick_is_broadcast_device(struct clock_event_device *dev);
-extern void tick_shutdown_broadcast(unsigned int cpu);
+extern void tick_shutdown_broadcast(void);
extern void tick_suspend_broadcast(void);
extern void tick_resume_broadcast(void);
extern bool tick_resume_check_broadcast(void);
@@ -96,7 +96,7 @@ static inline void tick_install_broadcas
static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; }
static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; }
static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
-static inline void tick_shutdown_broadcast(unsigned int cpu) { }
+static inline void tick_shutdown_broadcast(void) { }
static inline void tick_suspend_broadcast(void) { }
static inline void tick_resume_broadcast(void) { }
static inline bool tick_resume_check_broadcast(void) { return false; }


2015-02-17 04:04:50

by Preeti U Murthy

Subject: Re: [PATCH 32/35] clockevents: Fix cpu down race for hrtimer based broadcasting

On 02/16/2015 05:45 PM, Peter Zijlstra wrote:

>> From: Thomas Gleixner <[email protected]>

>> @@ -428,7 +428,7 @@ static int __ref _cpu_down(unsigned int
>> __cpu_die(cpu);
>>
>> /* CPU is completely dead: tell everyone. Too late to complain. */
>>- tick_cleanup_dead_cpu(cpu);
>>+ tick_takeover(cpu);

Why is tick_handover() called after __cpu_die()? And the function tick_takeover()
is not introduced until the next patch. tick_broadcast_takeover_bc() is the
function used instead in this patch.

Regards
Preeti U Murthy

2015-02-17 10:39:58

by Peter Zijlstra

Subject: Re: [PATCH 32/35] clockevents: Fix cpu down race for hrtimer based broadcasting

On Tue, Feb 17, 2015 at 09:33:45AM +0530, Preeti U Murthy wrote:
> On 02/16/2015 05:45 PM, Peter Zijlstra wrote:
>
> >> From: Thomas Gleixner <[email protected]>
>
> >> @@ -428,7 +428,7 @@ static int __ref _cpu_down(unsigned int
> >> __cpu_die(cpu);
> >>
> >> /* CPU is completely dead: tell everyone. Too late to complain. */
> >>- tick_cleanup_dead_cpu(cpu);
> >>+ tick_takeover(cpu);
>
> Why is tick_handover() called after __cpu_die()?

See: [PATCH 11/35] clockevents: Cleanup dead cpu explicitely
it used to be a CPU_DEAD notifier.

But, I think, the actual reason would be that you cannot be sure it's not
still ticking until it's actually proper dead and buried, so trying to
take over a tick from a cpu that's still ticking is... well, suspect.

> And the function tick_takeover()
> is not introduced until the next patch. tick_broadcast_takeover_bc() is the
> function used instead in this patch.

Indeed so; let me correct that for bisection's sake.

2015-02-18 03:11:49

by Preeti U Murthy

Subject: Re: [PATCH 32/35] clockevents: Fix cpu down race for hrtimer based broadcasting



On 02/17/2015 04:09 PM, Peter Zijlstra wrote:
> On Tue, Feb 17, 2015 at 09:33:45AM +0530, Preeti U Murthy wrote:
>> On 02/16/2015 05:45 PM, Peter Zijlstra wrote:
>>
>>>> From: Thomas Gleixner <[email protected]>
>>
>>>> @@ -428,7 +428,7 @@ static int __ref _cpu_down(unsigned int
>>>> __cpu_die(cpu);
>>>>
>>>> /* CPU is completely dead: tell everyone. Too late to complain. */
>>>> - tick_cleanup_dead_cpu(cpu);
>>>> + tick_takeover(cpu);
>>
>> Why is tick_handover() called after __cpu_die()?

> See: [PATCH 11/35] clockevents: Cleanup dead cpu explicitely
> it used to be a CPU_DEAD notifier.

> But, I think, the actual reason would be that you cannot be sure its not
> still ticking until its actually proper dead and buried, so trying to
> take over a tick from a cpu that's still ticking is... well, suspect.

Look at the changelog, it explains why tick_takeover must be called
*before* __cpu_die(). The above hunk does not solve the issue: it is
equivalent to the scenario where we took over broadcast duty in the
CPU_DEAD phase, which leaves the cpu doing the hotplug operation stuck
sleeping forever. Cleanup functions can be handled after __cpu_die()
for the reason that you mention above.
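
To spell out the ordering being argued for, a minimal sketch against
_cpu_down(), using the helper names from this series (not a literal
hunk):

	while (!idle_cpu(cpu))
		cpu_relax();

	/*
	 * Take over the broadcast hrtimer *before* waiting for the cpu
	 * to die, so the msleep()/idle loop below can still be woken
	 * up on the cpu doing the hotplug operation.
	 */
	tick_takeover(cpu);

	/* This actually kills the CPU; may poll/msleep until it is dead */
	__cpu_die(cpu);

	/* Cleanups such as timer migration can stay after __cpu_die() */
	cpu_notify_nofail(CPU_DEAD | mod, hcpu);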

Regards
Preeti U Murthy

2015-02-18 13:06:25

by Peter Zijlstra

Subject: Re: [PATCH 32/35] clockevents: Fix cpu down race for hrtimer based broadcasting

On Wed, Feb 18, 2015 at 08:40:52AM +0530, Preeti U Murthy wrote:
>
> Look at the changelog,

Heh, yah, clearly I tl;dr'd that. Indeed.

> it explains why tick_takeover must be called
> *before* __cpu_die().

Since you reported this, can you test if things work if you move that
function call to before __cpu_die() ?

I _think_ it should be fine, but what do I know ;-)

2015-02-19 07:03:02

by Preeti U Murthy

Subject: Re: [PATCH 32/35] clockevents: Fix cpu down race for hrtimer based broadcasting

On 02/18/2015 06:36 PM, Peter Zijlstra wrote:
> On Wed, Feb 18, 2015 at 08:40:52AM +0530, Preeti U Murthy wrote:
>>
>> Look at the changelog,
>
> Heh, yah, clearly I tl;dr'd that. Indeed.
>
>> it explains why tick_takeover must be called
>> *before* __cpu_die().
>
> Since you reported this, can you test if things work if you move that
> function call to before __cpu_die() ?

Yes it does. I had tested and posted a patch to fix the issue here:
https://lkml.org/lkml/2015/1/30/821. I see now that tglx proposes to
get rid of "taking over" tick duties going forward; "handing over" tick
duties would be better instead. So the patch in the above link would
not be acceptable. But minus the taking over of the do_timer, the cpu
doing the hotplug takes over broadcast duty before __cpu_die() in this
patch. This design works.

Regards
Preeti U Murthy
>
> I _think_ it should be fine, but what do I know ;-)
>

2015-02-19 09:53:54

by Peter Zijlstra

Subject: Re: [PATCH 32/35] clockevents: Fix cpu down race for hrtimer based broadcasting

On Thu, Feb 19, 2015 at 12:31:53PM +0530, Preeti U Murthy wrote:
> On 02/18/2015 06:36 PM, Peter Zijlstra wrote:
> > On Wed, Feb 18, 2015 at 08:40:52AM +0530, Preeti U Murthy wrote:
> >>
> >> Look at the changelog,
> >
> > Heh, yah, clearly I tl;dr'd that. Indeed.
> >
> >> it explains why tick_takeover must be called
> >> *before* __cpu_die().
> >
> > Since you reported this, can you test if things work if you move that
> > function call to before __cpu_die() ?
>
> Yes it does. I had tested and posted out a patch to fix the issue
> here:https://lkml.org/lkml/2015/1/30/821. I see now that tglx proposes
> to get rid of "taking over" tick duties going forward, rather "handing
> over" tick duties would be better. So the patch in the above link would
> not be acceptable. But minus the taking over of the do_timer, the cpu
> doing the hotplug takes over broadcast duty before __cpu_die() in this
> patch. This design works.

Great, /me amends patch.

2015-02-19 17:51:55

by Nicolas Pitre

Subject: Re: [PATCH 32/35] clockevents: Fix cpu down race for hrtimer based broadcasting

On Mon, 16 Feb 2015, Peter Zijlstra wrote:

> From: Thomas Gleixner <[email protected]>
>
> Preeti reported a cpu down race with hrtimer based broadcasting:
>
> Assume CPU1 is the CPU which holds the hrtimer broadcasting duty
> before it is taken down.
>
> CPU0                                CPU1
> cpu_down()
>                                     takedown_cpu()
>                                     disable_interrupts()
> cpu_die()
>  while (CPU1 != DEAD) {
>   msleep(100);
>    switch_to_idle()
>     stop_cpu_timer()
>      schedule_broadcast()
>  }
>
> tick_cleanup_dead_cpu()
>  take_over_broadcast()
>
> So after CPU1 disabled interrupts it cannot handle the broadcast
> hrtimer anymore, so CPU0 will be stuck forever.
>
> Doing a "while (CPU1 != DEAD) msleep(100);" periodic poll is silly at
> best, but we need to fix that nevertheless.
>
> Split the tick cleanup into two pieces:
>
> 1) Shutdown and remove all per cpu clockevent devices from
> takedown_cpu()
>
> This is done carefully with respect to existing arch code which
> works around the shortcoming of the clockevents core code in
> interesting ways. We really want a separate callback for this to
> cleanup the workarounds, but that's not scope of this patch
>
> 2) Take over the broadcast duty explicitly before calling cpu_die()
>
> This is a temporary workaround as well. What we really want is a
> callback in the clockevent device which allows us to do that from
> the dying CPU by pushing the hrtimer onto a different cpu. That
> might involve an IPI and is definitely more complex than this
> immediate fix.
>
> Reported-by: Preeti U Murthy <[email protected]>
> Signed-off-by: Thomas Gleixner <[email protected]>

This breaks the b.L switcher disabling code which essentially does:

static void bL_switcher_restore_cpus(void)
{
int i;

for_each_cpu(i, &bL_switcher_removed_logical_cpus) {
struct device *cpu_dev = get_cpu_device(i);
int ret = device_online(cpu_dev);
if (ret)
dev_err(cpu_dev, "switcher: unable to restore CPU\n");
}
}

However, as soon as one new CPU becomes online, the following crash
occurs on that CPU:

[ 547.858031] ------------[ cut here ]------------
[ 547.871868] kernel BUG at kernel/time/hrtimer.c:1249!
[ 547.886991] Internal error: Oops - BUG: 0 [#1] SMP THUMB2
[ 547.903155] Modules linked in:
[ 547.912303] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 3.19.0-rc5-00058-gdd7a65fbc5 #527
[...]
[ 548.599060] [<c005a1c2>] (hrtimer_interrupt) from [<c00639db>] (tick_do_broadcast.constprop.8+0x8f/0x90)
[ 548.627482] [<c00639db>] (tick_do_broadcast.constprop.8) from [<c0063acd>] (tick_handle_oneshot_broadcast+0xf1/0x168)
[ 548.659290] [<c0063acd>] (tick_handle_oneshot_broadcast) from [<c001a07f>] (sp804_timer_interrupt+0x2b/0x30)
[ 548.688755] [<c001a07f>] (sp804_timer_interrupt) from [<c004ee9b>] (handle_irq_event_percpu+0x37/0x130)
[ 548.716916] [<c004ee9b>] (handle_irq_event_percpu) from [<c004efc7>] (handle_irq_event+0x33/0x48)
[ 548.743511] [<c004efc7>] (handle_irq_event) from [<c0050c1d>] (handle_fasteoi_irq+0x69/0xe4)
[ 548.768804] [<c0050c1d>] (handle_fasteoi_irq) from [<c004e835>] (generic_handle_irq+0x1d/0x28)
[ 548.794619] [<c004e835>] (generic_handle_irq) from [<c004ea17>] (__handle_domain_irq+0x3f/0x80)
[ 548.820694] [<c004ea17>] (__handle_domain_irq) from [<c00084f5>] (gic_handle_irq+0x21/0x4c)
[ 548.845729] [<c00084f5>] (gic_handle_irq) from [<c04521db>] (__irq_svc+0x3b/0x5c)

The corresponding code is:

void hrtimer_interrupt(struct clock_event_device *dev)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
ktime_t expires_next, now, entry_time, delta;
int i, retries = 0;

BUG_ON(!cpu_base->hres_active);
[...]

Reverting this patch "fixes" the problem.


Nicolas

2015-02-21 12:47:27

by Peter Zijlstra

Subject: Re: [PATCH 32/35] clockevents: Fix cpu down race for hrtimer based broadcasting

On Thu, Feb 19, 2015 at 12:51:52PM -0500, Nicolas Pitre wrote:
>
> This breaks the b.L switcher disabling code which essentially does:
>
> static void bL_switcher_restore_cpus(void)
> {
> int i;
>
> for_each_cpu(i, &bL_switcher_removed_logical_cpus) {
> struct device *cpu_dev = get_cpu_device(i);
> int ret = device_online(cpu_dev);
> if (ret)
> dev_err(cpu_dev, "switcher: unable to restore CPU\n");
> }
> }

Just so I understand, this device_{on,off}line() stuff is basically just
cpu_{up,down}() but obfuscated through the device model nonsense, right?

Also it seems bL_switcher_enable() relies on lock_device_hotplug() to
stabilize the online cpu mask; it does not, only the hotplug lock does.

I'm having a very hard time trying to follow wth this thing is all
doing; it's using hotplug but it's also doing magic with cpu_suspend().

/me confused..

2015-02-21 17:45:54

by Nicolas Pitre

Subject: Re: [PATCH 32/35] clockevents: Fix cpu down race for hrtimer based broadcasting

On Sat, 21 Feb 2015, Peter Zijlstra wrote:

> On Thu, Feb 19, 2015 at 12:51:52PM -0500, Nicolas Pitre wrote:
> >
> > This breaks the b.L switcher disabling code which essentially does:
> >
> > static void bL_switcher_restore_cpus(void)
> > {
> > int i;
> >
> > for_each_cpu(i, &bL_switcher_removed_logical_cpus) {
> > struct device *cpu_dev = get_cpu_device(i);
> > int ret = device_online(cpu_dev);
> > if (ret)
> > dev_err(cpu_dev, "switcher: unable to restore CPU\n");
> > }
> > }
>
> Just so I understand, this device_{on,ofF}line() stuff is basically just
> cpu_{up,down}() but obfuscated through the device model nonsense, right?

Right. Before commit 3f8517e793 cpu_{up,down}() were used directly.

> Also it seems bL_switcher_enable() relies on lock_device_hotplug() to
> stabilize the online cpu mask; it does not, only the hotplug lock does.

This is there to prevent any concurrent hotplug operations via sysfs.
Once the switcher is active, we deny hotplugging operations on any
physical CPU the switcher has removed.

> I'm having a very hard time trying to follow wth this thing all is
> doing; its using hotplug but its also doing magic with cpu_suspend().

You're fully aware of the on-going work on the scheduler to better
support the big.LITTLE architecture amongst other things. The switcher
is an interim solution where one big CPU is paired with one little CPU,
and that pair is conceptually used as one logical CPU where only one of
the big or little physical CPU runs at a time. Those logical CPUs have
identical capacities therefore the current scheduler may works well with
them.

The switch between the two physical CPUs is abstracted behind an
extended cpufreq scale, i.e. when cpufreq asks for a frequency exceeding
what the little CPU can deliver, a transparent switch is made to the big
CPU. The transparent switch is performed by suspending the current CPU
and immediately resuming the same context on the other CPU, hence the
cpu_suspend() usage.

The switcher is runtime activated. To do so, one physical CPU per
logical pairing is hotplugged out so the system considers only the
"logical" CPUs. When the switcher is disabled, those CPUs are brought
back online.

> /me confused..

If you want more background info, you may have a look at this article:

http://lwn.net/Articles/481055/

Or just ask away.


Nicolas

2015-02-23 16:15:06

by Peter Zijlstra

Subject: Re: [PATCH 32/35] clockevents: Fix cpu down race for hrtimer based broadcasting

On Sat, Feb 21, 2015 at 12:45:51PM -0500, Nicolas Pitre wrote:
> > I'm having a very hard time trying to follow wth this thing all is
> > doing; its using hotplug but its also doing magic with cpu_suspend().
>
> You're fully aware of the on-going work on the scheduler to better
> support the big.LITTLE architecture amongst other things. The switcher
> is an interim solution where one big CPU is paired with one little CPU,
> and that pair is conceptually used as one logical CPU where only one of
> the big or little physical CPU runs at a time. Those logical CPUs have
> identical capacities therefore the current scheduler may works well with
> them.

Yeah, I'm aware of roughly what and why it's doing things.

But I was trying to piece together how exactly stuff gets done so I
could figure out why that patch broke stuff.

So while it appears to use the regular hotplug code for some of the
work, it also has some extra magic on top to effect the actual
switcheroo of the cpu.


In any case, having had a second look I think I might have some ideas:

- bL_switcher_enable() -- enables the whole switcher thing and
disables half the cpus with hot-un-plug, creates a mapping etc..

- bL_switcher_disable() -- disables the whole switcher thing and
gives us back all our cpus with hot-plug.

When the switcher is enabled, we switch by this magic cpu_suspend() call
that saves the entire cpu state and allows you to restore it on another
cpu.

You muck about with the tick; you disable it before cpu_suspend() and
re-enable it after on the target cpu. You further reprogram the
interrupt routing from the old to the new cpu.

But that appears to be it, no more. I suppose the tick is special
because it's the only per-cpu device?
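
For reference, this is roughly how I read the switch path (a heavily
simplified sketch from memory, not verbatim arch/arm/common/bL_switcher.c;
error handling and the MPIDR bookkeeping are left out, and the real code
passes the inbound MPIDR to cpu_suspend()):

static int bL_switch_to_sketch(unsigned int inbound_gic_id)
{
	struct tick_device *tdev = tick_get_device(smp_processor_id());
	enum clock_event_mode tdev_mode = CLOCK_EVT_MODE_UNUSED;
	int ret;

	local_irq_disable();
	local_fiq_disable();

	/* route this logical cpu's interrupts to the inbound physical cpu */
	gic_migrate_target(inbound_gic_id);

	/* shut down the outgoing cpu's per-cpu tick device */
	if (tdev && tdev->evtdev) {
		tdev_mode = tdev->evtdev->mode;
		clockevents_set_mode(tdev->evtdev, CLOCK_EVT_MODE_SHUTDOWN);
	}

	ret = cpu_pm_enter();

	/* save the full cpu context, resume it on the other physical cpu */
	ret = cpu_suspend(0, bL_switchpoint);

	cpu_pm_exit();

	/* now running on the inbound cpu: bring the tick back up */
	if (tdev && tdev->evtdev) {
		clockevents_set_mode(tdev->evtdev, tdev_mode);
		clockevents_program_event(tdev->evtdev,
					  tdev->evtdev->next_event, 1);
	}

	local_fiq_enable();
	local_irq_enable();
	return ret;
}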


The reported function that fails: bL_switcher_restore_cpus() is called
in the error paths of the former and the main path in the latter to make
the 'stolen' cpus re-appear.

The patch in question somehow makes that go boom.


Now what all do you need to do to make it go boom? Just enable/disable
the switcher once and it'll explode? Or does it need to do actual
switches while it is enabled?

The place where it explodes is a bit surprising; it thinks hrtimers are
not enabled even though it's calling into hrtimer code on that cpu...


2015-02-23 16:33:00

by Nicolas Pitre

Subject: Re: [PATCH 32/35] clockevents: Fix cpu down race for hrtimer based broadcasting

On Mon, 23 Feb 2015, Peter Zijlstra wrote:

> In any case, having had a second look I think I might have some ideas:
>
> - bL_switcher_enable() -- enables the whole switcher thing and
> disables half the cpus with hot-un-plug, creates a mapping etc..
>
> - bL_switcher_disable() -- disabled the whole switcher thing and
> gives us back all our cpus with hot-plug.
>
> When the switcher is enabled; we switch by this magic cpu_suspend() call
> that saves the entire cpu state and allows you to restore it on another
> cpu.
>
> You muck about with the tick; you disable it before cpu_suspend() and
> re-enable it after on the target cpu. You further reprogram the
> interrupt routing from the old to the new cpu.
>
> But that appears to be it, no more.

Exact.

> I suppose the tick is special because its the only per-cpu device?

Right.

> The reported function that fails: bL_switcher_restore_cpus() is called
> in the error paths of the former and the main path in the latter to make
> the 'stolen' cpus re-appear.
>
> The patch in question somehow makes that go boom.
>
>
> Now what all do you need to do to make it go boom? Just enable/disable
> the switcher once and it'll explode? Or does it need to do actual
> switches while it is enabled?

It gets automatically enabled during boot. Then several switches are
performed while user space is brought up. If I manually disable it
via /sys then it goes boom.

> The place where it explodes is a bit surprising, it thinks hrtimers are
> not enabled even though its calling into hrtimer code on that cpu...
>
>
>
>

2015-02-23 17:33:18

by Nicolas Pitre

Subject: Re: [PATCH 32/35] clockevents: Fix cpu down race for hrtimer based broadcasting

On Mon, 23 Feb 2015, Nicolas Pitre wrote:

> On Mon, 23 Feb 2015, Peter Zijlstra wrote:
>
> > The reported function that fails: bL_switcher_restore_cpus() is called
> > in the error paths of the former and the main path in the latter to make
> > the 'stolen' cpus re-appear.
> >
> > The patch in question somehow makes that go boom.
> >
> >
> > Now what all do you need to do to make it go boom? Just enable/disable
> > the switcher once and it'll explode? Or does it need to do actual
> > switches while it is enabled?
>
> It gets automatically enabled during boot. Then several switches are
> performed while user space is brought up. If I manually disable it
> via /sys then it goes boom.

OK. Forget the bL switcher. I configured it out of my kernel and then
managed to get the same crash by simply hotplugging out one CPU and
plugging it back in.

$ echo 0 > /sys/devices/system/cpu/cpu2/online
[CPU2 gone]
$ echo 1 > /sys/devices/system/cpu/cpu2/online
[Boom!]


Nicolas

2015-02-26 05:32:10

by Preeti U Murthy

Subject: Re: [PATCH 32/35] clockevents: Fix cpu down race for hrtimer based broadcasting

On 02/23/2015 11:03 PM, Nicolas Pitre wrote:
> On Mon, 23 Feb 2015, Nicolas Pitre wrote:
>
>> On Mon, 23 Feb 2015, Peter Zijlstra wrote:
>>
>>> The reported function that fails: bL_switcher_restore_cpus() is called
>>> in the error paths of the former and the main path in the latter to make
>>> the 'stolen' cpus re-appear.
>>>
>>> The patch in question somehow makes that go boom.
>>>
>>>
>>> Now what all do you need to do to make it go boom? Just enable/disable
>>> the switcher once and it'll explode? Or does it need to do actual
>>> switches while it is enabled?
>>
>> It gets automatically enabled during boot. Then several switches are
>> performed while user space is brought up. If I manually disable it
>> via /sys then it goes boom.
>
> OK. Forget the bL switcher. I configured it out of my kernel and then
> managed to get the same crash by simply hotplugging out one CPU and
> plugging it back in.
>
> $ echo 0 > /sys/devices/system/cpu/cpu2/online
> [CPU2 gone]
> $ echo 1 > /sys/devices/system/cpu/cpu2/online
> [Boom!]
>
>
I saw an issue with this patch as well. I tried to do an SMT mode switch
on a Power machine, i.e. varying the number of hyperthreads on an SMT 8
system, and the system hangs. Worse, there are no softlockup
messages/warnings/bug_ons reported. I am digging into this issue.

A couple of points though. Looking at the patch, I see that we are
shutting down the tick device of the hotplugged-out cpu *much before*
migrating the timers and hrtimers from it. Migration of timers is done
in the CPU_DEAD phase, while we shut down tick devices in the CPU_DYING
phase. There is quite a bit of a gap here. Earlier we would do both in a
single notification.

Another point is that the tick devices are shut down before the
hotplugged-out cpu actually dies in __cpu_die(). At first look none of
these two points should create any issues. But since we are noticing
problems with this patch, I thought it would be best to put them forth.

But why are tick devices being shut down that early? Is there any
specific advantage to this? Taking/handing over tick duties should be
done before __cpu_die(), but shutdown of tick devices should be done
after this phase. This seems more natural, doesn't it?


Regards
Preeti U Murthy

> Nicolas
>

2015-02-27 08:49:15

by Preeti U Murthy

Subject: Re: [PATCH 32/35] clockevents: Fix cpu down race for hrtimer based broadcasting

On 02/26/2015 11:01 AM, Preeti U Murthy wrote:
> On 02/23/2015 11:03 PM, Nicolas Pitre wrote:
>> On Mon, 23 Feb 2015, Nicolas Pitre wrote:
>>
>>> On Mon, 23 Feb 2015, Peter Zijlstra wrote:
>>>
>>>> The reported function that fails: bL_switcher_restore_cpus() is called
>>>> in the error paths of the former and the main path in the latter to make
>>>> the 'stolen' cpus re-appear.
>>>>
>>>> The patch in question somehow makes that go boom.
>>>>
>>>>
>>>> Now what all do you need to do to make it go boom? Just enable/disable
>>>> the switcher once and it'll explode? Or does it need to do actual
>>>> switches while it is enabled?
>>>
>>> It gets automatically enabled during boot. Then several switches are
>>> performed while user space is brought up. If I manually disable it
>>> via /sys then it goes boom.
>>
>> OK. Forget the bL switcher. I configured it out of my kernel and then
>> managed to get the same crash by simply hotplugging out one CPU and
>> plugging it back in.
>>
>> $ echo 0 > /sys/devices/system/cpu/cpu2/online
>> [CPU2 gone]
>> $ echo 1 > /sys/devices/system/cpu/cpu2/online
>> [Boom!]
>>
>>
> I saw an issue with this patch as well. I tried to do an smt mode switch
> on a power machine, i.e varying the number of hyperthreads on an SMT 8
> system, and the system hangs. Worse, there are no softlockup
> messages/warnings/bug_ons reported. I am digging into this issue.
>
> A couple of points though. Looking at the patch, I see that we are
> shutting down tick device of the hotplugged out cpu *much before*
> migrating the timers and hrtimers from it. Migration of timers is done
> in the CPU_DEAD phase, while we shutdown tick devices in the CPU_DYING
> phase. There is quite a bit of a gap here. Earlier we would do both in a
> single notification.
>
> Another point is that the tick devices are shutdown before the
> hotplugged out cpu actually dies in __cpu_die(). At first look none of
> these two points should create any issues. But since we are noticing
> problems with this patch, I thought it would be best to put them forth.
>
> But why are tick devices being shutdown that early ? Is there any
> specific advantage to this? Taking/handing over tick duties should be
> done before __cpu_die(), but shutdown of tick devices should be done
> after this phase. This seems more natural, doesn't it?

The problem reported in the changelog of this patch is causing severe
regressions very frequently on our machines for certain usecases. It would
help to put a fix in place first and then follow that up with these
cleanups. A fix along the below lines:

---------------------------------------------------------------------

clockevents: Fix cpu down race for hrtimer based broadcasting

---
include/linux/tick.h | 10 +++++++---
kernel/cpu.c | 2 ++
kernel/time/tick-broadcast.c | 19 +++++++++++--------
3 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/include/linux/tick.h b/include/linux/tick.h
index eda850c..8735776 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -91,14 +91,18 @@ extern void tick_cancel_sched_timer(int cpu);
static inline void tick_cancel_sched_timer(int cpu) { }
# endif

-# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+# if defined CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
extern struct tick_device *tick_get_broadcast_device(void);
extern struct cpumask *tick_get_broadcast_mask(void);

-# ifdef CONFIG_TICK_ONESHOT
+# if defined CONFIG_TICK_ONESHOT
extern struct cpumask *tick_get_broadcast_oneshot_mask(void);
+extern void tick_takeover(int deadcpu);
+# else
+static inline void tick_takeover(int deadcpu) { }
# endif
-
+# else
+static inline void tick_takeover(int deadcpu) { }
# endif /* BROADCAST */

# ifdef CONFIG_TICK_ONESHOT
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1972b16..f9ca351 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -20,6 +20,7 @@
#include <linux/gfp.h>
#include <linux/suspend.h>
#include <linux/lockdep.h>
+#include <linux/tick.h>
#include <trace/events/power.h>

#include "smpboot.h"
@@ -411,6 +412,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
while (!idle_cpu(cpu))
cpu_relax();

+ tick_takeover(cpu);
/* This actually kills the CPU. */
__cpu_die(cpu);

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 066f0ec..0fd6634 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -669,14 +669,19 @@ static void broadcast_shutdown_local(struct clock_event_device *bc,
clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
}

-static void broadcast_move_bc(int deadcpu)
+void tick_takeover(int deadcpu)
{
- struct clock_event_device *bc = tick_broadcast_device.evtdev;
+ struct clock_event_device *bc;
+ unsigned long flags;

- if (!bc || !broadcast_needs_cpu(bc, deadcpu))
- return;
- /* This moves the broadcast assignment to this cpu */
- clockevents_program_event(bc, bc->next_event, 1);
+ raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+ bc = tick_broadcast_device.evtdev;
+
+ if (bc && broadcast_needs_cpu(bc, deadcpu)) {
+ /* This moves the broadcast assignment to this cpu */
+ clockevents_program_event(bc, bc->next_event, 1);
+ }
+ raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}

/*
@@ -913,8 +918,6 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
cpumask_clear_cpu(cpu, tick_broadcast_force_mask);

- broadcast_move_bc(cpu);
-
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}

Regards
Preeti U Murthy

>
>
> Regards
> Preeti U Murthy
>
>> Nicolas
>>